test-backend-ops: improve msvc build time (#18209)

Added comments explaining thread block size selection logic based on row count and column size, derived from historical commit context (#18212)
server : [easy] fix per round speculative decode logging (#18211)
2026-02-05 13:53:23 +02:00 · 2025-12-20 13:45:45 -06:00 · 2025-12-20 19:28:57 +08:00 · 2025-12-20 10:57:40 +01:00 · 2025-12-20 09:25:27 +01:00 · 2025-12-19 22:29:46 +01:00
659 changed files with 148057 additions and 37753 deletions
--- a/.devops/cann.Dockerfile
+++ b/.devops/cann.Dockerfile
@@ -4,7 +4,7 @@

 # Define the CANN base image for easier version updates later
 ARG CHIP_TYPE=910b
-ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.3.rc1.alpha001-${CHIP_TYPE}-openeuler22.03-py3.11
+ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.3.rc2-${CHIP_TYPE}-openeuler24.03-py3.11

 # ==============================================================================
 # BUILD STAGE
@@ -107,11 +107,11 @@ ENTRYPOINT ["/app/tools.sh"]
 # ENTRYPOINT ["/app/llama-server"]

 ### Target: light
-# Lightweight image containing only llama-cli
+# Lightweight image containing only llama-cli and llama-completion
 # ==============================================================================
 FROM base AS light

-COPY --from=build /app/full/llama-cli /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app

 ENTRYPOINT [ "/app/llama-cli" ]

--- a/.devops/cpu.Dockerfile
+++ b/.devops/cpu.Dockerfile
@@ -68,7 +68,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light

-COPY --from=build /app/full/llama-cli /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app

 WORKDIR /app

--- a/.devops/cuda.Dockerfile
+++ b/.devops/cuda.Dockerfile
@@ -74,7 +74,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light

-COPY --from=build /app/full/llama-cli /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app

 WORKDIR /app

--- a/.devops/intel.Dockerfile
+++ b/.devops/intel.Dockerfile
@@ -73,7 +73,7 @@ ENTRYPOINT ["/app/tools.sh"]
 FROM base AS light

 COPY --from=build /app/lib/ /app
-COPY --from=build /app/full/llama-cli /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app

 WORKDIR /app

--- a/.devops/llama-cli-cann.Dockerfile
+++ b/.devops/llama-cli-cann.Dockerfile
@@ -23,11 +23,12 @@ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
 RUN echo "Building with static libs" && \
    source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
    cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_TESTS=OFF  && \
-    cmake --build build --config Release --target llama-cli
+    cmake --build build --config Release --target llama-cli && \
+    cmake --build build --config Release --target llama-completion

 # TODO: use image with NNRT
 FROM ascendai/cann:$ASCEND_VERSION AS runtime
-COPY --from=build /app/build/bin/llama-cli /llama-cli
+COPY --from=build /app/build/bin/llama-cli /app/build/bin/llama-completion /

 ENV LC_ALL=C.utf8

--- a/.devops/llama-cpp-cuda.srpm.spec
+++ b/.devops/llama-cpp-cuda.srpm.spec
@@ -37,6 +37,7 @@ make -j GGML_CUDA=1
 %install
 mkdir -p %{buildroot}%{_bindir}/
 cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli
+cp -p llama-completion %{buildroot}%{_bindir}/llama-cuda-completion
 cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server
 cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple

@@ -68,6 +69,7 @@ rm -rf %{_builddir}/*

 %files
 %{_bindir}/llama-cuda-cli
+%{_bindir}/llama-cuda-completion
 %{_bindir}/llama-cuda-server
 %{_bindir}/llama-cuda-simple
 /usr/lib/systemd/system/llamacuda.service
--- a/.devops/llama-cpp.srpm.spec
+++ b/.devops/llama-cpp.srpm.spec
@@ -39,6 +39,7 @@ make -j
 %install
 mkdir -p %{buildroot}%{_bindir}/
 cp -p llama-cli %{buildroot}%{_bindir}/llama-cli
+cp -p llama-completion %{buildroot}%{_bindir}/llama-completion
 cp -p llama-server %{buildroot}%{_bindir}/llama-server
 cp -p llama-simple %{buildroot}%{_bindir}/llama-simple

@@ -70,6 +71,7 @@ rm -rf %{_builddir}/*

 %files
 %{_bindir}/llama-cli
+%{_bindir}/llama-completion
 %{_bindir}/llama-server
 %{_bindir}/llama-simple
 /usr/lib/systemd/system/llama.service
--- a/.devops/musa.Dockerfile
+++ b/.devops/musa.Dockerfile
@@ -81,7 +81,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light

-COPY --from=build /app/full/llama-cli /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app

 WORKDIR /app

--- a/.devops/rocm.Dockerfile
+++ b/.devops/rocm.Dockerfile
@@ -94,7 +94,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light

-COPY --from=build /app/full/llama-cli /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app

 WORKDIR /app

--- a/.devops/s390x.Dockerfile
+++ b/.devops/s390x.Dockerfile
@@ -105,7 +105,7 @@ WORKDIR /llama.cpp/bin

 # Copy llama.cpp binaries and libraries
 COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
-COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin
+COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin/llama-completion /llama.cpp/bin

 ENTRYPOINT [ "/llama.cpp/bin/llama-cli" ]

--- a/.devops/tools.sh
+++ b/.devops/tools.sh
@@ -13,6 +13,8 @@ elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
    exec ./llama-quantize "$@"
 elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
    exec ./llama-cli "$@"
+elif [[ "$arg1" == '--run-legacy' || "$arg1" == '-l' ]]; then
+    exec ./llama-completion "$@"
 elif [[ "$arg1" == '--bench' || "$arg1" == '-b' ]]; then
    exec ./llama-bench "$@"
 elif [[ "$arg1" == '--perplexity' || "$arg1" == '-p' ]]; then
@@ -32,8 +34,10 @@ elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
 else
    echo "Unknown command: $arg1"
    echo "Available commands: "
-    echo "  --run (-r): Run a model previously converted into ggml"
-    echo "              ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
+    echo "  --run (-r): Run a model (chat) previously converted into ggml"
+    echo "              ex: -m /models/7B/ggml-model-q4_0.bin"
+    echo "  --run-legacy (-l): Run a model (legacy completion) previously converted into ggml"
+    echo "              ex: -m /models/7B/ggml-model-q4_0.bin -no-cnv -p \"Building a website can be done in 10 simple steps:\" -n 512"
    echo "  --bench (-b): Benchmark the performance of the inference for various parameters."
    echo "              ex: -m model.gguf"
    echo "  --perplexity (-p): Measure the perplexity of a model over a given text."
--- a/.devops/vulkan.Dockerfile
+++ b/.devops/vulkan.Dockerfile
@@ -50,6 +50,7 @@ WORKDIR /app

 RUN apt-get update \
    && apt-get install -y \
+    build-essential \
    git \
    python3 \
    python3-pip \
@@ -67,7 +68,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light

-COPY --from=build /app/full/llama-cli /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app

 WORKDIR /app

--- a/.github/ISSUE_TEMPLATE/011-bug-results.yml
+++ b/.github/ISSUE_TEMPLATE/011-bug-results.yml
@@ -11,7 +11,7 @@ body:
        (i.e. the generated text) are incorrect or llama.cpp crashes during model evaluation.
        If you encountered the issue while using an external UI (e.g. ollama),
        please reproduce your issue using one of the examples/binaries in this repository.
-        The `llama-cli` binary can be used for simple and reproducible model inference.
+        The `llama-completion` binary can be used for simple and reproducible model inference.
  - type: textarea
    id: version
    attributes:
@@ -74,9 +74,12 @@ body:
        Please give us a summary of the problem and tell us how to reproduce it.
        If you can narrow down the bug to specific hardware, compile flags, or command line arguments,
        that information would be very much appreciated by us.
+
+        If possible, please try to reproduce the issue using `llama-completion` with `-fit off`.
+        If you can only reproduce the issue with `-fit on`, please provide logs both with and without `--verbose`.
      placeholder: >
-        e.g. when I run llama-cli with -ngl 99 I get garbled outputs.
-        When I use -ngl 0 it works correctly.
+        e.g. when I run llama-completion with `-fa on` I get garbled outputs for very long prompts.
+        With short prompts or `-fa off` it works correctly.
        Here are the exact commands that I used: ...
    validations:
      required: true
--- a/.github/ISSUE_TEMPLATE/019-bug-misc.yml
+++ b/.github/ISSUE_TEMPLATE/019-bug-misc.yml
@@ -86,6 +86,7 @@ body:
      description: >
          If applicable, please copy and paste any relevant log output, including any generated text.
          This will be automatically formatted into code, so no need for backticks.
+          If you are encountering problems specifically with the `llama_params_fit` module, always upload `--verbose` logs as well.
      render: shell
    validations:
      required: false
--- a/.github/actions/windows-setup-cuda/action.yml
+++ b/.github/actions/windows-setup-cuda/action.yml
@@ -65,3 +65,34 @@ runs:
          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
          echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
          echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
+
+    - name: Install Cuda Toolkit 13.1
+      if: ${{ inputs.cuda_version == '13.1' }}
+      shell: pwsh
+      run: |
+          mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1"
+          choco install unzip -y
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_crt/windows-x86_64/cuda_crt-windows-x86_64-13.1.80-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-13.1.80-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-13.1.80-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-13.1.80-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-13.2.0.9-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libnvvm/windows-x86_64/libnvvm-windows-x86_64-13.1.80-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-13.1.68-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-13.1.80-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-13.1.68-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-13.1.78-archive.zip"
+          unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1"
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_crt-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_cudart-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_nvcc-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_nvrtc-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\libcublas-windows-x86_64-13.2.0.9-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\libnvvm-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_nvtx-windows-x86_64-13.1.68-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_profiler_api-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\visual_studio_integration-windows-x86_64-13.1.68-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_cccl-windows-x86_64-13.1.78-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
+          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
+          echo "CUDA_PATH_V13_1=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
--- a/.github/workflows/build-linux-cross.yml
+++ b/.github/workflows/build-linux-cross.yml
@@ -291,6 +291,7 @@ jobs:
                         -DGGML_RVV=ON \
                         -DGGML_RV_ZFH=ON \
                         -DGGML_RV_ZICBOP=ON \
+                         -DGGML_RV_ZIHINTPAUSE=ON \
                         -DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1 \
                         -DCMAKE_TOOLCHAIN_FILE=${PWD}/cmake/riscv64-spacemit-linux-gnu-gcc.cmake

--- a/.github/workflows/build-riscv-native.yml
+++ b/.github/workflows/build-riscv-native.yml
@@ -1,120 +0,0 @@
-name: Build on RISCV Linux Machine by Cloud-V
-on:
-  pull_request:
-  workflow_dispatch:
-  workflow_call:
-
-jobs:
-  debian-13-riscv64-native: # Bianbu 2.2
-    runs-on: [self-hosted, RISCV64]
-
-    steps:
-      - name: Install prerequisites
-        run: |
-          sudo apt-get update || true
-          sudo apt-get install -y libatomic1
-      - uses: actions/checkout@v4
-      - name: Setup Riscv
-        run: |
-          sudo apt-get update || true
-          sudo apt-get install -y --no-install-recommends \
-                  build-essential \
-                  gcc-14-riscv64-linux-gnu \
-                  g++-14-riscv64-linux-gnu \
-                  ccache \
-                  cmake
-
-      - name: Setup ccache
-        run: |
-          mkdir -p $HOME/.ccache
-          ccache -M 5G -d $HOME/.ccache
-          export CCACHE_LOGFILE=/home/runneruser/ccache_debug/ccache.log
-          export CCACHE_DEBUGDIR="/home/runneruser/ccache_debug"
-          echo "$GITHUB_WORKSPACE"
-          echo "CCACHE_LOGFILE=$CCACHE_LOGFILE" >> $GITHUB_ENV
-          echo "CCACHE_DEBUGDIR=$CCACHE_DEBUGDIR" >> $GITHUB_ENV
-          echo "CCACHE_BASEDIR=$GITHUB_WORKSPACE" >> $GITHUB_ENV
-          echo "CCACHE_DIR=$HOME/.ccache" >> $GITHUB_ENV
-
-      - name: Build
-        run: |
-          cmake -B build \
-            -DLLAMA_CURL=OFF \
-            -DCMAKE_BUILD_TYPE=Release \
-            -DGGML_OPENMP=OFF \
-            -DLLAMA_BUILD_EXAMPLES=ON \
-            -DLLAMA_BUILD_TOOLS=ON \
-            -DLLAMA_BUILD_TESTS=OFF \
-            -DCMAKE_SYSTEM_NAME=Linux \
-            -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
-            -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-            -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
-            -DCMAKE_C_COMPILER_LAUNCHER=ccache \
-            -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-            -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-            -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
-            -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-            -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-            -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
-
-          cmake --build build --config Release -j $(nproc)
-
-  # debian-13-riscv64-spacemit-ime-native: # Bianbu 2.2
-  #   runs-on: [self-hosted, RISCV64]
-
-  #   steps:
-  #     - name: Install prerequisites
-  #       run: |
-  #         sudo apt-get update || true
-  #         sudo apt-get install -y libatomic1
-  #     - uses: actions/checkout@v4
-  #     - name: Setup Riscv
-  #       run: |
-  #         sudo apt-get update || true
-  #         sudo apt-get install -y --no-install-recommends \
-  #                 build-essential \
-  #                 gcc-14-riscv64-linux-gnu \
-  #                 g++-14-riscv64-linux-gnu \
-  #                 ccache \
-  #                 cmake
-  #         sudo apt-get upgrade binutils -y
-
-  #     - name: Setup ccache
-  #       run: |
-  #         mkdir -p $HOME/.ccache
-  #         ccache -M 5G -d $HOME/.ccache
-  #         export CCACHE_LOGFILE=/home/runneruser/ccache_debug/ccache.log
-  #         export CCACHE_DEBUGDIR="/home/runneruser/ccache_debug"
-  #         echo "$GITHUB_WORKSPACE"
-  #         echo "CCACHE_LOGFILE=$CCACHE_LOGFILE" >> $GITHUB_ENV
-  #         echo "CCACHE_DEBUGDIR=$CCACHE_DEBUGDIR" >> $GITHUB_ENV
-  #         echo "CCACHE_BASEDIR=$GITHUB_WORKSPACE" >> $GITHUB_ENV
-  #         echo "CCACHE_DIR=$HOME/.ccache" >> $GITHUB_ENV
-
-  #     - name: Build
-  #       run: |
-  #         cmake -B build \
-  #           -DLLAMA_CURL=OFF \
-  #           -DCMAKE_BUILD_TYPE=Release \
-  #           -DGGML_OPENMP=OFF \
-  #           -DLLAMA_BUILD_EXAMPLES=ON \
-  #           -DLLAMA_BUILD_TOOLS=ON \
-  #           -DLLAMA_BUILD_TESTS=OFF \
-  #           -DCMAKE_SYSTEM_NAME=Linux \
-  #           -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
-  #           -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-  #           -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
-  #           -DCMAKE_C_COMPILER_LAUNCHER=ccache \
-  #           -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-  #           -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-  #           -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
-  #           -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-  #           -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-  #           -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH \
-  #           -DGGML_RVV=ON \
-  #           -DGGML_RV_ZFH=ON \
-  #           -DGGML_RV_ZICBOP=ON \
-  #           -DGGML_CPU_RISCV64_SPACEMIT=ON \
-  #           -DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1
-
-  #         cmake --build build --config Release -j $(nproc)
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -20,7 +20,8 @@ on:
      '**/*.swift',
      '**/*.m',
      '**/*.metal',
-      '**/*.comp'
+      '**/*.comp',
+      '**/*.glsl'
    ]

  pull_request:
@@ -40,7 +41,8 @@ on:
      '**/*.swift',
      '**/*.m',
      '**/*.metal',
-      '**/*.comp'
+      '**/*.comp',
+      '**/*.glsl'
    ]

 concurrency:
@@ -68,6 +70,7 @@ jobs:
        with:
          key: macOS-latest-cmake-arm64
          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}

      - name: Build
        id: cmake_build
@@ -104,6 +107,7 @@ jobs:
        with:
          key: macOS-latest-cmake-x64
          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}

      - name: Build
        id: cmake_build
@@ -140,6 +144,7 @@ jobs:
        with:
          key: macOS-latest-cmake-arm64-webgpu
          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}

      - name: Dawn Dependency
        id: dawn-depends
@@ -193,6 +198,7 @@ jobs:
        with:
          key: ubuntu-cpu-cmake-${{ matrix.build }}
          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}

      - name: Build Dependencies
        id: build_depends
@@ -243,7 +249,7 @@ jobs:
          echo "Fetch llama2c model"
          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
          ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
-          ./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
+          ./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256

      - name: Test llama2c (s390x)
        id: llama2c_test_s390x
@@ -252,7 +258,7 @@ jobs:
          cd build
          echo "Fetch llama2c big-endian model"
          wget https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K-be.gguf
-          ./bin/llama-cli -m stories260K-be.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
+          ./bin/llama-completion -m stories260K-be.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256

  ubuntu-latest-cmake-sanitizer:
    runs-on: ubuntu-latest
@@ -274,6 +280,7 @@ jobs:
        with:
          key: ubuntu-latest-cmake-sanitizer-${{ matrix.sanitizer }}
          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}

      - name: Dependencies
        id: depends
@@ -394,6 +401,7 @@ jobs:
        with:
          key: ubuntu-24-cmake-vulkan-deb
          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}

      - name: Dependencies
        id: depends
@@ -429,6 +437,7 @@ jobs:
        with:
          key: ubuntu-24-cmake-vulkan
          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}

      - name: Dependencies
        id: depends
@@ -488,6 +497,7 @@ jobs:
        with:
          key: ubuntu-24-cmake-webgpu
          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}

      - name: Dependencies
        id: depends
@@ -547,6 +557,47 @@ jobs:
          # This is using llvmpipe and runs slower than other backends
          ctest -L main --verbose --timeout 3600

+  ubuntu-24-wasm-webgpu:
+    runs-on: ubuntu-24.04
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.16
+        with:
+          key: ubuntu-latest-wasm-webgpu
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Install Emscripten
+        run: |
+          git clone https://github.com/emscripten-core/emsdk.git
+          cd emsdk
+          ./emsdk install latest
+          ./emsdk activate latest
+
+      - name: Fetch emdawnwebgpu
+        run: |
+          DAWN_TAG="v20251027.212519"
+          EMDAWN_PKG="emdawnwebgpu_pkg-${DAWN_TAG}.zip"
+          echo "Downloading ${EMDAWN_PKG}"
+          curl -L -o emdawn.zip \
+            "https://github.com/google/dawn/releases/download/${DAWN_TAG}/${EMDAWN_PKG}"
+          unzip emdawn.zip
+
+      - name: Build WASM WebGPU
+        run: |
+          source emsdk/emsdk_env.sh
+          emcmake cmake -B build-wasm \
+            -DGGML_WEBGPU=ON \
+            -DLLAMA_CURL=OFF \
+            -DEMDAWNWEBGPU_DIR=emdawnwebgpu_pkg
+
+          cmake --build build-wasm --target test-backend-ops -j $(nproc)
+
  ubuntu-22-cmake-hip:
    runs-on: ubuntu-22.04
    container: rocm/dev-ubuntu-22.04:6.1.2
@@ -567,6 +618,7 @@ jobs:
        with:
          key: ubuntu-22-cmake-hip
          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}

      - name: Build with native CMake HIP support
        id: cmake_build
@@ -599,6 +651,7 @@ jobs:
        with:
          key: ubuntu-22-cmake-musa
          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}

      - name: Build with native CMake MUSA support
        id: cmake_build
@@ -646,6 +699,7 @@ jobs:
        with:
          key: ubuntu-22-cmake-sycl
          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}

      - name: Build
        id: cmake_build
@@ -696,6 +750,7 @@ jobs:
        with:
          key: ubuntu-22-cmake-sycl-fp16
          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}

      - name: Build
        id: cmake_build
@@ -729,6 +784,7 @@ jobs:
        with:
          key: macOS-latest-cmake-ios
          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}

      - name: Build
        id: cmake_build
@@ -760,6 +816,7 @@ jobs:
        with:
          key: macOS-latest-cmake-tvos
          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}

      - name: Build
        id: cmake_build
@@ -821,6 +878,7 @@ jobs:
        with:
          key: macOS-latest-swift
          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}

      - name: Download xcframework artifact
        uses: actions/download-artifact@v4
@@ -863,6 +921,7 @@ jobs:
          key: windows-msys2
          variant: ccache
          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}

      - name: Setup ${{ matrix.sys }}
        uses: msys2/setup-msys2@v2
@@ -931,6 +990,7 @@ jobs:
          key: windows-latest-cmake-${{ matrix.build }}
          variant: ccache
          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}

      - name: Download OpenBLAS
        id: get_openblas
@@ -1035,6 +1095,7 @@ jobs:
          with:
            key: ubuntu-latest-cmake-cuda
            evict-old-files: 1d
+            save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}

        - name: Build with CMake
          run: |
@@ -1067,6 +1128,7 @@ jobs:
          key: windows-cuda-${{ matrix.cuda }}
          variant: ccache
          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}

      - name: Install Cuda Toolkit
        uses: ./.github/actions/windows-setup-cuda
@@ -1118,6 +1180,7 @@ jobs:
          key: windows-latest-cmake-sycl
          variant: ccache
          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}

      - name: Install
        run:  |
@@ -1179,6 +1242,7 @@ jobs:
        with:
          key: ${{ github.job }}
          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}

      - name: Build
        id: cmake_build
@@ -1360,25 +1424,54 @@ jobs:
        chip_type: ['910b', '310p']
        build: ['Release']
    runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
-    container: ascendai/cann:${{ matrix.chip_type == '910b' &&  '8.3.rc1.alpha001-910b-openeuler22.03-py3.11' || '8.2.rc1-310p-openeuler22.03-py3.11' }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0

-      - name: Dependencies
+      - name: Free up disk space
+        uses: ggml-org/free-disk-space@v1.3.1
+        with:
+          tool-cache: true
+
+      - name: Set container image
+        id: cann-image
        run: |
-          yum update -y
-          yum install -y git gcc gcc-c++ make cmake libcurl-devel
+          image="ascendai/cann:${{ matrix.chip_type == '910b' &&  '8.3.rc2-910b-openeuler24.03-py3.11' || '8.3.rc2-310p-openeuler24.03-py3.11' }}"
+          echo "image=${image}" >> "${GITHUB_OUTPUT}"
+
+      - name: Pull container image
+        run: docker pull "${{ steps.cann-image.outputs.image }}"

      - name: Build
+        env:
+          BUILD_TYPE: ${{ matrix.build }}
+          SOC_TYPE: ascend${{ matrix.chip_type }}
        run: |
-          export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
+          HOST_UID=$(id -u)
+          HOST_GID=$(id -g)

-          cmake -S . -B build \
-              -DCMAKE_BUILD_TYPE=${{ matrix.build }} \
-              -DGGML_CANN=on \
-              -DSOC_TYPE=ascend${{ matrix.chip_type }}
-          cmake --build build -j $(nproc)
+          docker run --rm \
+            -v "${PWD}:/workspace" \
+            -w /workspace \
+            -e SOC_TYPE=${SOC_TYPE} \
+            -e BUILD_TYPE=${BUILD_TYPE} \
+            "${{ steps.cann-image.outputs.image }}" \
+            bash -lc '
+              set -e
+              yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake libcurl-devel
+              yum clean all && rm -rf /var/cache/yum
+              git config --global --add safe.directory "/workspace"
+              export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
+              cmake -S . -B build \
+                  -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
+                  -DGGML_CANN=on \
+                  -DSOC_TYPE=${SOC_TYPE}
+              cmake --build build -j $(nproc)
+
+              chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
+            '

 # TODO: simplify the following workflows using a matrix
 # TODO: run lighter CI on PRs and the full CI only on master (if needed)
@@ -1395,6 +1488,7 @@ jobs:
        with:
          key: ggml-ci-x64-cpu-low-perf
          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}

      - name: Dependencies
        id: depends
@@ -1420,6 +1514,7 @@ jobs:
        with:
          key: ggml-ci-arm64-cpu-low-perf
          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}

      - name: Dependencies
        id: depends
@@ -1445,6 +1540,7 @@ jobs:
        with:
          key: ggml-ci-x64-cpu-high-perf
          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}

      - name: Dependencies
        id: depends
@@ -1470,6 +1566,7 @@ jobs:
        with:
          key: ggml-ci-arm64-cpu-high-perf
          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}

      - name: Dependencies
        id: depends
@@ -1495,6 +1592,7 @@ jobs:
        with:
          key: ggml-ci-arm64-cpu-high-perf-sve
          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}

      - name: Dependencies
        id: depends
@@ -1562,33 +1660,33 @@ jobs:
        run: |
          bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp

-  ggml-ci-x64-amd-vulkan:
-    runs-on: [self-hosted, Linux, X64, AMD]
+  # ggml-ci-x64-amd-vulkan:  # NOTE(review): job commented out by this change; the reason is not recorded here — confirm before re-enabling
+  #   runs-on: [self-hosted, Linux, X64, AMD]

-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
+  #   steps:
+  #     - name: Clone
+  #       id: checkout
+  #       uses: actions/checkout@v4

-      - name: Test
-        id: ggml-ci
-        run: |
-          vulkaninfo --summary
-          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+  #     - name: Test
+  #       id: ggml-ci
+  #       run: |
+  #         vulkaninfo --summary
+  #         GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp

-  ggml-ci-x64-amd-rocm:
-    runs-on: [self-hosted, Linux, X64, AMD]
+  # ggml-ci-x64-amd-rocm:  # NOTE(review): job commented out by this change; the reason is not recorded here — confirm before re-enabling
+  #   runs-on: [self-hosted, Linux, X64, AMD]

-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
+  #   steps:
+  #     - name: Clone
+  #       id: checkout
+  #       uses: actions/checkout@v4

-      - name: Test
-        id: ggml-ci
-        run: |
-          amd-smi static
-          GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+  #     - name: Test
+  #       id: ggml-ci
+  #       run: |
+  #         amd-smi static
+  #         GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp

  ggml-ci-mac-metal:
    runs-on: [self-hosted, macOS, ARM64]
@@ -1630,6 +1728,7 @@ jobs:
         with:
           key: ggml-ci-arm64-cpu-kleidiai
           evict-old-files: 1d
+           save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}  # true only for push events on master; presumably gates writing the cache back — confirm

       - name: Dependencies
         id: depends
@@ -1642,6 +1741,337 @@ jobs:
         run: |
           GG_BUILD_KLEIDIAI=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt

+  ubuntu-cpu-cmake-riscv64-native:  # Release CPU build, ctest run and llama2c conversion smoke test on a runner labelled RISCV64
+    runs-on: RISCV64
+
+    steps:
+      - name: Install dependencies
+        run: |
+          sudo apt-get update
+
+          # Packages used by the steps below: gcc/g++ 14, rustup, cmake, libssl-dev, wget and ccache
+          sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential libssl-dev wget ccache
+
+          # Make gcc-14/g++-14 the default gcc/g++: register them as alternatives, then force the /usr/bin symlinks
+          sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
+          sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
+          sudo ln -sf /usr/bin/gcc-14 /usr/bin/gcc
+          sudo ln -sf /usr/bin/g++-14 /usr/bin/g++
+
+          # Install the stable Rust toolchain and select it as the default
+          rustup install stable
+          rustup default stable
+
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Check environment  # prints tool versions into the job log
+        run: |
+          uname -a
+          gcc --version
+          g++ --version
+          ldd --version
+          cmake --version
+          rustc --version
+
+      - name: Setup ccache
+        run: |
+          # Cache directory used only by this job
+          export CCACHE_DIR="$HOME/.ccache/cpu-cmake-rv64-native"
+          mkdir -p "$CCACHE_DIR"
+
+          # Cap the cache at 5G, store entries compressed (level 6), keep them in $CCACHE_DIR
+          ccache --set-config=max_size=5G
+          ccache --set-config=compression=true
+          ccache --set-config=compression_level=6
+          ccache --set-config=cache_dir="$CCACHE_DIR"
+
+          # Relax ccache's checks (file/time macros, include-file mtime/ctime) and leave the working directory out of the hash
+          ccache --set-config=sloppiness=file_macro,time_macros,include_file_mtime,include_file_ctime
+          ccache --set-config=hash_dir=false
+
+          # Hand CCACHE_DIR and a PATH prefixed with /usr/lib/ccache to later steps through $GITHUB_ENV
+          echo "CCACHE_DIR=$CCACHE_DIR" >> $GITHUB_ENV
+          echo "PATH=/usr/lib/ccache:$PATH" >> $GITHUB_ENV
+
+      - name: Build  # Release configure + build through ccache; OpenMP off, GGML_RPC on, tests/examples/tools built
+        id: cmake_build
+        run: |
+          cmake -B build \
+            -DLLAMA_CURL=OFF \
+            -DLLAMA_OPENSSL=ON \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DGGML_OPENMP=OFF \
+            -DLLAMA_BUILD_EXAMPLES=ON \
+            -DLLAMA_BUILD_TOOLS=ON \
+            -DLLAMA_BUILD_TESTS=ON \
+            -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+            -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+            -DGGML_RPC=ON \
+            -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
+            -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
+
+          cmake --build build --config Release -j $(nproc)
+
+      - name: Test  # runs the ctest tests whose label matches main|curl, with a 900 s default test timeout
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L 'main|curl' --verbose --timeout 900
+
+      - name: Test llama2c conversion  # downloads a small llama2c model + tokenizer, converts to GGUF, runs a short completion
+        id: llama2c_test
+        run: |
+          cd build
+          echo "Fetch tokenizer"
+          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
+          echo "Fetch llama2c model"
+          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
+          ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
+          ./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
+
+  ubuntu-cmake-sanitizer-riscv64-native:  # Debug build on a runner labelled RISCV64, once per sanitizer in the matrix below
+    runs-on: RISCV64
+
+    continue-on-error: true  # a failure of this job does not fail the workflow run
+
+    strategy:
+      matrix:
+        sanitizer: [ADDRESS, THREAD, UNDEFINED]
+        build_type: [Debug]
+
+    steps:
+      - name: Install dependencies
+        run: |
+          sudo apt-get update
+
+          # Packages used by the steps below: gcc/g++ 14, libtsan2, rustup, cmake, wget and ccache
+          sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential wget ccache
+
+          # Make gcc-14/g++-14 the default gcc/g++: register them as alternatives, then force the /usr/bin symlinks
+          sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
+          sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
+          sudo ln -sf /usr/bin/gcc-14 /usr/bin/gcc
+          sudo ln -sf /usr/bin/g++-14 /usr/bin/g++
+
+          # Install the stable Rust toolchain and select it as the default
+          rustup install stable
+          rustup default stable
+
+      - name: GCC version check
+        run: |
+          gcc --version
+          g++ --version
+
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Setup ccache
+        run: |
+          # One cache directory per sanitizer/build_type combination
+          export CCACHE_DIR="$HOME/.ccache/sanitizer-${{ matrix.sanitizer }}-${{ matrix.build_type }}"
+          mkdir -p "$CCACHE_DIR"
+
+          # 5G cap, compressed entries (level 6), relaxed macro/timestamp checks, working directory left out of the hash
+          ccache --set-config=max_size=5G
+          ccache --set-config=compression=true
+          ccache --set-config=compression_level=6
+          ccache --set-config=cache_dir="$CCACHE_DIR"
+          ccache --set-config=sloppiness=file_macro,time_macros,include_file_mtime,include_file_ctime
+          ccache --set-config=hash_dir=false
+
+          # Hand CCACHE_DIR and a PATH prefixed with /usr/lib/ccache to later steps through $GITHUB_ENV
+          echo "CCACHE_DIR=$CCACHE_DIR" >> $GITHUB_ENV
+          echo "PATH=/usr/lib/ccache:$PATH" >> $GITHUB_ENV
+
+      - name: Build
+        id: cmake_build
+        if: ${{ matrix.sanitizer != 'THREAD' }}  # ADDRESS and UNDEFINED entries: configured with OpenMP on
+        run: |
+          cmake -B build \
+            -DLLAMA_CURL=OFF \
+            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
+            -DGGML_OPENMP=ON \
+            -DLLAMA_BUILD_EXAMPLES=ON \
+            -DLLAMA_BUILD_TOOLS=ON \
+            -DLLAMA_BUILD_TESTS=OFF \
+            -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+            -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
+            -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
+            -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
+
+          cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
+
+      - name: Build (no OpenMP)
+        id: cmake_build_no_openmp
+        if: ${{ matrix.sanitizer == 'THREAD' }}  # THREAD entry only: same configuration as above but with OpenMP off
+        run: |
+          cmake -B build \
+            -DLLAMA_CURL=OFF \
+            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
+            -DGGML_OPENMP=OFF \
+            -DLLAMA_BUILD_EXAMPLES=ON \
+            -DLLAMA_BUILD_TOOLS=ON \
+            -DLLAMA_BUILD_TESTS=OFF \
+            -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+            -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
+            -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
+            -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
+
+          cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
+      # NOTE(review): both build steps pass -DLLAMA_BUILD_TESTS=OFF — confirm ctest still finds tests labelled "main"
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose --timeout 900
+
+
+  ubuntu-llguidance-riscv64-native:  # Release build with LLAMA_LLGUIDANCE=ON on a runner labelled RISCV64
+    runs-on: RISCV64
+    steps:
+      - name: Install dependencies
+        run: |
+          sudo apt-get update
+
+          # Packages used by the steps below: gcc/g++ 14, rustup, cmake, wget and ccache
+          sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential wget ccache
+
+          # Make gcc-14/g++-14 the default gcc/g++: register them as alternatives, then force the /usr/bin symlinks
+          sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
+          sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
+          sudo ln -sf /usr/bin/gcc-14 /usr/bin/gcc
+          sudo ln -sf /usr/bin/g++-14 /usr/bin/g++
+
+          # Install the stable Rust toolchain and select it as the default
+          rustup install stable
+          rustup default stable
+
+      - name: GCC version check
+        run: |
+          gcc --version
+          g++ --version
+
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Setup ccache  # job-private cache dir, 5G cap, compression level 6; exported to later steps via $GITHUB_ENV
+        run: |
+          export CCACHE_DIR="$HOME/.ccache/llguidance-riscv64"
+          mkdir -p "$CCACHE_DIR"
+
+          ccache --set-config=max_size=5G
+          ccache --set-config=compression=true
+          ccache --set-config=compression_level=6
+          ccache --set-config=cache_dir="$CCACHE_DIR"
+          ccache --set-config=sloppiness=file_macro,time_macros,include_file_mtime,include_file_ctime
+          ccache --set-config=hash_dir=false
+
+          echo "CCACHE_DIR=$CCACHE_DIR" >> $GITHUB_ENV
+          echo "PATH=/usr/lib/ccache:$PATH" >> $GITHUB_ENV
+
+      - name: Build  # Release configure + build through ccache; OpenMP off, llguidance on, tests not built
+        id: cmake_build
+        run: |
+          cmake -B build \
+            -DLLAMA_CURL=OFF \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DGGML_OPENMP=OFF \
+            -DLLAMA_BUILD_EXAMPLES=ON \
+            -DLLAMA_BUILD_TOOLS=ON \
+            -DLLAMA_BUILD_TESTS=OFF \
+            -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+            -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+            -DLLAMA_LLGUIDANCE=ON \
+            -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
+            -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
+
+          cmake --build build --config Release -j $(nproc)
+      # NOTE(review): the build step passes -DLLAMA_BUILD_TESTS=OFF — confirm ctest still finds tests labelled "main"
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose --timeout 900
+
+
+  ubuntu-cmake-rpc-riscv64-native:  # Release build with GGML_RPC=ON plus ctest run on a runner labelled RISCV64
+    runs-on: RISCV64
+
+    continue-on-error: true  # a failure of this job does not fail the workflow run
+
+    steps:
+      - name: Install dependencies
+        run: |
+          sudo apt-get update
+
+          # Packages used by the steps below: gcc/g++ 14, rustup, cmake, libssl-dev, wget and ccache
+          sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential libssl-dev wget ccache
+
+          # Make gcc-14/g++-14 the default gcc/g++: register them as alternatives, then force the /usr/bin symlinks
+          sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
+          sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
+          sudo ln -sf /usr/bin/gcc-14 /usr/bin/gcc
+          sudo ln -sf /usr/bin/g++-14 /usr/bin/g++
+
+          # Install the stable Rust toolchain and select it as the default
+          rustup install stable
+          rustup default stable
+
+      - name: GCC version check
+        run: |
+          gcc --version
+          g++ --version
+
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Setup ccache  # job-private cache dir, 5G cap, compression level 6; exported to later steps via $GITHUB_ENV
+        run: |
+          export CCACHE_DIR="$HOME/.ccache/rpc-riscv64"
+          mkdir -p "$CCACHE_DIR"
+
+          ccache --set-config=max_size=5G
+          ccache --set-config=compression=true
+          ccache --set-config=compression_level=6
+          ccache --set-config=cache_dir="$CCACHE_DIR"
+          ccache --set-config=sloppiness=file_macro,time_macros,include_file_mtime,include_file_ctime
+          ccache --set-config=hash_dir=false
+
+          echo "CCACHE_DIR=$CCACHE_DIR" >> $GITHUB_ENV
+          echo "PATH=/usr/lib/ccache:$PATH" >> $GITHUB_ENV
+
+      - name: Build  # Release configure + build through ccache; OpenMP off, GGML_RPC on, tests/examples/tools built
+        id: cmake_build
+        run: |
+          cmake -B build \
+            -DLLAMA_CURL=OFF \
+            -DLLAMA_OPENSSL=ON \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DGGML_OPENMP=OFF \
+            -DLLAMA_BUILD_EXAMPLES=ON \
+            -DLLAMA_BUILD_TOOLS=ON \
+            -DLLAMA_BUILD_TESTS=ON \
+            -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+            -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+            -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
+            -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
+            -DGGML_RPC=ON
+
+          cmake --build build --config Release -j $(nproc)
+
+      - name: Test  # explicit 900 s default test timeout, matching the other riscv64-native jobs in this workflow
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose --timeout 900
+
  ggml-ci-arm64-graviton4-kleidiai:
     runs-on: ah-ubuntu_22_04-c8g_8x

@@ -1682,6 +2112,7 @@ jobs:
         with:
           key: ggml-ci-arm64-graviton4-kleidiai
           evict-old-files: 1d
+           save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}  # true only for push events on master; presumably gates writing the cache back — confirm

       - name: Test
         id: ggml-ci
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -66,13 +66,13 @@ jobs:
        id: pack_artifacts
        run: |
          cp LICENSE ./build/bin/
-          zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*
+          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz -s ",^\./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .  # -s re-roots members under llama-<tag>/; anchored so only a leading ./ is rewritten

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
-          name: llama-bin-macos-arm64.zip
+          path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz  # tarball written by the pack step above
+          name: llama-bin-macos-arm64.tar.gz

  macOS-x64:
    runs-on: macos-15-intel
@@ -120,13 +120,13 @@ jobs:
        id: pack_artifacts
        run: |
          cp LICENSE ./build/bin/
-          zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
+          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz -s ",^\./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .  # -s re-roots members under llama-<tag>/; anchored so only a leading ./ is rewritten

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
-          name: llama-bin-macos-x64.zip
+          path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz  # tarball written by the pack step above
+          name: llama-bin-macos-x64.tar.gz

  ubuntu-22-cpu:
    strategy:
@@ -182,13 +182,13 @@ jobs:
        id: pack_artifacts
        run: |
          cp LICENSE ./build/bin/
-          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/*
+          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.tar.gz --transform "s,^\./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .  # re-roots members under llama-<tag>/; anchored so only a leading ./ is rewritten

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip
-          name: llama-bin-ubuntu-${{ matrix.build }}.zip
+          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.tar.gz  # tarball written by the pack step above
+          name: llama-bin-ubuntu-${{ matrix.build }}.tar.gz

  ubuntu-22-vulkan:
    runs-on: ubuntu-22.04
@@ -235,13 +235,13 @@ jobs:
        id: pack_artifacts
        run: |
          cp LICENSE ./build/bin/
-          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip ./build/bin/*
+          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz --transform "s,^\./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .  # re-roots members under llama-<tag>/; anchored so only a leading ./ is rewritten

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip
-          name: llama-bin-ubuntu-vulkan-x64.zip
+          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz  # tarball written by the pack step above
+          name: llama-bin-ubuntu-vulkan-x64.tar.gz

  windows-cpu:
    runs-on: windows-2025
@@ -298,7 +298,7 @@ jobs:
        run: |
          Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\
          Copy-Item "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC\14.44.35112\debug_nonredist\${{ matrix.arch }}\Microsoft.VC143.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\
-          7z a llama-bin-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\*
+          7z a -snl llama-bin-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\*

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
@@ -380,7 +380,7 @@ jobs:
      - name: Pack artifacts
        id: pack_artifacts
        run: |
-          7z a llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip .\build\bin\Release\${{ matrix.target }}.dll
+          7z a -snl llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip .\build\bin\Release\${{ matrix.target }}.dll

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
@@ -393,7 +393,7 @@ jobs:

    strategy:
      matrix:
-        cuda: ['12.4']
+        cuda: ['12.4', '13.1']  # the pack steps of this job name one archive per version listed here

    steps:
      - name: Clone
@@ -434,7 +434,7 @@ jobs:
      - name: Pack artifacts
        id: pack_artifacts
        run: |
-          7z a llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip .\build\bin\Release\ggml-cuda.dll
+          7z a -snl llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip .\build\bin\Release\ggml-cuda.dll

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
@@ -448,6 +448,7 @@ jobs:
          $dst='.\build\bin\cudart\'
          robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
          robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
+          robocopy "${{env.CUDA_PATH}}\bin\x64" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
          7z a cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip $dst\*

      - name: Upload Cuda runtime
@@ -517,6 +518,8 @@ jobs:
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl-ls.exe" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-fallback-bfloat16.spv" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-native-bfloat16.spv" ./build/bin

          cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
@@ -526,7 +529,7 @@ jobs:
          cp "${{ env.ONEAPI_ROOT }}/umf/latest/bin/umf.dll" ./build/bin

          echo "cp oneAPI running time dll files to ./build/bin done"
-          7z a llama-bin-win-sycl-x64.zip ./build/bin/*
+          7z a -snl llama-bin-win-sycl-x64.zip ./build/bin/*

      - name: Upload the release package
        uses: actions/upload-artifact@v4
@@ -632,7 +635,7 @@ jobs:
      - name: Pack artifacts
        id: pack_artifacts
        run: |
-          7z a llama-bin-win-hip-${{ matrix.name }}-x64.zip .\build\bin\*
+          7z a -snl llama-bin-win-hip-${{ matrix.name }}-x64.zip .\build\bin\*

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
@@ -685,13 +688,14 @@ jobs:
      - name: Pack artifacts
        id: pack_artifacts
        run: |
-          zip --symlinks -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
+          tar -czvf llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz -C build-apple llama.xcframework

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
-          path: llama-${{ steps.tag.outputs.name }}-xcframework.zip
-          name: llama-${{ steps.tag.outputs.name }}-xcframework
+          path: llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz
+          name: llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz
+

  openEuler-cann:
    strategy:
@@ -700,28 +704,54 @@ jobs:
        chip_type: ['910b', '310p']
        build: ['Release']
    runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
-    container: ascendai/cann:${{ matrix.chip_type == '910b' &&  '8.3.rc1.alpha001-910b-openeuler22.03-py3.11' || '8.2.rc1-310p-openeuler22.03-py3.11' }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

-      - name: Dependencies
+      - name: Free up disk space
+        uses: ggml-org/free-disk-space@v1.3.1
+        with:
+          tool-cache: true
+
+      - name: Set container image  # picks the CANN image tag for this chip_type and publishes it as the step output "image"
+        id: cann-image
        run: |
-          yum update -y
-          yum install -y git gcc gcc-c++ make cmake libcurl-devel
-          git config --global --add safe.directory "$GITHUB_WORKSPACE"
+          image="ascendai/cann:${{ matrix.chip_type == '910b' &&  '8.3.rc2-910b-openeuler24.03-py3.11' || '8.3.rc2-310p-openeuler24.03-py3.11' }}"
+          echo "image=${image}" >> "${GITHUB_OUTPUT}"
+
+      - name: Pull container image  # the job-level container: setting is removed above; the build below uses docker run on this image
+        run: docker pull "${{ steps.cann-image.outputs.image }}"

      - name: Build
+        env:  # passed into the container with docker run -e; HOST_UID/HOST_GID are spliced into the script text so build/ is chowned to the runner user
+          BUILD_TYPE: ${{ matrix.build }}
+          SOC_TYPE: ascend${{ matrix.chip_type }}
        run: |
-          export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
+          HOST_UID=$(id -u)
+          HOST_GID=$(id -g)

-          cmake -S . -B build \
-              -DCMAKE_BUILD_TYPE=${{ matrix.build }} \
-              -DGGML_CANN=on \
-              -DSOC_TYPE=ascend${{ matrix.chip_type }}
-          cmake --build build -j $(nproc)
+          docker run --rm \
+            -v "${PWD}:/workspace" \
+            -w /workspace \
+            -e SOC_TYPE=${SOC_TYPE} \
+            -e BUILD_TYPE=${BUILD_TYPE} \
+            "${{ steps.cann-image.outputs.image }}" \
+            bash -lc '
+              set -e
+              yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake libcurl-devel
+              yum clean all && rm -rf /var/cache/yum
+              git config --global --add safe.directory "/workspace"
+              export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
+              cmake -S . -B build \
+                  -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
+                  -DGGML_CANN=on \
+                  -DSOC_TYPE=${SOC_TYPE}
+              cmake --build build -j $(nproc)
+
+              chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
+            '

      - name: Determine tag name
        id: tag
@@ -730,13 +760,13 @@ jobs:
      - name: Pack artifacts
        run: |
          cp LICENSE ./build/bin/
-          zip -r llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.zip ./build/bin/*
+          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.tar.gz --transform "s,^\./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .  # re-roots members under llama-<tag>/; anchored so only a leading ./ is rewritten

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.zip
-          name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.zip
+          path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.tar.gz  # tarball written by the pack step above
+          name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.tar.gz

  release:
    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@@ -814,6 +844,7 @@ jobs:

          echo "Moving other artifacts..."
          mv -v artifact/*.zip release
+          mv -v artifact/*.tar.gz release

      - name: Create release
        id: create_release
@@ -822,6 +853,37 @@ jobs:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          tag_name: ${{ steps.tag.outputs.name }}
+          body: |
+            <details open>
+
+            ${{ github.event.head_commit.message }}
+
+            </details>
+
+            **macOS/iOS:**
+            - [macOS Apple Silicon (arm64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz)
+            - [macOS Intel (x64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz)
+            - [iOS XCFramework](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz)
+
+            **Linux:**
+            - [Ubuntu x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.tar.gz)
+            - [Ubuntu x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz)
+            - [Ubuntu s390x (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-s390x.tar.gz)
+
+            **Windows:**
+            - [Windows x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-x64.zip)
+            - [Windows arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-arm64.zip)
+            - [Windows x64 (CUDA 12)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-12.4-x64.zip) - [CUDA 12.4 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-12.4-x64.zip)
+            - [Windows x64 (CUDA 13)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-13.1-x64.zip) - [CUDA 13.1 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-13.1-x64.zip)
+            - [Windows x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-vulkan-x64.zip)
+            - [Windows x64 (SYCL)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip)
+            - [Windows x64 (HIP)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-hip-radeon-x64.zip)
+
+            **openEuler:**
+            - [openEuler x86 (310p)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-310p-openEuler-x86.tar.gz)
+            - [openEuler x86 (910b)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-910b-openEuler-x86.tar.gz)
+            - [openEuler aarch64 (310p)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-310p-openEuler-aarch64.tar.gz)
+            - [openEuler aarch64 (910b)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-910b-openEuler-aarch64.tar.gz)

      - name: Upload release
        id: upload_release
@@ -833,7 +895,7 @@ jobs:
            const fs = require('fs');
            const release_id = '${{ steps.create_release.outputs.id }}';
            for (let file of await fs.readdirSync('./release')) {
-              if (path.extname(file) === '.zip') {
+              if (path.extname(file) === '.zip' || file.endsWith('.tar.gz')) { // extname() yields only ".gz" for *.tar.gz names, hence the explicit suffix test
                console.log('uploadReleaseAsset', file);
                await github.repos.uploadReleaseAsset({
                  owner: context.repo.owner,
--- a/.github/workflows/server-webui.yml
+++ b/.github/workflows/server-webui.yml
@@ -0,0 +1,225 @@
+# Server WebUI build and tests
+name: Server WebUI
+
+on:
+  workflow_dispatch: # allows manual triggering
+    inputs:
+      sha:
+        description: 'Commit SHA1 to build'
+        required: false
+        type: string
+      slow_tests:
+        description: 'Run slow tests'
+        required: true
+        type: boolean
+  push:
+    branches:
+      - master
+    paths: ['.github/workflows/server-webui.yml', 'tools/server/webui/**.*', 'tools/server/tests/**.*', 'tools/server/public/**']
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths: ['.github/workflows/server-webui.yml', 'tools/server/webui/**.*', 'tools/server/tests/**.*', 'tools/server/public/**']
+
+env:
+  LLAMA_LOG_COLORS: 1
+  LLAMA_LOG_PREFIX: 1
+  LLAMA_LOG_TIMESTAMPS: 1
+  LLAMA_LOG_VERBOSITY: 10
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  webui-check:
+    name: WebUI Checks
+    runs-on: ubuntu-latest
+    continue-on-error: true
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+
+      - name: Setup Node.js
+        id: node
+        uses: actions/setup-node@v4
+        with:
+          node-version: "22"
+          cache: "npm"
+          cache-dependency-path: "tools/server/webui/package-lock.json"
+
+      - name: Install dependencies
+        id: setup
+        if: ${{ steps.node.conclusion == 'success' }}
+        run: npm ci
+        working-directory: tools/server/webui
+
+      - name: Run type checking
+        if: ${{ always() && steps.setup.conclusion == 'success' }}
+        run: npm run check
+        working-directory: tools/server/webui
+
+      - name: Run linting
+        if: ${{ always() && steps.setup.conclusion == 'success' }}
+        run: npm run lint
+        working-directory: tools/server/webui
+
+      - name: Build application
+        if: ${{ always() && steps.setup.conclusion == 'success' }}
+        run: npm run build
+        working-directory: tools/server/webui
+
+      - name: Install Playwright browsers
+        id: playwright
+        if: ${{ always() && steps.setup.conclusion == 'success' }}
+        run: npx playwright install --with-deps
+        working-directory: tools/server/webui
+
+      - name: Build Storybook
+        if: ${{ always() && steps.playwright.conclusion == 'success' }}
+        run: npm run build-storybook
+        working-directory: tools/server/webui
+
+      - name: Run Client tests
+        if: ${{ always() && steps.playwright.conclusion == 'success' }}
+        run: npm run test:client
+        working-directory: tools/server/webui
+
+      - name: Run Unit tests
+        if: ${{ always() && steps.playwright.conclusion == 'success' }}
+        run: npm run test:unit
+        working-directory: tools/server/webui
+
+      - name: Run UI tests
+        if: ${{ always() && steps.playwright.conclusion == 'success' }}
+        run: npm run test:ui -- --testTimeout=60000
+        working-directory: tools/server/webui
+
+      - name: Run E2E tests
+        if: ${{ always() && steps.playwright.conclusion == 'success' }}
+        run: npm run test:e2e
+        working-directory: tools/server/webui
+
+  server-build:
+    runs-on: ubuntu-latest
+
+    strategy:
+      matrix:
+        sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
+        build_type: [RelWithDebInfo]
+        include:
+          - build_type: Release
+            sanitizer: ""
+      fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
+
+    steps:
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get -y install \
+            build-essential \
+            xxd \
+            git \
+            cmake \
+            curl \
+            wget \
+            language-pack-en \
+            libssl-dev
+
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+
+      - name: Python setup
+        id: setup_python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Tests dependencies
+        id: test_dependencies
+        run: |
+          pip install -r tools/server/tests/requirements.txt
+
+      - name: Setup Node.js for WebUI
+        uses: actions/setup-node@v4
+        with:
+          node-version: "22"
+          cache: "npm"
+          cache-dependency-path: "tools/server/webui/package-lock.json"
+
+      - name: Install WebUI dependencies
+        run: npm ci
+        working-directory: tools/server/webui
+
+      - name: Build WebUI
+        run: npm run build
+        working-directory: tools/server/webui
+
+      - name: Build (no OpenMP)
+        id: cmake_build_no_openmp
+        if: ${{ matrix.sanitizer == 'THREAD' }}
+        run: |
+          cmake -B build \
+              -DGGML_NATIVE=OFF \
+              -DLLAMA_CURL=OFF \
+              -DLLAMA_OPENSSL=ON \
+              -DLLAMA_BUILD_SERVER=ON \
+              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
+              -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
+              -DGGML_OPENMP=OFF ;
+          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
+
+      - name: Build (sanitizers)
+        id: cmake_build_sanitizers
+        if: ${{ matrix.sanitizer != '' && matrix.sanitizer != 'THREAD' }}
+        run: |
+          cmake -B build \
+              -DGGML_NATIVE=OFF \
+              -DLLAMA_CURL=OFF \
+              -DLLAMA_OPENSSL=ON \
+              -DLLAMA_BUILD_SERVER=ON \
+              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
+              -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
+          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
+
+      - name: Build
+        id: cmake_build
+        if: ${{ matrix.sanitizer == '' }}
+        run: |
+          cmake -B build \
+              -DGGML_NATIVE=OFF \
+              -DLLAMA_CURL=OFF \
+              -DLLAMA_OPENSSL=ON \
+              -DLLAMA_BUILD_SERVER=ON \
+              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
+          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
+
+      - name: Tests
+        id: server_integration_tests
+        if: ${{ matrix.sanitizer == '' }}
+        env:
+          GITHUB_ACTIONS: "true"
+        run: |
+          cd tools/server/tests
+          ./tests.sh
+
+      - name: Tests (sanitizers)
+        id: server_integration_tests_sanitizers
+        if: ${{ matrix.sanitizer != '' }}
+        run: |
+          cd tools/server/tests
+          LLAMA_SANITIZE=1 ./tests.sh
+
+      - name: Slow tests
+        id: server_integration_tests_slow
+        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
+        run: |
+          cd tools/server/tests
+          SLOW_TESTS=1 ./tests.sh
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -76,270 +76,6 @@ jobs:
        run: |
          pip install -r tools/server/tests/requirements.txt

-  webui-setup:
-    name: WebUI Setup
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
-      - name: Setup Node.js
-        uses: actions/setup-node@v4
-        with:
-          node-version: "22"
-          cache: "npm"
-          cache-dependency-path: "tools/server/webui/package-lock.json"
-
-      - name: Cache node_modules
-        uses: actions/cache@v4
-        id: cache-node-modules
-        with:
-          path: tools/server/webui/node_modules
-          key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
-          restore-keys: |
-            ${{ runner.os }}-node-modules-
-
-      - name: Install dependencies
-        if: steps.cache-node-modules.outputs.cache-hit != 'true'
-        run: npm ci
-        working-directory: tools/server/webui
-
-  webui-check:
-    needs: webui-setup
-    name: WebUI Check
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
-      - name: Setup Node.js
-        uses: actions/setup-node@v4
-        with:
-          node-version: "22"
-
-      - name: Restore node_modules cache
-        uses: actions/cache@v4
-        with:
-          path: tools/server/webui/node_modules
-          key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
-          restore-keys: |
-            ${{ runner.os }}-node-modules-
-
-      - name: Run type checking
-        run: npm run check
-        working-directory: tools/server/webui
-
-      - name: Run linting
-        run: npm run lint
-        working-directory: tools/server/webui
-
-  webui-build:
-    needs: webui-check
-    name: WebUI Build
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
-      - name: Setup Node.js
-        uses: actions/setup-node@v4
-        with:
-          node-version: "22"
-
-      - name: Restore node_modules cache
-        uses: actions/cache@v4
-        with:
-          path: tools/server/webui/node_modules
-          key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
-          restore-keys: |
-            ${{ runner.os }}-node-modules-
-
-      - name: Build application
-        run: npm run build
-        working-directory: tools/server/webui
-
-  webui-tests:
-    needs: webui-build
-    name: Run WebUI tests
-    permissions:
-      contents: read
-
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-
-      - name: Setup Node.js
-        uses: actions/setup-node@v4
-        with:
-          node-version: "22"
-
-      - name: Restore node_modules cache
-        uses: actions/cache@v4
-        with:
-          path: tools/server/webui/node_modules
-          key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
-          restore-keys: |
-            ${{ runner.os }}-node-modules-
-
-      - name: Install Playwright browsers
-        run: npx playwright install --with-deps
-        working-directory: tools/server/webui
-
-      - name: Build Storybook
-        run: npm run build-storybook
-        working-directory: tools/server/webui
-
-      - name: Run Client tests
-        run: npm run test:client
-        working-directory: tools/server/webui
-
-      - name: Run Server tests
-        run: npm run test:server
-        working-directory: tools/server/webui
-
-      - name: Run UI tests
-        run: npm run test:ui -- --testTimeout=60000
-        working-directory: tools/server/webui
-
-      - name: Run E2E tests
-        run: npm run test:e2e
-        working-directory: tools/server/webui
-
-  server-build:
-    needs: [webui-tests]
-    runs-on: ubuntu-latest
-
-    strategy:
-      matrix:
-        sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
-        build_type: [RelWithDebInfo]
-        include:
-          - build_type: Release
-            sanitizer: ""
-      fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
-
-    steps:
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get -y install \
-            build-essential \
-            xxd \
-            git \
-            cmake \
-            curl \
-            wget \
-            language-pack-en \
-            libssl-dev
-
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
-      - name: Python setup
-        id: setup_python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.11'
-
-      - name: Tests dependencies
-        id: test_dependencies
-        run: |
-          pip install -r tools/server/tests/requirements.txt
-
-      - name: Setup Node.js for WebUI
-        uses: actions/setup-node@v4
-        with:
-          node-version: "22"
-          cache: "npm"
-          cache-dependency-path: "tools/server/webui/package-lock.json"
-
-      - name: Install WebUI dependencies
-        run: npm ci
-        working-directory: tools/server/webui
-
-      - name: Build WebUI
-        run: npm run build
-        working-directory: tools/server/webui
-
-      - name: Build (no OpenMP)
-        id: cmake_build_no_openmp
-        if: ${{ matrix.sanitizer == 'THREAD' }}
-        run: |
-          cmake -B build \
-              -DGGML_NATIVE=OFF \
-              -DLLAMA_CURL=OFF \
-              -DLLAMA_OPENSSL=ON \
-              -DLLAMA_BUILD_SERVER=ON \
-              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-              -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-              -DGGML_OPENMP=OFF ;
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
-
-      - name: Build (sanitizers)
-        id: cmake_build_sanitizers
-        if: ${{ matrix.sanitizer != '' && matrix.sanitizer != 'THREAD' }}
-        run: |
-          cmake -B build \
-              -DGGML_NATIVE=OFF \
-              -DLLAMA_CURL=OFF \
-              -DLLAMA_OPENSSL=ON \
-              -DLLAMA_BUILD_SERVER=ON \
-              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-              -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
-
-      - name: Build (sanitizers)
-        id: cmake_build
-        if: ${{ matrix.sanitizer == '' }}
-        run: |
-          cmake -B build \
-              -DGGML_NATIVE=OFF \
-              -DLLAMA_CURL=OFF \
-              -DLLAMA_OPENSSL=ON \
-              -DLLAMA_BUILD_SERVER=ON \
-              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
-
-      - name: Tests
-        id: server_integration_tests
-        if: ${{ matrix.sanitizer == '' }}
-        env:
-          GITHUB_ACTIONS: "true"
-        run: |
-          cd tools/server/tests
-          ./tests.sh
-
-      - name: Tests (sanitizers)
-        id: server_integration_tests_sanitizers
-        if: ${{ matrix.sanitizer != '' }}
-        run: |
-          cd tools/server/tests
-          LLAMA_SANITIZE=1 ./tests.sh
-
-      - name: Slow tests
-        id: server_integration_tests_slow
-        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
-        run: |
-          cd tools/server/tests
-          SLOW_TESTS=1 ./tests.sh
-
-
  server-windows:
    runs-on: windows-2022

--- a/.github/workflows/winget.yml
+++ b/.github/workflows/winget.yml
@@ -9,6 +9,7 @@ jobs:
  update:
    name: Update Winget Package
    runs-on: ubuntu-latest
+    if: github.repository_owner == 'ggml-org'

    steps:
      - name: Install cargo binstall
--- a/.gitignore
+++ b/.gitignore
@@ -54,6 +54,7 @@
 /out/
 /tmp/
 /autogen-*.md
+/common/build-info.cpp

 # Deprecated

@@ -134,3 +135,5 @@ poetry.toml
 # IDE
 /*.code-workspace
 /.windsurf/
+# emscripten
+a.out.*
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -33,10 +33,24 @@ endif()

 option(LLAMA_USE_SYSTEM_GGML "Use system libggml" OFF)

+option(LLAMA_WASM_MEM64 "llama: use 64-bit memory in WASM builds" ON)
+
 if (EMSCRIPTEN)
    set(BUILD_SHARED_LIBS_DEFAULT OFF)

-    option(LLAMA_WASM_SINGLE_FILE "llama: embed WASM inside the generated llama.js" ON)
+    # Use 64-bit memory to support backend_get_memory queries
+    # TODO: analyze performance impact, see https://spidermonkey.dev/blog/2025/01/15/is-memory64-actually-worth-using
+    if (LLAMA_WASM_MEM64)
+      add_compile_options("-sMEMORY64=1")
+      add_link_options("-sMEMORY64=1")
+    endif()
+    add_link_options("-sALLOW_MEMORY_GROWTH=1")
+
+    option(LLAMA_WASM_SINGLE_FILE "llama: embed WASM inside the generated llama.js" OFF)
+    option(LLAMA_BUILD_HTML "llama: build HTML file" ON)
+    if (LLAMA_BUILD_HTML)
+        set(CMAKE_EXECUTABLE_SUFFIX ".html")
+    endif()
 else()
    if (MINGW)
        set(BUILD_SHARED_LIBS_DEFAULT OFF)
@@ -58,6 +72,12 @@ if (MSVC)
    add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/bigobj>")
 endif()

+if (LLAMA_STANDALONE)
+    # enable parallel builds for msbuild
+    list(APPEND CMAKE_VS_GLOBALS UseMultiToolTask=true)
+    list(APPEND CMAKE_VS_GLOBALS EnforceProcessCountAcrossBuilds=true)
+endif()
+
 if (CMAKE_SYSTEM_NAME STREQUAL "iOS")
    set(LLAMA_TOOLS_INSTALL_DEFAULT OFF)
 else()
@@ -179,11 +199,6 @@ if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML)
    # ... otherwise assume ggml is added by a parent CMakeLists.txt
 endif()

-if (MINGW)
-    # Target Windows 8 for PrefetchVirtualMemory
-    add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
-endif()
-
 #
 # build the library
 #
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -2,23 +2,25 @@
 # multiplie collaborators per item can be specified

 /.devops/*.Dockerfile                   @ngxson
-/.github/actions/                       @slaren @CISC
+/.github/actions/                       @CISC
 /.github/workflows/                     @CISC
-/.github/workflows/release.yml          @slaren
-/.github/workflows/winget.yml           @slaren
 /ci/                                    @ggerganov
 /cmake/                                 @ggerganov
 /common/CMakeLists.txt                  @ggerganov
-/common/arg.*                           @ggerganov @ericcurtin
+/common/arg.*                           @ggerganov
 /common/base64.hpp.*                    @ggerganov
 /common/build-info.*                    @ggerganov
+/common/chat.*                          @pwilkin
+/common/chat-peg-parser.*               @aldehir
 /common/common.*                        @ggerganov
 /common/console.*                       @ggerganov
 /common/http.*                          @angt
 /common/llguidance.*                    @ggerganov
 /common/log.*                           @ggerganov
+/common/peg-parser.*                    @aldehir
 /common/sampling.*                      @ggerganov
 /common/speculative.*                   @ggerganov
+/common/unicode.*                       @aldehir
 /convert_*.py                           @CISC
 /examples/batched.swift/                @ggerganov
 /examples/batched/                      @ggerganov
@@ -30,7 +32,7 @@
 /examples/export-docs/                  @ggerganov
 /examples/gen-docs/                     @ggerganov
 /examples/gguf/                         @ggerganov
-/examples/llama.android/                @ggerganov
+/examples/llama.android/                @ggerganov @hanyin-arm @naco-siren
 /examples/llama.swiftui/                @ggerganov
 /examples/llama.vim                     @ggerganov
 /examples/lookahead/                    @ggerganov
@@ -40,21 +42,14 @@
 /examples/passkey/                      @ggerganov
 /examples/retrieval/                    @ggerganov
 /examples/save-load-state/              @ggerganov
-/examples/simple-chat/                  @slaren
-/examples/simple/                       @slaren
 /examples/speculative-simple/           @ggerganov
 /examples/speculative/                  @ggerganov
 /ggml/cmake/                            @ggerganov
-/ggml/include/                          @ggerganov @slaren
-/ggml/src/ggml-alloc.c                  @slaren
-/ggml/src/ggml-backend*                 @slaren
-/ggml/src/ggml-blas/                    @slaren
-/ggml/src/ggml-common.h                 @ggerganov @slaren
-/ggml/src/ggml-cpu/                     @ggerganov @slaren
+/ggml/include/                          @ggerganov
+/ggml/src/ggml-common.h                 @ggerganov
+/ggml/src/ggml-cpu/                     @ggerganov
 /ggml/src/ggml-cpu/spacemit/            @alex-spacemit
-/ggml/src/ggml-cuda/common.cuh          @slaren
 /ggml/src/ggml-cuda/fattn*              @JohannesGaessler
-/ggml/src/ggml-cuda/ggml-cuda.cu        @slaren
 /ggml/src/ggml-cuda/mmf.*               @JohannesGaessler @am17an
 /ggml/src/ggml-cuda/mmq.*               @JohannesGaessler
 /ggml/src/ggml-cuda/mmvf.*              @JohannesGaessler
@@ -62,19 +57,19 @@
 /ggml/src/ggml-cuda/fattn-wmma*         @IMbackK
 /ggml/src/ggml-hip/                     @IMbackK
 /ggml/src/ggml-cuda/vendors/hip.h       @IMbackK
-/ggml/src/ggml-impl.h                   @ggerganov @slaren
+/ggml/src/ggml-impl.h                   @ggerganov
 /ggml/src/ggml-metal/                   @ggerganov
 /ggml/src/ggml-opencl/                  @lhez @max-krasnyansky
 /ggml/src/ggml-hexagon/                 @max-krasnyansky @lhez
 /ggml/src/ggml-opt.cpp                  @JohannesGaessler
 /ggml/src/ggml-quants.*                 @ggerganov
 /ggml/src/ggml-rpc/                     @rgerganov
-/ggml/src/ggml-threading.*              @ggerganov @slaren
+/ggml/src/ggml-threading.*              @ggerganov
 /ggml/src/ggml-vulkan/                  @0cc4m
 /ggml/src/ggml-webgpu/                  @reeselevine
 /ggml/src/ggml-zdnn/                    @taronaeo @Andreas-Krebbel @AlekseiNikiforovIBM
-/ggml/src/ggml.c                        @ggerganov @slaren
-/ggml/src/ggml.cpp                      @ggerganov @slaren
+/ggml/src/ggml.c                        @ggerganov
+/ggml/src/ggml.cpp                      @ggerganov
 /ggml/src/gguf.cpp                      @JohannesGaessler @Green-Sky
 /gguf-py/                               @CISC
 /media/                                 @ggerganov
@@ -86,28 +81,23 @@
 /src/llama-arch.*                       @CISC
 /src/llama-chat.*                       @ngxson
 /src/llama-graph.*                      @CISC
-/src/llama-model-loader.*               @slaren
 /src/llama-model.*                      @CISC
 /src/llama-vocab.*                      @CISC
 /src/models/                            @CISC
 /tests/                                 @ggerganov
-/tests/test-backend-ops.cpp             @slaren
-/tests/test-thread-safety.cpp           @slaren
+/tests/test-chat-*                      @pwilkin
 /tools/batched-bench/                   @ggerganov
-/tools/llama-bench/                     @slaren
-/tools/main/                            @ggerganov
+/tools/cli/                             @ngxson
+/tools/completion/                      @ggerganov
 /tools/mtmd/                            @ngxson
 /tools/perplexity/                      @ggerganov
 /tools/quantize/                        @ggerganov
 /tools/rpc/                             @rgerganov
-/tools/run/                             @ericcurtin
-/tools/server/*                         @ngxson @ggerganov @ericcurtin # no subdir
+/tools/server/*                         @ngxson @ggerganov # no subdir
 /tools/server/webui/                    @allozaur
 /tools/tokenize/                        @ggerganov
 /tools/tts/                             @ggerganov
 /vendor/                                @ggerganov
-/.clang-format                          @slaren
-/.clang-tidy                            @slaren
 /AUTHORS                                @ggerganov
 /CMakeLists.txt                         @ggerganov
 /CONTRIBUTING.md                        @ggerganov
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -15,10 +15,12 @@ The project differentiates between 3 levels of contributors:
    - If you modified the `ggml` source, run the `test-backend-ops` tool to check whether different backend implementations of the `ggml` operators produce consistent results (this requires access to at least two different `ggml` backends)
    - If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops`
 - Create separate PRs for each feature or fix. Avoid combining unrelated changes in a single PR
+- When adding support for a new model or feature, focus on **CPU support only** in the initial PR unless you have a good reason not to. Add support for other backends like CUDA in follow-up PRs
 - Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
- If your PR becomes stale, don't hesitate to ping the maintainers in the comments
+- If your PR becomes stale, rebase it on top of latest `master` to get the maintainers' attention
 - Maintainers will rely on your insights and approval when making a final decision to approve and merge a PR
 - Consider adding yourself to [CODEOWNERS](CODEOWNERS) to indicate your availability for reviewing related PRs
+- Using AI to generate PRs is permitted. However, you must (1) explicitly disclose how AI was used and (2) conduct a thorough manual review before publishing the PR. Note that trivial tab autocompletions do not require disclosure.

 # Pull requests (for maintainers)

--- a/README.md
+++ b/README.md
@@ -61,7 +61,7 @@ range of hardware - locally and in the cloud.
 - Plain C/C++ implementation without any dependencies
 - Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
 - AVX, AVX2, AVX512 and AMX support for x86 architectures
- RVV, ZVFH, ZFH and ZICBOP support for RISC-V architectures
+- RVV, ZVFH, ZFH, ZICBOP and ZIHINTPAUSE support for RISC-V architectures
 - 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
 - Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads GPUs via MUSA)
 - Vulkan and SYCL backend support
@@ -190,6 +190,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama)
 - Delphi [Embarcadero/llama-cpp-delphi](https://github.com/Embarcadero/llama-cpp-delphi)
 - Go (no CGo needed): [hybridgroup/yzma](https://github.com/hybridgroup/yzma)
+- Android: [llama.android](/examples/llama.android)

 </details>

@@ -276,6 +277,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [MUSA](docs/build.md#musa) | Moore Threads GPU |
 | [CUDA](docs/build.md#cuda) | Nvidia GPU |
 | [HIP](docs/build.md#hip) | AMD GPU |
+| [ZenDNN](docs/build.md#zendnn) | AMD CPU |
 | [Vulkan](docs/build.md#vulkan) | GPU |
 | [CANN](docs/build.md#cann) | Ascend NPU |
 | [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
@@ -312,7 +314,7 @@ The Hugging Face platform provides a variety of online tools for converting, qua

 To learn more about model quantization, [read this documentation](tools/quantize/README.md)

-## [`llama-cli`](tools/main)
+## [`llama-cli`](tools/cli)

 #### A CLI tool for accessing and experimenting with most of `llama.cpp`'s functionality.

@@ -346,19 +348,6 @@ To learn more about model quantization, [read this documentation](tools/quantize

    </details>

- <details>
-    <summary>Run simple text completion</summary>
-
-    To disable conversation mode explicitly, use `-no-cnv`
-
-    ```bash
-    llama-cli -m model.gguf -p "I believe the meaning of life is" -n 128 -no-cnv
-
-    # I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga – it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey.
-    ```
-
-    </details>
-
 - <details>
    <summary>Constrain the output with a custom grammar</summary>

@@ -537,7 +526,8 @@ To learn more about model quantization, [read this documentation](tools/quantize

 ## Other documentation

- [main (cli)](tools/main/README.md)
+- [cli](tools/cli/README.md)
+- [completion](tools/completion/README.md)
 - [server](tools/server/README.md)
 - [GBNF grammars](grammars/README.md)

@@ -613,3 +603,4 @@ $ echo "source ~/.llama-completion.bash" >> ~/.bashrc
 - [linenoise.cpp](./tools/run/linenoise.cpp/linenoise.cpp) - C++ library that provides readline-like line editing capabilities, used by `llama-run` - BSD 2-Clause License
 - [curl](https://curl.se/) - Client-side URL transfer library, used by various tools/examples - [CURL License](https://curl.se/docs/copyright.html)
 - [miniaudio.h](https://github.com/mackron/miniaudio) - Single-header audio format decoder, used by multimodal subsystem - Public domain
+- [subprocess.h](https://github.com/sheredom/subprocess.h) - Single-header process launching solution for C and C++ - Public domain
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -65,4 +65,9 @@ However, If you have discovered a security vulnerability in this project, please

 Please disclose it as a private [security advisory](https://github.com/ggml-org/llama.cpp/security/advisories/new).

+Please note that using AI to identify vulnerabilities and generate reports is permitted. However, you must (1) explicitly disclose how AI was used and (2) conduct a thorough manual review before submitting the report.
+
 A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure.
+
+> [!IMPORTANT]
+> For collaborators: if you are interested in helping out with reviewing private security disclosures, please see: https://github.com/ggml-org/llama.cpp/discussions/18080
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -45,7 +45,7 @@ sd=`dirname $0`
 cd $sd/../
 SRC=`pwd`

-CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON"
+CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=${LLAMA_FATAL_WARNINGS:-ON} -DLLAMA_CURL=ON -DGGML_SCHED_NO_REALLOC=ON"

 if [ ! -z ${GG_BUILD_METAL} ]; then
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
@@ -398,18 +398,20 @@ function gg_run_qwen3_0_6b {
    ./bin/llama-quantize ${model_bf16} ${model_q5_k} q5_k $(nproc)
    ./bin/llama-quantize ${model_bf16} ${model_q6_k} q6_k $(nproc)

-    (time ./bin/llama-cli -no-cnv --model ${model_f16}  -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-cli -no-cnv --model ${model_bf16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q8_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q4_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q4_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q5_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q5_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q2_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q3_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q4_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q5_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q6_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/llama-fit-params --model ${model_f16} ) 2>&1 | tee -a $OUT/${ci}-fp-f16.log
+
+    (time ./bin/llama-completion -no-cnv --model ${model_f16}  -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-completion -no-cnv --model ${model_bf16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log
+    (time ./bin/llama-completion -no-cnv --model ${model_q8_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-completion -no-cnv --model ${model_q4_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/llama-completion -no-cnv --model ${model_q4_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/llama-completion -no-cnv --model ${model_q5_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/llama-completion -no-cnv --model ${model_q5_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/llama-completion -no-cnv --model ${model_q2_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/llama-completion -no-cnv --model ${model_q3_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/llama-completion -no-cnv --model ${model_q4_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/llama-completion -no-cnv --model ${model_q5_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/llama-completion -no-cnv --model ${model_q6_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

    (time ./bin/llama-perplexity --model ${model_f16}  -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
    if [ -z ${GG_BUILD_NO_BF16} ]; then
@@ -428,10 +430,10 @@ function gg_run_qwen3_0_6b {

    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on  ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on  ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on  --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off                ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on                 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log

    function check_ppl {
        qnt="$1"
@@ -523,8 +525,10 @@ function gg_run_embd_bge_small {

    ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0

-    (time ./bin/llama-embedding --model ${model_f16}  -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-fit-params --model ${model_f16} ) 2>&1 | tee -a $OUT/${ci}-fp-f16.log
+
+    (time ./bin/llama-embedding --model ${model_f16}  -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log

    set +e
 }
@@ -563,8 +567,10 @@ function gg_run_rerank_tiny {

    model_f16="${path_models}/ggml-model-f16.gguf"

+    (time ./bin/llama-fit-params --model ${model_f16} ) 2>&1 | tee -a $OUT/${ci}-fp-f16.log
+
    # for this model, the SEP token is "</s>"
-    (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?\thi\nwhat is panda?\tit's a bear\nwhat is panda?\tThe giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
+    (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?\thi\nwhat is panda?\tit's a bear\nwhat is panda?\tThe giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --no-op-offload --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log

    # sample output
    # rerank score 0:    0.029
--- a/cmake/build-info.cmake
+++ b/cmake/build-info.cmake
@@ -39,26 +39,10 @@ if(Git_FOUND)
    endif()
 endif()

-if(MSVC)
-    set(BUILD_COMPILER "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
-    if (CMAKE_VS_PLATFORM_NAME)
-        set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME})
-    else()
-        set(BUILD_TARGET "${CMAKE_SYSTEM_NAME} ${CMAKE_SYSTEM_PROCESSOR}")
-    endif()
-else()
-    execute_process(
-        COMMAND ${CMAKE_C_COMPILER} --version
-        OUTPUT_VARIABLE OUT
-        OUTPUT_STRIP_TRAILING_WHITESPACE
-    )
-    string(REGEX REPLACE " *\n.*" "" OUT "${OUT}")
-    set(BUILD_COMPILER ${OUT})
+set(BUILD_COMPILER "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")

-    execute_process(
-        COMMAND ${CMAKE_C_COMPILER} -dumpmachine
-        OUTPUT_VARIABLE OUT
-        OUTPUT_STRIP_TRAILING_WHITESPACE
-    )
-    set(BUILD_TARGET ${OUT})
+if(CMAKE_VS_PLATFORM_NAME)
+    set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME})
+else()
+    set(BUILD_TARGET "${CMAKE_SYSTEM_NAME} ${CMAKE_SYSTEM_PROCESSOR}")
 endif()
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -52,6 +52,8 @@ add_library(${TARGET} STATIC
    chat-parser.h
    chat-parser-xml-toolcall.h
    chat-parser-xml-toolcall.cpp
+    chat-peg-parser.cpp
+    chat-peg-parser.h
    chat.cpp
    chat.h
    common.cpp
@@ -69,12 +71,18 @@ add_library(${TARGET} STATIC
    log.h
    ngram-cache.cpp
    ngram-cache.h
+    peg-parser.cpp
+    peg-parser.h
+    preset.cpp
+    preset.h
    regex-partial.cpp
    regex-partial.h
    sampling.cpp
    sampling.h
    speculative.cpp
    speculative.h
+    unicode.cpp
+    unicode.h
    )

 if (BUILD_SHARED_LIBS)
--- a/common/arg.cpp
+++ b/common/arg.cpp
--- a/common/arg.h
+++ b/common/arg.h
@@ -3,8 +3,13 @@
 #include "common.h"

 #include <set>
+#include <map>
 #include <string>
 #include <vector>
+#include <cstring>
+
+// pseudo-env variable to identify preset-only arguments
+#define COMMON_ARG_PRESET_LOAD_ON_STARTUP "__PRESET_LOAD_ON_STARTUP"

 //
 // CLI argument parsing
@@ -14,15 +19,20 @@ struct common_arg {
    std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
    std::set<enum llama_example> excludes = {};
    std::vector<const char *> args;
+    std::vector<const char *> args_neg;  // for negated args like --no-xxx
    const char * value_hint   = nullptr; // help text or example for arg value
    const char * value_hint_2 = nullptr; // for second arg value
    const char * env          = nullptr;
    std::string help;
    bool is_sparam = false; // is current arg a sampling param?
+    bool is_preset_only = false; // is current arg preset-only (not treated as CLI arg)
    void (*handler_void)   (common_params & params) = nullptr;
    void (*handler_string) (common_params & params, const std::string &) = nullptr;
    void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr;
    void (*handler_int)    (common_params & params, int) = nullptr;
+    void (*handler_bool)   (common_params & params, bool) = nullptr;
+
+    common_arg() = default;

    common_arg(
        const std::initializer_list<const char *> & args,
@@ -44,6 +54,13 @@ struct common_arg {
        void (*handler)(common_params & params)
    ) : args(args), help(help), handler_void(handler) {}

+    common_arg(
+        const std::initializer_list<const char *> & args,
+        const std::initializer_list<const char *> & args_neg,
+        const std::string & help,
+        void (*handler)(common_params & params, bool)
+    ) : args(args), args_neg(args_neg), help(help), handler_bool(handler) {}
+
    // support 2 values for arg
    common_arg(
        const std::initializer_list<const char *> & args,
@@ -57,13 +74,38 @@ struct common_arg {
    common_arg & set_excludes(std::initializer_list<enum llama_example> excludes);
    common_arg & set_env(const char * env);
    common_arg & set_sparam();
+    common_arg & set_preset_only();
    bool in_example(enum llama_example ex);
    bool is_exclude(enum llama_example ex);
    bool get_value_from_env(std::string & output) const;
    bool has_value_from_env() const;
-    std::string to_string();
+    std::string to_string() const;
+
+    // for using as key in std::map
+    bool operator<(const common_arg& other) const { // orders entries by their primary name, args[0]
+        if (args.empty() || other.args.empty()) {
+            return args.empty() && !other.args.empty(); // arg-less entries sort first, which keeps this a strict weak ordering
+        }
+        return strcmp(args[0], other.args[0]) < 0;
+    }
+    bool operator==(const common_arg& other) const {
+        if (args.empty() || other.args.empty()) {
+            return false;
+        }
+        return strcmp(args[0], other.args[0]) == 0;
+    }
+
+    // get all args and env vars (including negated args/env)
+    std::vector<std::string> get_args() const;
+    std::vector<std::string> get_env() const;
 };

+namespace common_arg_utils {
+    bool is_truthy(const std::string & value);
+    bool is_falsey(const std::string & value);
+    bool is_autoy(const std::string & value);
+}
+
 struct common_params_context {
    enum llama_example ex = LLAMA_EXAMPLE_COMMON;
    common_params & params;
@@ -76,7 +118,15 @@ struct common_params_context {
 // if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
 bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);

-// function to be used by test-arg-parser
+// parse input arguments from CLI into a map
+bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map);
+
+// populate preset-only arguments
+// these arguments are not treated as command line arguments
+// see: https://github.com/ggml-org/llama.cpp/issues/18163
+void common_params_add_preset_options(std::vector<common_arg> & args);
+
+// initialize argument parser context - used by test-arg-parser and preset
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);

 struct common_remote_params {
--- a/common/chat-parser-xml-toolcall.cpp
+++ b/common/chat-parser-xml-toolcall.cpp
@@ -724,16 +724,10 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons
        if (reasoning_unclosed) {
            if (auto pos = content.find(end_think); pos == std::string::npos && builder.pos() != builder.input().size()) {
                unclosed_reasoning_content += content;
-                if (form.allow_toolcall_in_think) {
-                    builder.move_to(tc->groups[0].begin);
-                    if (!builder.try_consume_xml_tool_calls(form)) {
-                        unclosed_reasoning_content += tool_call_start;
-                        builder.move_to(tc->groups[0].end);
-                    }
-                } else {
+                if (!(form.allow_toolcall_in_think && tc)) {
                    unclosed_reasoning_content += tool_call_start;
+                    continue;
                }
-                continue;
            } else {
                reasoning_unclosed = false;
                std::string reasoning_content;
@@ -781,8 +775,12 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons
                }
            } else {
                // This <tool_call> start is in thinking block, skip this tool call
-                auto pos = think_start + start_think.size();
-                unclosed_reasoning_content = content.substr(pos) + tool_call_start;
+                // This <tool_call> start is in thinking block
+                if (form.allow_toolcall_in_think) {
+                    unclosed_reasoning_content = content.substr(think_start + start_think.size());
+                } else {
+                    unclosed_reasoning_content = content.substr(think_start + start_think.size()) + tool_call_start;
+                }
                reasoning_unclosed = true;
                content.resize(think_start);
                toolcall_in_think = true;
@@ -805,14 +803,35 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons
        }

        // remove potential partial suffix
-        if (content.size() > 0 && builder.pos() == builder.input().size() && unclosed_reasoning_content.empty()) {
-            rstrip(content);
-            trim_potential_partial_word(content);
-            rstrip(content);
+        if (builder.pos() == builder.input().size()) {
+            if (unclosed_reasoning_content.empty()) {
+                rstrip(content);
+                trim_potential_partial_word(content);
+                rstrip(content);
+            } else {
+                rstrip(unclosed_reasoning_content);
+                trim_potential_partial_word(unclosed_reasoning_content);
+                rstrip(unclosed_reasoning_content);
+            }
+        }
+
+        // consume unclosed_reasoning_content if allow_toolcall_in_think is set
+        if (form.allow_toolcall_in_think && !unclosed_reasoning_content.empty()) {
+            if (builder.syntax().reasoning_format != COMMON_REASONING_FORMAT_NONE && !builder.syntax().reasoning_in_content) {
+                builder.add_reasoning_content(unclosed_reasoning_content);
+            } else {
+                if (content.empty()) {
+                    content = start_think + unclosed_reasoning_content;
+                } else {
+                    content += "\n\n" + start_think;
+                    content += unclosed_reasoning_content;
+                }
+            }
+            unclosed_reasoning_content.clear();
        }

        // Add content
-        if (content.size() != 0) {
+        if (!content.empty()) {
            // If there are multiple content blocks
            if (builder.syntax().reasoning_format != COMMON_REASONING_FORMAT_NONE && !builder.syntax().reasoning_in_content && builder.result().content.size() != 0) {
                builder.add_content("\n\n");
@@ -820,7 +839,7 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons
            builder.add_content(content);
        }

-        // This <tool_call> start is in thinking block, skip this tool call
+        // This <tool_call> start is in thinking block and toolcall_in_think not set, skip this tool call
        if (toolcall_in_think && !form.allow_toolcall_in_think) {
            continue;
        }
@@ -829,7 +848,7 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons
        if (!tc) {
            GGML_ASSERT(builder.pos() == builder.input().size());
            GGML_ASSERT(unclosed_reasoning_content.empty());
-            GGML_ASSERT(!reasoning_unclosed);
+            if (!form.allow_toolcall_in_think) GGML_ASSERT(!reasoning_unclosed);
            break;
        }

@@ -854,7 +873,6 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons

 /**
 * Parse content uses reasoning and XML-Style tool call
- * TODO: Note that form.allow_toolcall_in_think is not tested yet. If anyone confirms it works, this comment can be removed.
 */
 void common_chat_msg_parser::consume_reasoning_with_xml_tool_calls(const struct xml_tool_call_format & form, const std::string & start_think, const std::string & end_think) {
    parse_msg_with_xml_tool_calls(*this, form, start_think, end_think);
--- a/common/chat-parser-xml-toolcall.h
+++ b/common/chat-parser-xml-toolcall.h
@@ -31,7 +31,7 @@ struct xml_tool_call_format {
    std::optional<std::string> last_val_end = std::nullopt;
    std::optional<std::string> last_tool_end = std::nullopt;
    bool trim_raw_argval = false;
-    bool allow_toolcall_in_think = false; // TODO: UNTESTED!!!
+    bool allow_toolcall_in_think = false;
 };

 // make a GBNF that accept any strings except those containing any of the forbidden strings.
--- a/common/chat-parser.cpp
+++ b/common/chat-parser.cpp
--- a/common/chat-peg-parser.cpp
+++ b/common/chat-peg-parser.cpp
@@ -0,0 +1,124 @@
+#include "chat-peg-parser.h"
+
+#include <nlohmann/json.hpp>
+
+using json = nlohmann::json;
+
+static std::string_view trim_trailing_space(std::string_view sv, int max = -1) { // returns sv without its trailing whitespace; removes at most `max` characters, or all of them when max == -1
+    int count = 0; // whitespace characters removed so far
+    while (!sv.empty() && std::isspace(static_cast<unsigned char>(sv.back()))) {
+        if (max != -1 && count >= max) { // limit reached: keep the remaining whitespace
+            break;
+        }
+        sv.remove_suffix(1);
+        count++;
+    }
+    return sv;
+}
+
+void common_chat_peg_mapper::from_ast(const common_peg_ast_arena & arena, const common_peg_parse_result & result) { // visits the AST of a parse result and hands every visited node to map(); note the parameter `result` shadows the member `result`
+    arena.visit(result, [this](const common_peg_ast_node & node) {
+        map(node); // map() is virtual, so derived mappers receive the nodes too
+    });
+}
+
+void common_chat_peg_mapper::map(const common_peg_ast_node & node) { // copies the text of REASONING / CONTENT nodes into the result message; other tags are ignored here
+    bool is_reasoning = node.tag == common_chat_peg_builder::REASONING;
+    bool is_content = node.tag == common_chat_peg_builder::CONTENT;
+
+    if (is_reasoning) {
+        result.reasoning_content = std::string(trim_trailing_space(node.text)); // assignment: a later reasoning node replaces the earlier value
+    }
+
+    if (is_content) {
+        result.content = std::string(trim_trailing_space(node.text)); // assignment: a later content node replaces the earlier value
+    }
+}
+
+void common_chat_peg_native_mapper::map(const common_peg_ast_node & node) { // fills tool calls from TOOL_OPEN / TOOL_ID / TOOL_NAME / TOOL_ARGS nodes; the args text is stored as-is apart from trailing whitespace
+    common_chat_peg_mapper::map(node); // reasoning/content handling is done by the base class
+
+    bool is_tool_open = node.tag == common_chat_peg_native_builder::TOOL_OPEN;
+    bool is_tool_name = node.tag == common_chat_peg_native_builder::TOOL_NAME;
+    bool is_tool_id = node.tag == common_chat_peg_native_builder::TOOL_ID;
+    bool is_tool_args = node.tag == common_chat_peg_native_builder::TOOL_ARGS;
+
+    if (is_tool_open) { // begin a new tool call; the following id/name/args nodes fill it in
+        result.tool_calls.emplace_back();
+        current_tool = &result.tool_calls.back(); // points at the entry that was just appended
+    }
+
+    if (is_tool_id && current_tool) {
+        current_tool->id = std::string(trim_trailing_space(node.text));
+    }
+
+    if (is_tool_name && current_tool) {
+        current_tool->name = std::string(trim_trailing_space(node.text));
+    }
+
+    if (is_tool_args && current_tool) {
+        current_tool->arguments = std::string(trim_trailing_space(node.text));
+    }
+}
+
+void common_chat_peg_constructed_mapper::map(const common_peg_ast_node & node) { // builds each tool call's `arguments` JSON object text piece by piece from name/value nodes
+    common_chat_peg_mapper::map(node); // reasoning/content handling is done by the base class
+
+    bool is_tool_open = node.tag == common_chat_peg_constructed_builder::TOOL_OPEN;
+    bool is_tool_name = node.tag == common_chat_peg_constructed_builder::TOOL_NAME;
+    bool is_tool_close = node.tag == common_chat_peg_constructed_builder::TOOL_CLOSE;
+    bool is_arg_open = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_OPEN;
+    bool is_arg_close = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_CLOSE;
+    bool is_arg_name = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_NAME;
+    bool is_arg_string = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_STRING_VALUE;
+    bool is_arg_json = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_JSON_VALUE;
+
+    if (is_tool_open) { // begin a new tool call and restart the per-call argument counter
+        result.tool_calls.emplace_back();
+        current_tool = &result.tool_calls.back();
+        arg_count = 0;
+    }
+
+    if (is_tool_name && current_tool) { // guarded like every other branch: no dereference without an open tool call
+        current_tool->name = std::string(node.text);
+        current_tool->arguments = "{"; // opens the JSON object; closed on TOOL_CLOSE
+    }
+
+    if (is_arg_open) {
+        needs_closing_quote = false;
+    }
+
+    if (is_arg_name && current_tool) {
+        if (arg_count > 0) {
+            current_tool->arguments += ","; // separator between successive "name":value pairs
+        }
+        current_tool->arguments += json(trim_trailing_space(node.text)).dump() + ":";
+        ++arg_count;
+    }
+
+    if (is_arg_string && current_tool) {
+        // Serialize to JSON, but exclude the end quote
+        std::string dumped = json(trim_trailing_space(node.text)).dump();
+        current_tool->arguments += dumped.substr(0, dumped.size() - 1);
+        needs_closing_quote = true; // the quote is appended on TOOL_ARG_CLOSE or TOOL_CLOSE
+    }
+
+    if (is_arg_close && current_tool) {
+        if (needs_closing_quote) {
+            current_tool->arguments += "\"";
+            needs_closing_quote = false;
+        }
+    }
+
+    if (is_arg_json && current_tool) {
+        current_tool->arguments += std::string(trim_trailing_space(node.text)); // appended verbatim, no re-serialization
+    }
+
+    if (is_tool_close && current_tool) {
+        if (needs_closing_quote) { // a string value was still open when the tool call closed
+            current_tool->arguments += "\"";
+            needs_closing_quote = false;
+        }
+        current_tool->arguments += "}";
+    }
+}
--- a/common/chat-peg-parser.h
+++ b/common/chat-peg-parser.h
@@ -0,0 +1,105 @@
+#pragma once
+
+#include "chat.h"
+#include "peg-parser.h"
+
+class common_chat_peg_builder : public common_peg_parser_builder { // parser builder with helpers that tag sub-parsers as parts of a chat message
+  public:
+    static constexpr const char * REASONING_BLOCK = "reasoning-block"; // tag names; common_chat_peg_mapper::map compares node tags against REASONING and CONTENT
+    static constexpr const char * REASONING = "reasoning";
+    static constexpr const char * CONTENT = "content";
+
+    common_peg_parser reasoning_block(const common_peg_parser & p) { return tag(REASONING_BLOCK, p); } // each helper wraps p with the matching tag
+    common_peg_parser reasoning(const common_peg_parser & p) { return tag(REASONING, p); }
+    common_peg_parser content(const common_peg_parser & p) { return tag(CONTENT, p); }
+};
+
+inline common_peg_arena build_chat_peg_parser(const std::function<common_peg_parser(common_chat_peg_builder & builder)> & fn) { // builds a parser arena from a callback that describes the grammar
+    common_chat_peg_builder builder;
+    builder.set_root(fn(builder)); // the parser returned by fn becomes the root rule
+    return builder.build();
+}
+
+class common_chat_peg_mapper { // turns tagged AST nodes into fields of a common_chat_msg
+  public:
+    common_chat_msg & result; // message being filled in; held by reference, so it must outlive the mapper
+
+    common_chat_peg_mapper(common_chat_msg & msg) : result(msg) {}
+
+    virtual void from_ast(const common_peg_ast_arena & arena, const common_peg_parse_result & result); // visits the AST and calls map() on each visited node
+    virtual void map(const common_peg_ast_node & node); // handles a single node; overridden by the tool-call mappers
+};
+
+class common_chat_peg_native_builder : public common_chat_peg_builder { // adds tool-call tags whose args node is stored as-is by common_chat_peg_native_mapper
+  public:
+    static constexpr const char * TOOL = "tool"; // tag names for the parts of one tool call
+    static constexpr const char * TOOL_OPEN = "tool-open";
+    static constexpr const char * TOOL_CLOSE = "tool-close";
+    static constexpr const char * TOOL_ID = "tool-id";
+    static constexpr const char * TOOL_NAME = "tool-name";
+    static constexpr const char * TOOL_ARGS = "tool-args";
+
+    common_peg_parser tool(const common_peg_parser & p) { return tag(TOOL, p); }
+    common_peg_parser tool_open(const common_peg_parser & p) { return atomic(tag(TOOL_OPEN, p)); } // open/close/id/name are additionally wrapped in atomic(); tool and tool_args are not
+    common_peg_parser tool_close(const common_peg_parser & p) { return atomic(tag(TOOL_CLOSE, p)); }
+    common_peg_parser tool_id(const common_peg_parser & p) { return atomic(tag(TOOL_ID, p)); }
+    common_peg_parser tool_name(const common_peg_parser & p) { return atomic(tag(TOOL_NAME, p)); }
+    common_peg_parser tool_args(const common_peg_parser & p) { return tag(TOOL_ARGS, p); }
+};
+
+class common_chat_peg_native_mapper : public common_chat_peg_mapper { // mapper for tool calls tagged by common_chat_peg_native_builder
+    common_chat_tool_call * current_tool = nullptr; // tool call being filled; null until the first tool-open node so map()'s guards are well-defined
+
+  public:
+    common_chat_peg_native_mapper(common_chat_msg & msg) : common_chat_peg_mapper(msg) {}
+
+    void map(const common_peg_ast_node & node) override; // handles tool-open / tool-id / tool-name / tool-args on top of the base mapping
+};
+
+inline common_peg_arena build_chat_peg_native_parser(const std::function<common_peg_parser(common_chat_peg_native_builder & builder)> & fn) { // same as build_chat_peg_parser, but fn receives the native tool-call builder
+    common_chat_peg_native_builder builder;
+    builder.set_root(fn(builder)); // the parser returned by fn becomes the root rule
+    return builder.build();
+}
+
+class common_chat_peg_constructed_builder : public common_chat_peg_builder { // adds per-argument tags; common_chat_peg_constructed_mapper assembles the arguments JSON from them
+  public:
+    static constexpr const char * TOOL = "tool"; // tag names for the parts of one tool call and its arguments
+    static constexpr const char * TOOL_OPEN = "tool-open";
+    static constexpr const char * TOOL_CLOSE = "tool-close";
+    static constexpr const char * TOOL_NAME = "tool-name";
+    static constexpr const char * TOOL_ARG = "tool-arg";
+    static constexpr const char * TOOL_ARG_OPEN = "tool-arg-open";
+    static constexpr const char * TOOL_ARG_CLOSE = "tool-arg-close";
+    static constexpr const char * TOOL_ARG_NAME = "tool-arg-name";
+    static constexpr const char * TOOL_ARG_STRING_VALUE = "tool-arg-string-value"; // value that the mapper JSON-escapes and quotes
+    static constexpr const char * TOOL_ARG_JSON_VALUE = "tool-arg-json-value"; // value that the mapper appends verbatim
+
+    common_peg_parser tool(const common_peg_parser & p) { return tag(TOOL, p); }
+    common_peg_parser tool_open(const common_peg_parser & p) { return atomic(tag(TOOL_OPEN, p)); } // delimiter and name helpers are additionally wrapped in atomic(); tool, tool_arg and the value helpers are not
+    common_peg_parser tool_close(const common_peg_parser & p) { return atomic(tag(TOOL_CLOSE, p)); }
+    common_peg_parser tool_name(const common_peg_parser & p) { return atomic(tag(TOOL_NAME, p)); }
+    common_peg_parser tool_arg(const common_peg_parser & p) { return tag(TOOL_ARG, p); }
+    common_peg_parser tool_arg_open(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_OPEN, p)); }
+    common_peg_parser tool_arg_close(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_CLOSE, p)); }
+    common_peg_parser tool_arg_name(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_NAME, p)); }
+    common_peg_parser tool_arg_string_value(const common_peg_parser & p) { return tag(TOOL_ARG_STRING_VALUE, p); }
+    common_peg_parser tool_arg_json_value(const common_peg_parser & p) { return tag(TOOL_ARG_JSON_VALUE, p); }
+};
+
+class common_chat_peg_constructed_mapper : public common_chat_peg_mapper { // mapper for tool calls tagged by common_chat_peg_constructed_builder
+    common_chat_tool_call * current_tool = nullptr; // tool call being filled; null until the first tool-open node so map()'s guards are well-defined
+    int arg_count = 0; // arguments emitted for the current tool call (decides whether a "," is needed)
+    bool needs_closing_quote = false; // true while a string value has been written without its end quote
+
+  public:
+    common_chat_peg_constructed_mapper(common_chat_msg & msg) : common_chat_peg_mapper(msg) {}
+
+    void map(const common_peg_ast_node & node) override; // assembles the arguments JSON text from name/value nodes
+};
+
+inline common_peg_arena build_chat_peg_constructed_parser(const std::function<common_peg_parser(common_chat_peg_constructed_builder & builder)> & fn) { // same as build_chat_peg_parser, but fn receives the constructed tool-call builder
+    common_chat_peg_constructed_builder builder;
+    builder.set_root(fn(builder)); // the parser returned by fn becomes the root rule
+    return builder.build();
+}
--- a/common/chat.cpp
+++ b/common/chat.cpp
--- a/common/chat.h
+++ b/common/chat.h
@@ -3,6 +3,7 @@
 #pragma once

 #include "common.h"
+#include "peg-parser.h"
 #include <functional>
 #include <chrono>
 #include <string>
@@ -76,7 +77,7 @@ struct common_chat_msg_diff {
    size_t tool_call_index = std::string::npos;
    common_chat_tool_call tool_call_delta;

-    static std::vector<common_chat_msg_diff> compute_diffs(const common_chat_msg & previous_msg, const common_chat_msg & new_msg);
+    static std::vector<common_chat_msg_diff> compute_diffs(const common_chat_msg & msg_prv, const common_chat_msg & msg_new);

    bool operator==(const common_chat_msg_diff & other) const {
        return content_delta == other.content_delta
@@ -124,6 +125,11 @@ enum common_chat_format {
    COMMON_CHAT_FORMAT_APRIEL_1_5,
    COMMON_CHAT_FORMAT_XIAOMI_MIMO,

+    // These are intended to be parsed by the PEG parser
+    COMMON_CHAT_FORMAT_PEG_SIMPLE,
+    COMMON_CHAT_FORMAT_PEG_NATIVE,
+    COMMON_CHAT_FORMAT_PEG_CONSTRUCTED,
+
    COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
 };

@@ -154,6 +160,7 @@ struct common_chat_params {
    std::vector<common_grammar_trigger> grammar_triggers;
    std::vector<std::string>            preserved_tokens;
    std::vector<std::string>            additional_stops;
+    std::string                         parser;
 };

 struct common_chat_syntax {
@@ -163,6 +170,7 @@ struct common_chat_syntax {
    bool                     reasoning_in_content  = false;
    bool                     thinking_forced_open  = false;
    bool                     parse_tool_calls      = true;
+    common_peg_arena         parser                = {};
 };

 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
@@ -206,6 +214,7 @@ const char*               common_chat_format_name(common_chat_format format);
 const char*               common_reasoning_format_name(common_reasoning_format format);
 common_reasoning_format   common_reasoning_format_from_name(const std::string & format);
 common_chat_msg           common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
+common_chat_msg           common_chat_peg_parse(const common_peg_arena & parser, const std::string & input, bool is_partial, const common_chat_syntax & syntax);

 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);

--- a/common/common.cpp
+++ b/common/common.cpp
@@ -8,6 +8,7 @@
 #include "common.h"
 #include "log.h"
 #include "llama.h"
+#include "sampling.h"

 #include <algorithm>
 #include <cinttypes>
@@ -693,7 +694,7 @@ bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_over

 // Validate if a filename is safe to use
 // To validate a full path, split the path by the OS-specific path separator, and validate each part with this function
-bool fs_validate_filename(const std::string & filename) {
+bool fs_validate_filename(const std::string & filename, bool allow_subdirs) {
    if (!filename.length()) {
        // Empty filename invalid
        return false;
@@ -753,10 +754,14 @@ bool fs_validate_filename(const std::string & filename) {
            || (c >= 0xD800 && c <= 0xDFFF) // UTF-16 surrogate pairs
            || c == 0xFFFD // Replacement Character (UTF-8)
            || c == 0xFEFF // Byte Order Mark (BOM)
-            || c == '/' || c == '\\' || c == ':' || c == '*' // Illegal characters
+            || c == ':' || c == '*' // Illegal characters
            || c == '?' || c == '"' || c == '<' || c == '>' || c == '|') {
            return false;
        }
+        if (!allow_subdirs && (c == '/' || c == '\\')) {
+            // Subdirectories not allowed, reject path separators
+            return false;
+        }
    }

    // Reject any leading or trailing ' ', or any trailing '.', these are stripped on Windows and will cause a different filename
@@ -781,11 +786,29 @@ bool fs_validate_filename(const std::string & filename) {
 #include <iostream>


+#ifdef _WIN32
+static std::wstring utf8_to_wstring(const std::string & str) {
+    if (str.empty()) {
+        return std::wstring();
+    }
+
+    int size = MultiByteToWideChar(CP_UTF8, 0, str.c_str(), (int)str.size(), NULL, 0);
+
+    if (size <= 0) {
+        return std::wstring();
+    }
+
+    std::wstring wstr(size, 0);
+    MultiByteToWideChar(CP_UTF8, 0, str.c_str(), (int)str.size(), &wstr[0], size);
+
+    return wstr;
+}
+#endif
+
 // returns true if successful, false otherwise
 bool fs_create_directory_with_parents(const std::string & path) {
 #ifdef _WIN32
-    std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
-    std::wstring wpath = converter.from_bytes(path);
+    std::wstring wpath = utf8_to_wstring(path);

    // if the path already exists, check whether it's a directory
    const DWORD attributes = GetFileAttributesW(wpath.c_str());
@@ -858,6 +881,11 @@ bool fs_create_directory_with_parents(const std::string & path) {
 #endif // _WIN32
 }

+bool fs_is_directory(const std::string & path) {
+    std::error_code ec; // non-throwing overload: a missing path or any filesystem error is reported as "not a directory"
+    return std::filesystem::is_directory(std::filesystem::path(path), ec);
+}
+
 std::string fs_get_cache_directory() {
    std::string cache_directory = "";
    auto ensure_trailing_slash = [](std::string p) {
@@ -892,6 +920,8 @@ std::string fs_get_cache_directory() {
        cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
 #elif defined(_WIN32)
        cache_directory = std::getenv("LOCALAPPDATA");
+#elif defined(__EMSCRIPTEN__)
+        GGML_ABORT("not implemented on this platform");
 #else
 #  error Unknown architecture
 #endif
@@ -911,7 +941,7 @@ std::string fs_get_cache_file(const std::string & filename) {
    return cache_directory + filename;
 }

-std::vector<common_file_info> fs_list_files(const std::string & path) {
+std::vector<common_file_info> fs_list(const std::string & path, bool include_directories) {
    std::vector<common_file_info> files;
    if (path.empty()) return files;

@@ -926,14 +956,22 @@ std::vector<common_file_info> fs_list_files(const std::string & path) {
            const auto & p = entry.path();
            if (std::filesystem::is_regular_file(p)) {
                common_file_info info;
-                info.path = p.string();
-                info.name = p.filename().string();
+                info.path   = p.string();
+                info.name   = p.filename().string();
+                info.is_dir = false;
                try {
                    info.size = static_cast<size_t>(std::filesystem::file_size(p));
                } catch (const std::filesystem::filesystem_error &) {
                    info.size = 0;
                }
                files.push_back(std::move(info));
+            } else if (include_directories && std::filesystem::is_directory(p)) {
+                common_file_info info;
+                info.path   = p.string();
+                info.name   = p.filename().string();
+                info.size   = 0; // Directories have no size
+                info.is_dir = true;
+                files.push_back(std::move(info));
            }
        } catch (const std::filesystem::filesystem_error &) {
            // skip entries we cannot inspect
@@ -944,34 +982,217 @@ std::vector<common_file_info> fs_list_files(const std::string & path) {
    return files;
 }

+//
+// TTY utils
+//
+
+bool tty_can_use_colors() {
+    // Check NO_COLOR environment variable (https://no-color.org/)
+    if (const char * no_color = std::getenv("NO_COLOR")) {
+        if (no_color[0] != '\0') {
+            return false;
+        }
+    }
+
+    // Check TERM environment variable
+    if (const char * term = std::getenv("TERM")) {
+        if (std::strcmp(term, "dumb") == 0) {
+            return false;
+        }
+    }
+
+    // Check if stdout and stderr are connected to a terminal
+    // We check both because log messages can go to either
+    bool stdout_is_tty = isatty(fileno(stdout));
+    bool stderr_is_tty = isatty(fileno(stderr));
+
+    return stdout_is_tty || stderr_is_tty;
+}

 //
 // Model utils
 //

-struct common_init_result common_init_from_params(common_params & params) {
-    common_init_result iparams;
+// TODO: move to common/sampling
+static void common_init_sampler_from_model(
+    const llama_model * model,
+    common_params_sampling & sparams) {
+    // overrides sparams fields from model metadata, skipping every field whose bit is set in user_sampling_config
+    const uint64_t config = sparams.user_sampling_config;
+    // reads an integer metadata value into dst; unparsable values and values outside the int32 range are ignored
+    auto get_int32 = [&](const char * key, int32_t & dst, uint64_t user_config) {
+        if (config & user_config) {
+            return;
+        }
+
+        char buf[64] = {0};
+        if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
+            char * end = nullptr;
+            const long long v = strtoll(buf, &end, 10); // parse wide, then range-check instead of silently narrowing
+            if (end && end != buf && v >= INT32_MIN && v <= INT32_MAX) {
+                dst = (int32_t) v;
+            }
+        }
+    };
+    // reads a float metadata value into dst; unparsable values are ignored
+    auto get_float = [&](const char * key, float & dst, uint64_t user_config) {
+        if (config & user_config) {
+            return;
+        }
+
+        char buf[128] = {0};
+        if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
+            char * end = nullptr;
+            float v = strtof(buf, &end);
+            if (end && end != buf) {
+                dst = v;
+            }
+        }
+    };
+
+    // Sampling sequence
+    if (!(config & common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS)) {
+        char buf[512] = {0};
+        if (llama_model_meta_val_str(model, llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE), buf, sizeof(buf)) > 0) {
+            const std::vector<std::string> sampler_names = string_split<std::string>(std::string(buf), ';');
+            if (!sampler_names.empty()) {
+                sparams.samplers = common_sampler_types_from_names(sampler_names, true);
+            }
+        }
+    }
+
+    get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TOP_K),           sparams.top_k,           common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_K);
+    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TOP_P),           sparams.top_p,           common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_P);
+    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIN_P),           sparams.min_p,           common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIN_P);
+    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY), sparams.xtc_probability, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY);
+    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD),   sparams.xtc_threshold,   common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD);
+    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TEMP),            sparams.temp,            common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TEMP);
+    get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N),  sparams.penalty_last_n,  common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N);
+    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT),  sparams.penalty_repeat,  common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT);
+    get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT),        sparams.mirostat,        common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT);
+    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU),    sparams.mirostat_tau,    common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU);
+    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA),    sparams.mirostat_eta,    common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA);
+}
+
+struct common_init_result::impl {
+    impl() = default;
+    ~impl() = default;
+
+    llama_model_ptr   model;
+    llama_context_ptr context;
+
+    std::vector<llama_adapter_lora_ptr> lora;
+
+    std::vector<common_sampler_ptr> samplers;
+};
+
+common_init_result::common_init_result(common_params & params) :
+    pimpl(new impl{}) {
    auto mparams = common_model_params_to_llama(params);
+    auto cparams = common_context_params_to_llama(params);
+
+    if (params.fit_params) {
+        LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
+        llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
+            params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
+            params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
+    }

    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
    if (model == NULL) {
-        LOG_ERR("%s: failed to load model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
-            __func__, params.model.path.c_str());
-        return iparams;
+        return;
    }

+    pimpl->model.reset(model);
+
    const llama_vocab * vocab = llama_model_get_vocab(model);

-    auto cparams = common_context_params_to_llama(params);
+    // updates params.sampling
+    // TODO: fix naming
+    common_init_sampler_from_model(model, params.sampling);
+
+    if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
+        LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
+        params.sampling.ignore_eos = false;
+    }
+
+    // initialize once
+    for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
+        if (llama_vocab_is_eog(vocab, i)) {
+            LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(vocab, i).c_str(), -INFINITY);
+            params.sampling.logit_bias_eog.push_back({i, -INFINITY});
+        }
+    }
+
+    if (params.sampling.ignore_eos) {
+        // add EOG biases to the active set of logit biases
+        params.sampling.logit_bias.insert(
+                params.sampling.logit_bias.end(),
+                params.sampling.logit_bias_eog.begin(), params.sampling.logit_bias_eog.end());
+    }
+
+    //if (params.sampling.penalty_last_n == -1) {
+    //    LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+    //    params.sampling.penalty_last_n = llama_n_ctx(lctx);
+    //}
+
+    //if (params.sampling.dry_penalty_last_n == -1) {
+    //    LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+    //    params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
+    //}
+
+    pimpl->samplers.resize(cparams.n_seq_max);
+
+    for (int i = 0; i < (int) cparams.n_seq_max; ++i) {
+        pimpl->samplers[i].reset(common_sampler_init(model, params.sampling));
+    }

    llama_context * lctx = llama_init_from_model(model, cparams);
    if (lctx == NULL) {
-        LOG_ERR("%s: failed to create context with model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
-            __func__, params.model.path.c_str());
-        llama_model_free(model);
-        return iparams;
+        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
+        return;
    }

+    pimpl->context.reset(lctx);
+}
+
+llama_model * common_init_result::model() {
+    return pimpl->model.get();
+}
+
+llama_context * common_init_result::context() {
+    return pimpl->context.get();
+}
+
+common_sampler * common_init_result::sampler(llama_seq_id seq_id) {
+    return seq_id >= 0 && (size_t) seq_id < pimpl->samplers.size() ? pimpl->samplers[seq_id].get() : nullptr; // nullptr when out of range or when init failed before samplers were created
+}
+
+std::vector<llama_adapter_lora_ptr> & common_init_result::lora() {
+    return pimpl->lora;
+}
+
+void common_init_result::free_context() {
+    pimpl->context.reset();
+}
+
+common_init_result_ptr common_init_from_params(common_params & params) {
+    common_init_result_ptr res(new common_init_result(params));
+
+    llama_model * model = res->model();
+    if (model == NULL) {
+        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
+        return res;
+    }
+
+    llama_context * lctx = res->context();
+    if (lctx == NULL) {
+        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
+        return res;
+    }
+
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
    if (params.ctx_shift && !llama_memory_can_shift(llama_get_memory(lctx))) {
        LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
        params.ctx_shift = false;
@@ -983,10 +1204,7 @@ struct common_init_result common_init_from_params(common_params & params) {

        const auto cvec = common_control_vector_load(params.control_vectors);
        if (cvec.n_embd == -1) {
-            llama_free(lctx);
-            llama_model_free(model);
-
-            return iparams;
+            return res;
        }

        int err = llama_apply_adapter_cvec(
@@ -997,10 +1215,7 @@ struct common_init_result common_init_from_params(common_params & params) {
                params.control_vector_layer_start,
                params.control_vector_layer_end);
        if (err) {
-            llama_free(lctx);
-            llama_model_free(model);
-
-            return iparams;
+            return res;
        }
    }

@@ -1024,10 +1239,7 @@ struct common_init_result common_init_from_params(common_params & params) {
        }

        if (!ok) {
-            llama_free(lctx);
-            llama_model_free(model);
-
-            return iparams;
+            return res;
        }
    }

@@ -1037,9 +1249,7 @@ struct common_init_result common_init_from_params(common_params & params) {
        lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
        if (lora == nullptr) {
            LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
-            llama_free(lctx);
-            llama_model_free(model);
-            return iparams;
+            return res;
        }

        char buf[1024];
@@ -1048,43 +1258,13 @@ struct common_init_result common_init_from_params(common_params & params) {
        la.task_name = buf;
        llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
        la.prompt_prefix = buf;
-        iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
+        res->lora().emplace_back(std::move(lora)); // copy to list of loaded adapters
    }

    if (!params.lora_init_without_apply) {
        common_set_adapter_lora(lctx, params.lora_adapters);
    }

-    if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
-        LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
-        params.sampling.ignore_eos = false;
-    }
-
-    // initialize once
-    for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
-        if (llama_vocab_is_eog(vocab, i)) {
-            LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
-            params.sampling.logit_bias_eog.push_back({i, -INFINITY});
-        }
-    }
-
-    if (params.sampling.ignore_eos) {
-        // add EOG biases to the active set of logit biases
-        params.sampling.logit_bias.insert(
-                params.sampling.logit_bias.end(),
-                params.sampling.logit_bias_eog.begin(), params.sampling.logit_bias_eog.end());
-    }
-
-    if (params.sampling.penalty_last_n == -1) {
-        LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
-        params.sampling.penalty_last_n = llama_n_ctx(lctx);
-    }
-
-    if (params.sampling.dry_penalty_last_n == -1) {
-        LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
-        params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
-    }
-
    if (params.warmup) {
        LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);

@@ -1123,12 +1303,11 @@ struct common_init_result common_init_from_params(common_params & params) {
        llama_set_warmup(lctx, false);
    }

-    iparams.model.reset(model);
-    iparams.context.reset(lctx);
-
-    return iparams;
+    return res;
 }

+common_init_result::~common_init_result() = default;
+
 std::string get_model_endpoint() {
    const char * model_endpoint_env = getenv("MODEL_ENDPOINT");
    // We still respect the use of environment-variable "HF_ENDPOINT" for backward-compatibility.
@@ -1137,7 +1316,9 @@ std::string get_model_endpoint() {
    std::string model_endpoint = "https://huggingface.co/";
    if (endpoint_env) {
        model_endpoint = endpoint_env;
-        if (model_endpoint.back() != '/') model_endpoint += '/';
+        if (model_endpoint.back() != '/') {
+            model_endpoint += '/';
+        }
    }
    return model_endpoint;
 }
--- a/common/common.h
+++ b/common/common.h
@@ -12,6 +12,10 @@
 #include <vector>
 #include <map>

+#if defined(_WIN32) && !defined(_WIN32_WINNT)
+#define _WIN32_WINNT 0x0A00
+#endif
+
 #ifdef _WIN32
 #define DIRECTORY_SEPARATOR '\\'
 #else
@@ -26,8 +30,6 @@
    fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);    \
 } while(0)

-#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
-
 struct common_time_meas {
    common_time_meas(int64_t & t_acc, bool disable = false);
    ~common_time_meas();
@@ -80,7 +82,8 @@ int32_t cpu_get_num_math();
 enum llama_example {
    LLAMA_EXAMPLE_COMMON,
    LLAMA_EXAMPLE_SPECULATIVE,
-    LLAMA_EXAMPLE_MAIN,
+    LLAMA_EXAMPLE_COMPLETION,
+    LLAMA_EXAMPLE_CLI,
    LLAMA_EXAMPLE_EMBEDDING,
    LLAMA_EXAMPLE_PERPLEXITY,
    LLAMA_EXAMPLE_RETRIEVAL,
@@ -96,6 +99,7 @@ enum llama_example {
    LLAMA_EXAMPLE_TTS,
    LLAMA_EXAMPLE_DIFFUSION,
    LLAMA_EXAMPLE_FINETUNE,
+    LLAMA_EXAMPLE_FIT_PARAMS,

    LLAMA_EXAMPLE_COUNT,
 };
@@ -140,6 +144,22 @@ struct common_grammar_trigger {
    llama_token token = LLAMA_TOKEN_NULL;
 };

+enum common_params_sampling_config : uint64_t {
+    COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS        = 1 << 0,
+    COMMON_PARAMS_SAMPLING_CONFIG_TOP_K           = 1 << 1,
+    COMMON_PARAMS_SAMPLING_CONFIG_TOP_P           = 1 << 2,
+    COMMON_PARAMS_SAMPLING_CONFIG_MIN_P           = 1 << 3,
+    COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY = 1 << 4,
+    COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD   = 1 << 5,
+    COMMON_PARAMS_SAMPLING_CONFIG_TEMP            = 1 << 6,
+    COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N  = 1 << 7,
+    COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT  = 1 << 8,
+    COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT        = 1 << 9,
+    COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU    = 1 << 10,
+    COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA    = 1 << 11,
+};
+
+
 // sampling parameters
 struct common_params_sampling {
    uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
@@ -172,8 +192,9 @@ struct common_params_sampling {
    bool    no_perf            = false; // disable performance metrics
    bool    timing_per_token   = false;

-    std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"};     // default sequence breakers for DRY
+    uint64_t user_sampling_config = 0; // bitfield to track user-specified samplers

+    std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"};     // default sequence breakers for DRY

    std::vector<enum common_sampler_type> samplers = {
        COMMON_SAMPLER_TYPE_PENALTIES,
@@ -195,6 +216,10 @@ struct common_params_sampling {
    std::vector<llama_logit_bias> logit_bias;     // logit biases to apply
    std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens

+    bool has_logit_bias() const {
+        return !logit_bias.empty();
+    }
+
    // print the parameters into a string
    std::string print() const;
 };
@@ -205,6 +230,7 @@ struct common_params_model {
    std::string hf_repo     = ""; // HF repo                                                // NOLINT
    std::string hf_file     = ""; // HF file                                                // NOLINT
    std::string docker_repo = ""; // Docker repo                                            // NOLINT
+    std::string name        = ""; // in format <user>/<model>[:<tag>] (tag is optional)     // NOLINT
 };

 struct common_params_speculative {
@@ -281,8 +307,8 @@ struct lr_opt {
 struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);

 struct common_params {
-    int32_t n_predict             =    -1; // new tokens to predict
-    int32_t n_ctx                 =  4096; // context size
+    int32_t n_predict             =    -1; // max. number of new tokens to predict, -1 == no limit
+    int32_t n_ctx                 =     0; // context size, 0 == context the model was trained with
    int32_t n_batch               =  2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_ubatch              =   512; // physical batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_keep                =     0; // number of tokens to keep from initial prompt
@@ -303,9 +329,12 @@ struct common_params {
    // offload params
    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading

-    int32_t n_gpu_layers      = -1;  // number of layers to store in VRAM (-1 - use default)
-    int32_t main_gpu          = 0;   // the GPU that is used for scratch and small tensors
-    float   tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
+    int32_t n_gpu_layers       = -1;               // number of layers to store in VRAM (-1 - use default)
+    int32_t main_gpu           = 0;                // the GPU that is used for scratch and small tensors
+    float   tensor_split[128]  = {0};              // how split tensors should be distributed across GPUs
+    bool    fit_params         = true;             // whether to fit unset model/context parameters to free device memory
+    size_t  fit_params_target  = 1024 * 1024*1024; // margin per device in bytes for fitting parameters to free memory
+    int32_t fit_params_min_ctx = 4096;             // minimum context size to set when trying to reduce memory use

    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs

@@ -351,7 +380,7 @@ struct common_params {

    std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale

-    int32_t verbosity                  = 0;
+    int32_t verbosity                  = 3;  // LOG_LEVEL_INFO
    int32_t control_vector_layer_start = -1; // layer range for control vector
    int32_t control_vector_layer_end   = -1; // layer range for control vector
    bool    offline                    = false;
@@ -385,6 +414,7 @@ struct common_params {
    bool simple_io         = false; // improves compatibility with subprocesses and limited consoles
    bool cont_batching     = true;  // insert new sequences for decoding on-the-fly
    bool no_perf           = false; // disable performance metrics
+    bool show_timings      = true;  // show timing information on CLI
    bool ctx_shift         = false; // context shift on infinite text generation
    bool swa_full          = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
    bool kv_unified        = false; // enable unified KV cache
@@ -441,7 +471,7 @@ struct common_params {
    std::string public_path   = "";                                                                         // NOLINT
    std::string api_prefix    = "";                                                                         // NOLINT
    std::string chat_template = "";                                                                         // NOLINT
-    bool use_jinja = false;                                                                                 // NOLINT
+    bool use_jinja = true;                                                                                  // NOLINT
    bool enable_chat_template = true;
    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
    int reasoning_budget = -1;
@@ -454,15 +484,25 @@ struct common_params {

    std::map<std::string, std::string> default_template_kwargs;

+    // webui configs
+    bool webui = true;
+    std::string webui_config_json;
+
    // "advanced" endpoints are disabled by default for better security
-    bool webui            = true;
    bool endpoint_slots   = true;
    bool endpoint_props   = false; // only control POST requests, not GET
    bool endpoint_metrics = false;

+    // router server configs
+    std::string models_dir    = ""; // directory containing models for the router server
+    std::string models_preset = ""; // directory containing model presets for the router server
+    int models_max = 4;             // maximum number of models to load simultaneously
+    bool models_autoload = true;    // automatically load models when requested via the router server
+
    bool log_json = false;

    std::string slot_save_path;
+    std::string media_path; // path to directory for loading media files

    float slot_prompt_similarity = 0.1f;

@@ -613,8 +653,9 @@ std::string string_from(const struct llama_context * ctx, const struct llama_bat
 // Filesystem utils
 //

-bool fs_validate_filename(const std::string & filename);
+bool fs_validate_filename(const std::string & filename, bool allow_subdirs = false);
 bool fs_create_directory_with_parents(const std::string & path);
+bool fs_is_directory(const std::string & path);

 std::string fs_get_cache_directory();
 std::string fs_get_cache_file(const std::string & filename);
@@ -623,22 +664,44 @@ struct common_file_info {
    std::string path;
    std::string name;
    size_t      size = 0; // in bytes
+    bool        is_dir = false;
 };
-std::vector<common_file_info> fs_list_files(const std::string & path);
+std::vector<common_file_info> fs_list(const std::string & path, bool include_directories);
+
+//
+// TTY utils
+//
+
+// Auto-detect if colors can be enabled based on terminal and environment
+bool tty_can_use_colors();

 //
 // Model utils
 //

-// note: defines object's lifetime
-struct common_init_result {
-    llama_model_ptr   model;
-    llama_context_ptr context;
+struct common_sampler;

-    std::vector<llama_adapter_lora_ptr> lora;
+// note: defines the lifetimes of the model, context, samplers, etc.
+struct common_init_result {
+    common_init_result(common_params & params);
+    ~common_init_result();
+
+    llama_model * model();
+    llama_context * context();
+    common_sampler * sampler(llama_seq_id seq_id);
+
+    std::vector<llama_adapter_lora_ptr> & lora();
+
+    void free_context();
+
+private:
+    struct impl;
+    std::unique_ptr<impl> pimpl;
 };

-struct common_init_result     common_init_from_params(common_params & params);
+using common_init_result_ptr = std::unique_ptr<common_init_result>;
+
+common_init_result_ptr common_init_from_params(common_params & params);

 struct llama_model_params     common_model_params_to_llama  (      common_params & params);
 struct llama_context_params   common_context_params_to_llama(const common_params & params);
--- a/common/console.cpp
+++ b/common/console.cpp
@@ -1,6 +1,16 @@
 #include "console.h"
+#include "log.h"
 #include <vector>
 #include <iostream>
+#include <cassert>
+#include <cstddef>
+#include <cctype>
+#include <cwctype>
+#include <cstdint>
+#include <condition_variable>
+#include <mutex>
+#include <thread>
+#include <stdarg.h>

 #if defined(_WIN32)
 #define WIN32_LEAN_AND_MEAN
@@ -30,26 +40,44 @@
 #define ANSI_COLOR_BLUE    "\x1b[34m"
 #define ANSI_COLOR_MAGENTA "\x1b[35m"
 #define ANSI_COLOR_CYAN    "\x1b[36m"
+#define ANSI_COLOR_GRAY    "\x1b[90m"
 #define ANSI_COLOR_RESET   "\x1b[0m"
 #define ANSI_BOLD          "\x1b[1m"

 namespace console {

+#if defined (_WIN32)
+    namespace {
+        // Use private-use unicode values to represent special keys that are not reported
+        // as characters (e.g. arrows on Windows). These values should never clash with
+        // real input and let the rest of the code handle navigation uniformly.
+        static constexpr char32_t KEY_ARROW_LEFT       = 0xE000;
+        static constexpr char32_t KEY_ARROW_RIGHT      = 0xE001;
+        static constexpr char32_t KEY_ARROW_UP         = 0xE002;
+        static constexpr char32_t KEY_ARROW_DOWN       = 0xE003;
+        static constexpr char32_t KEY_HOME             = 0xE004;
+        static constexpr char32_t KEY_END              = 0xE005;
+        static constexpr char32_t KEY_CTRL_ARROW_LEFT  = 0xE006;
+        static constexpr char32_t KEY_CTRL_ARROW_RIGHT = 0xE007;
+        static constexpr char32_t KEY_DELETE           = 0xE008;
+    }
+
    //
    // Console state
    //
+#endif

-    static bool      advanced_display = false;
-    static bool      simple_io        = true;
-    static display_t current_display  = reset;
+    static bool         advanced_display = false;
+    static bool         simple_io        = true;
+    static display_type current_display  = DISPLAY_TYPE_RESET;

-    static FILE*     out              = stdout;
+    static FILE*        out              = stdout;

 #if defined (_WIN32)
-    static void*     hConsole;
+    static void*        hConsole;
 #else
-    static FILE*     tty              = nullptr;
-    static termios   initial_state;
+    static FILE*        tty              = nullptr;
+    static termios      initial_state;
 #endif

    //
@@ -120,7 +148,7 @@ namespace console {

    void cleanup() {
        // Reset console display
-        set_display(reset);
+        set_display(DISPLAY_TYPE_RESET);

 #if !defined(_WIN32)
        // Restore settings on POSIX systems
@@ -140,20 +168,26 @@ namespace console {
    //

    // Keep track of current display and only emit ANSI code if it changes
-    void set_display(display_t display) {
+    void set_display(display_type display) {
        if (advanced_display && current_display != display) {
-            fflush(stdout);
+            common_log_flush(common_log_main());
            switch(display) {
-                case reset:
+                case DISPLAY_TYPE_RESET:
                    fprintf(out, ANSI_COLOR_RESET);
                    break;
-                case prompt:
+                case DISPLAY_TYPE_INFO:
+                    fprintf(out, ANSI_COLOR_MAGENTA);
+                    break;
+                case DISPLAY_TYPE_PROMPT:
                    fprintf(out, ANSI_COLOR_YELLOW);
                    break;
-                case user_input:
+                case DISPLAY_TYPE_REASONING:
+                    fprintf(out, ANSI_COLOR_GRAY);
+                    break;
+                case DISPLAY_TYPE_USER_INPUT:
                    fprintf(out, ANSI_BOLD ANSI_COLOR_GREEN);
                    break;
-                case error:
+                case DISPLAY_TYPE_ERROR:
                    fprintf(out, ANSI_BOLD ANSI_COLOR_RED);
            }
            current_display = display;
@@ -176,7 +210,18 @@ namespace console {
            if (record.EventType == KEY_EVENT && record.Event.KeyEvent.bKeyDown) {
                wchar_t wc = record.Event.KeyEvent.uChar.UnicodeChar;
                if (wc == 0) {
-                    continue;
+                    const DWORD ctrl_mask = LEFT_CTRL_PRESSED | RIGHT_CTRL_PRESSED;
+                    const bool ctrl_pressed = (record.Event.KeyEvent.dwControlKeyState & ctrl_mask) != 0;
+                    switch (record.Event.KeyEvent.wVirtualKeyCode) {
+                        case VK_LEFT:   return ctrl_pressed ? KEY_CTRL_ARROW_LEFT  : KEY_ARROW_LEFT;
+                        case VK_RIGHT:  return ctrl_pressed ? KEY_CTRL_ARROW_RIGHT : KEY_ARROW_RIGHT;
+                        case VK_UP:     return KEY_ARROW_UP;
+                        case VK_DOWN:   return KEY_ARROW_DOWN;
+                        case VK_HOME:   return KEY_HOME;
+                        case VK_END:    return KEY_END;
+                        case VK_DELETE: return KEY_DELETE;
+                        default:        continue;
+                    }
                }

                if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate
@@ -315,6 +360,52 @@ namespace console {
 #endif
    }

+    // Decode one UTF-8 sequence starting at input[pos]; `advance` receives the bytes consumed.
+    // Any malformed sequence (out-of-range pos, truncated, bad continuation byte, overlong
+    // form, UTF-16 surrogate, or value above U+10FFFF) yields U+FFFD with advance == 1.
+    static char32_t decode_utf8(const std::string & input, size_t pos, size_t & advance) {
+        constexpr char32_t REPLACEMENT = 0xFFFD;
+        advance = 1;
+        if (pos >= input.size()) {
+            return REPLACEMENT;
+        }
+
+        const unsigned char c = static_cast<unsigned char>(input[pos]);
+        if ((c & 0x80u) == 0u) {
+            return c;
+        }
+
+        // sequence length and smallest code point that legitimately needs that length
+        size_t   len    = 0;
+        char32_t min_cp = 0;
+        char32_t cp     = 0;
+        if ((c & 0xE0u) == 0xC0u) {
+            len = 2; min_cp = 0x80;    cp = c & 0x1Fu;
+        } else if ((c & 0xF0u) == 0xE0u) {
+            len = 3; min_cp = 0x800;   cp = c & 0x0Fu;
+        } else if ((c & 0xF8u) == 0xF0u) {
+            len = 4; min_cp = 0x10000; cp = c & 0x07u;
+        } else {
+            return REPLACEMENT; // stray continuation byte or invalid lead byte
+        }
+        if (len > input.size() - pos) {
+            return REPLACEMENT; // truncated sequence
+        }
+
+        for (size_t i = 1; i < len; ++i) {
+            const unsigned char cc = static_cast<unsigned char>(input[pos + i]);
+            if ((cc & 0xC0u) != 0x80u) {
+                return REPLACEMENT;
+            }
+            cp = (cp << 6) | (cc & 0x3Fu);
+        }
+        if (cp < min_cp || cp > 0x10FFFF || (cp >= 0xD800 && cp <= 0xDFFF)) {
+            return REPLACEMENT; // overlong encoding, out of range, or surrogate
+        }
+        advance = len;
+        return cp;
+    }
+
    static void append_utf8(char32_t ch, std::string & out) {
        if (ch <= 0x7F) {
            out.push_back(static_cast<unsigned char>(ch));
@@ -336,22 +427,319 @@ namespace console {
    }

-    // Helper function to remove the last UTF-8 character from a string
+    // Helpers to find the byte offset of the previous / next UTF-8 character boundary
-    static void pop_back_utf8_char(std::string & line) {
-        if (line.empty()) {
+    static size_t prev_utf8_char_pos(const std::string & line, size_t pos) {
+        // step back one byte, then keep going while sitting on a continuation byte (10xxxxxx)
+        size_t idx = pos;
+        if (idx != 0) {
+            for (--idx; idx != 0 && (line[idx] & 0xC0) == 0x80; --idx) {}
+        }
+        return idx;
+    }
+
+    static size_t next_utf8_char_pos(const std::string & line, size_t pos) {
+        // advance past the current byte and any continuation bytes (10xxxxxx) that follow it
+        const size_t len = line.length();
+        if (pos >= len) return len;
+        size_t idx = pos + 1;
+        for (; idx < len && (line[idx] & 0xC0) == 0x80; ++idx) {}
+        return idx;
+    }
+
+    static void move_cursor(int delta);
+    static void move_word_left(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths, const std::string & line);
+    static void move_word_right(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths, const std::string & line);
+    static void move_to_line_start(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths);
+    static void move_to_line_end(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths, const std::string & line);
+
+    static void delete_at_cursor(std::string & line, std::vector<int> & widths, size_t & char_pos, size_t & byte_pos) {
+        if (char_pos >= widths.size()) { // cursor is at the end of the line: nothing to delete
             return;
         }

-        size_t pos = line.length() - 1;
+        size_t next_pos = next_utf8_char_pos(line, byte_pos);
+        int w = widths[char_pos]; // display width of the removed character, read before erasing
+        size_t char_len = next_pos - byte_pos;

-        // Find the start of the last UTF-8 character (checking up to 4 bytes back)
-        for (size_t i = 0; i < 3 && pos > 0; ++i, --pos) {
-            if ((line[pos] & 0xC0) != 0x80) {
-                break; // Found the start of the character
-            }
+        line.erase(byte_pos, char_len);
+        widths.erase(widths.begin() + char_pos);
+
+        size_t p = byte_pos;
+        int tail_width = 0;
+        for (size_t i = char_pos; i < widths.size(); ++i) { // reprint everything that followed the removed character
+            size_t following = next_utf8_char_pos(line, p);
+            put_codepoint(line.c_str() + p, following - p, widths[i]);
+            tail_width += widths[i];
+            p = following;
         }
-        line.erase(pos);
+
+        for (int i = 0; i < w; ++i) { // overwrite the cells left over after the reprinted tail
+            fputc(' ', out);
+        }
+
+        move_cursor(-(tail_width + w)); // undo the tail + padding output; char_pos / byte_pos are left unchanged
     }

+    // Write one space per display cell of `widths` from the current cursor position, then move back.
+    static void clear_current_line(const std::vector<int> & widths) {
+        int cells = 0; // entries with width <= 0 are counted as one cell
+        for (size_t i = 0; i < widths.size(); ++i) {
+            cells += widths[i] > 0 ? widths[i] : 1;
+        }
+        if (cells <= 0) return;
+
+        const std::string blanks(cells, ' ');
+        fwrite(blanks.data(), 1, blanks.size(), out);
+        move_cursor(-cells);
+    }
+
+    static void set_line_contents(std::string new_line, std::string & line, std::vector<int> & widths, size_t & char_pos,
+                                  size_t & byte_pos) {
+        move_to_line_start(char_pos, byte_pos, widths); // erase the old text starting from its first character
+        clear_current_line(widths);
+
+        line = std::move(new_line);
+        widths.clear();
+        byte_pos = 0;
+        char_pos = 0;
+
+        size_t idx = 0;
+        while (idx < line.size()) { // print the new text one code point at a time, rebuilding widths
+            size_t advance = 0;
+            char32_t cp = decode_utf8(line, idx, advance);
+            int expected_width = estimateWidth(cp);
+            int real_width = put_codepoint(line.c_str() + idx, advance, expected_width);
+            if (real_width < 0) real_width = 0; // negative results are stored as zero width
+            widths.push_back(real_width);
+            idx += advance;
+            ++char_pos;
+            byte_pos = idx; // after the loop both indices point at the end of the new line
+        }
+    }
+
+    // Move the cursor back to the first character of the line and zero both position indices.
+    static void move_to_line_start(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths) {
+        int cells = 0; // total display width of everything left of the cursor
+        while (char_pos > 0) {
+            cells += widths[--char_pos];
+        }
+        move_cursor(-cells);
+        byte_pos = 0;
+    }
+
+    // Move the cursor forward past the last character and point both indices at the end of the line.
+    static void move_to_line_end(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths, const std::string & line) {
+        int cells = 0; // total display width from the cursor to the end of the line
+        for (size_t idx = char_pos; idx < widths.size(); ++idx) cells += widths[idx];
+        move_cursor(cells);
+
+        byte_pos = line.length();
+        char_pos = widths.size();
+    }
+
+    // Return true when any ';'-separated field of a CSI parameter string is exactly 5,
+    // the xterm modifier code for Ctrl (e.g. the "1;5" in "ESC [ 1 ; 5 D").
+    // Empty fields and fields containing non-digit characters are ignored.
+    static bool has_ctrl_modifier(const std::string & params) {
+        size_t start = 0;
+        while (start < params.size()) {
+            size_t end = params.find(';', start);
+            size_t len = (end == std::string::npos) ? params.size() - start : end - start;
+
+            int value = len > 0 ? 0 : -1;
+            for (size_t i = 0; i < len && value >= 0; ++i) {
+                const unsigned char ch = static_cast<unsigned char>(params[start + i]);
+                // stop on non-digits and on values that can no longer equal 5; the cap keeps
+                // `value * 10` from overflowing on arbitrarily long digit runs
+                value = (std::isdigit(ch) && value <= 5) ? value * 10 + (ch - '0') : -1;
+            }
+            if (value == 5) {
+                return true;
+            }
+
+            if (end == std::string::npos) {
+                break;
+            }
+            start = end + 1;
+        }
+        return false;
+    }
+
+    static bool is_space_codepoint(char32_t cp) { // wint_t can be 16-bit (Windows): a value that does not round-trip is never whitespace
+        return static_cast<char32_t>(static_cast<wint_t>(cp)) == cp && std::iswspace(static_cast<wint_t>(cp)) != 0;
+    }
+
+    static void move_word_left(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths, const std::string & line) {
+        if (char_pos == 0) { // already at the start of the line
+            return;
+        }
+
+        size_t new_char_pos = char_pos;
+        size_t new_byte_pos = byte_pos;
+        int move_width = 0; // display cells to move back, accumulated over both passes
+
+        while (new_char_pos > 0) { // pass 1: skip whitespace immediately left of the cursor
+            size_t prev_byte = prev_utf8_char_pos(line, new_byte_pos);
+            size_t advance = 0;
+            char32_t cp = decode_utf8(line, prev_byte, advance);
+            if (!is_space_codepoint(cp)) {
+                break;
+            }
+            move_width += widths[new_char_pos - 1];
+            new_char_pos--;
+            new_byte_pos = prev_byte;
+        }
+
+        while (new_char_pos > 0) { // pass 2: skip the word itself, stopping after the whitespace that precedes it
+            size_t prev_byte = prev_utf8_char_pos(line, new_byte_pos);
+            size_t advance = 0;
+            char32_t cp = decode_utf8(line, prev_byte, advance);
+            if (is_space_codepoint(cp)) {
+                break;
+            }
+            move_width += widths[new_char_pos - 1];
+            new_char_pos--;
+            new_byte_pos = prev_byte;
+        }
+
+        move_cursor(-move_width); // one terminal move for the whole jump, then commit the new indices
+        char_pos = new_char_pos;
+        byte_pos = new_byte_pos;
+    }
+
+    static void move_word_right(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths, const std::string & line) {
+        if (char_pos >= widths.size()) { // already at the end of the line
+            return;
+        }
+
+        size_t new_char_pos = char_pos;
+        size_t new_byte_pos = byte_pos;
+        int move_width = 0; // display cells to move forward, accumulated over all three passes
+
+        while (new_char_pos < widths.size()) { // pass 1: skip whitespace at the cursor
+            size_t advance = 0;
+            char32_t cp = decode_utf8(line, new_byte_pos, advance);
+            if (!is_space_codepoint(cp)) {
+                break;
+            }
+            move_width += widths[new_char_pos];
+            new_char_pos++;
+            new_byte_pos += advance;
+        }
+
+        while (new_char_pos < widths.size()) { // pass 2: skip the non-whitespace run (the word)
+            size_t advance = 0;
+            char32_t cp = decode_utf8(line, new_byte_pos, advance);
+            if (is_space_codepoint(cp)) {
+                break;
+            }
+            move_width += widths[new_char_pos];
+            new_char_pos++;
+            new_byte_pos += advance;
+        }
+
+        while (new_char_pos < widths.size()) { // pass 3: skip the whitespace after that word, landing on the next word
+            size_t advance = 0;
+            char32_t cp = decode_utf8(line, new_byte_pos, advance);
+            if (!is_space_codepoint(cp)) {
+                break;
+            }
+            move_width += widths[new_char_pos];
+            new_char_pos++;
+            new_byte_pos += advance;
+        }
+
+        move_cursor(move_width); // one terminal move for the whole jump, then commit the new indices
+        char_pos = new_char_pos;
+        byte_pos = new_byte_pos;
+    }
+
+    static void move_cursor(int delta) { // shift the terminal cursor by `delta` cells; negative moves left
+        if (delta == 0) return;
+#if defined(_WIN32)
+        if (hConsole != NULL) {
+            CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
+            if (!GetConsoleScreenBufferInfo(hConsole, &bufferInfo) || bufferInfo.dwSize.X <= 0) return; // bufferInfo unusable; width 0 would never terminate below
+            COORD newCursorPosition = bufferInfo.dwCursorPosition;
+            int width = bufferInfo.dwSize.X;
+            int newX = newCursorPosition.X + delta;
+            int newY = newCursorPosition.Y;
+
+            while (newX >= width) { // wrap forward onto following rows
+                newX -= width;
+                newY++;
+            }
+            while (newX < 0) { // wrap backward onto preceding rows
+                newX += width;
+                newY--;
+            }
+
+            newCursorPosition.X = newX;
+            newCursorPosition.Y = newY;
+            SetConsoleCursorPosition(hConsole, newCursorPosition);
+        }
+#else
+        if (delta < 0) {
+            for (int i = 0; i < -delta; i++) fprintf(out, "\b");
+        } else {
+            for (int i = 0; i < delta; i++) fprintf(out, "\033[C");
+        }
+#endif
+    }
+
+    struct history_t {
+        std::vector<std::string> entries;
+        size_t viewing_idx = SIZE_MAX; // SIZE_MAX means history is not being browsed
+        std::string backup_line; // current line before viewing history
+        void add(const std::string & line) {
+            // always leave viewing mode first, so an empty (unstored) line cannot leave stale state behind
+            end_viewing();
+            if (line.empty()) {
+                return;
+            }
+            // avoid duplicates with the last entry
+            if (entries.empty() || entries.back() != line) {
+                entries.push_back(line);
+            }
+        }
+        bool prev(std::string & cur_line) { // step to the older entry; false when there is nothing to show
+            if (entries.empty()) {
+                return false;
+            }
+            if (viewing_idx == SIZE_MAX) {
+                return false;
+            }
+            if (viewing_idx > 0) { // stays on the oldest entry once it is reached
+                viewing_idx--;
+            }
+            cur_line = entries[viewing_idx];
+            return true;
+        }
+        bool next(std::string & cur_line) { // step to the newer entry; past the newest, restore the backup
+            if (entries.empty() || viewing_idx == SIZE_MAX) {
+                return false;
+            }
+            viewing_idx++;
+            if (viewing_idx >= entries.size()) {
+                cur_line = backup_line;
+                end_viewing();
+            } else {
+                cur_line = entries[viewing_idx];
+            }
+            return true;
+        }
+        void begin_viewing(const std::string & line) { // remember the in-progress line; index starts one past the newest
+            backup_line = line;
+            viewing_idx = entries.size();
+        }
+        void end_viewing() {
+            viewing_idx = SIZE_MAX;
+            backup_line.clear();
+        }
+        bool is_viewing() const {
+            return viewing_idx != SIZE_MAX;
+        }
+    } history;
+
    static bool readline_advanced(std::string & line, bool multiline_input) {
        if (out != stdout) {
            fflush(stdout);
@@ -362,8 +750,33 @@ namespace console {
        bool is_special_char = false;
        bool end_of_stream = false;

+        size_t byte_pos = 0; // current byte index
+        size_t char_pos = 0; // current character index (one char can be multiple bytes)
+
        char32_t input_char;
        while (true) {
+            assert(char_pos <= byte_pos);
+            assert(char_pos <= widths.size());
+            auto history_prev = [&]() {
+                if (!history.is_viewing()) {
+                    history.begin_viewing(line);
+                }
+                std::string new_line;
+                if (!history.prev(new_line)) {
+                    return;
+                }
+                set_line_contents(new_line, line, widths, char_pos, byte_pos);
+            };
+            auto history_next = [&]() {
+                if (history.is_viewing()) {
+                    std::string new_line;
+                    if (!history.next(new_line)) {
+                        return;
+                    }
+                    set_line_contents(new_line, line, widths, char_pos, byte_pos);
+                }
+            };
+
            fflush(out); // Ensure all output is displayed before waiting for input
            input_char = getchar32();

@@ -371,20 +784,83 @@ namespace console {
                break;
            }

-            if (input_char == (char32_t) WEOF || input_char == 0x04 /* Ctrl+D*/) {
+            if (input_char == (char32_t) WEOF || input_char == 0x04 /* Ctrl+D */) {
                end_of_stream = true;
                break;
            }

            if (is_special_char) {
-                set_display(user_input);
                replace_last(line.back());
                is_special_char = false;
            }

            if (input_char == '\033') { // Escape sequence
                char32_t code = getchar32();
-                if (code == '[' || code == 0x1B) {
+                if (code == '[') {
+                    std::string params;
+                    while (true) {
+                        code = getchar32();
+                        if ((code >= 'A' && code <= 'Z') || (code >= 'a' && code <= 'z') || code == '~' || code == (char32_t) WEOF) {
+                            break;
+                        }
+                        params.push_back(static_cast<char>(code));
+                    }
+
+                    const bool ctrl_modifier = has_ctrl_modifier(params);
+
+                    if (code == 'D') { // left
+                        if (ctrl_modifier) {
+                            move_word_left(char_pos, byte_pos, widths, line);
+                        } else if (char_pos > 0) {
+                            int w = widths[char_pos - 1];
+                            move_cursor(-w);
+                            char_pos--;
+                            byte_pos = prev_utf8_char_pos(line, byte_pos);
+                        }
+                    } else if (code == 'C') { // right
+                        if (ctrl_modifier) {
+                            move_word_right(char_pos, byte_pos, widths, line);
+                        } else if (char_pos < widths.size()) {
+                            int w = widths[char_pos];
+                            move_cursor(w);
+                            char_pos++;
+                            byte_pos = next_utf8_char_pos(line, byte_pos);
+                        }
+                    } else if (code == 'H') { // home
+                        move_to_line_start(char_pos, byte_pos, widths);
+                    } else if (code == 'F') { // end
+                        move_to_line_end(char_pos, byte_pos, widths, line);
+                    } else if (code == 'A' || code == 'B') {
+                        // up/down
+                        if (code == 'A') {
+                            history_prev();
+                            is_special_char = false;
+                        } else if (code == 'B') {
+                            history_next();
+                            is_special_char = false;
+                        }
+                    } else if ((code == '~' || (code >= 'A' && code <= 'Z') || (code >= 'a' && code <= 'z')) && !params.empty()) {
+                        std::string digits;
+                        for (char ch : params) {
+                            if (ch == ';') {
+                                break;
+                            }
+                            if (std::isdigit(static_cast<unsigned char>(ch))) {
+                                digits.push_back(ch);
+                            }
+                        }
+
+                        if (code == '~') {
+                            if (digits == "1" || digits == "7") { // home
+                                move_to_line_start(char_pos, byte_pos, widths);
+                            } else if (digits == "4" || digits == "8") { // end
+                                move_to_line_end(char_pos, byte_pos, widths, line);
+                            } else if (digits == "3") { // delete
+                                delete_at_cursor(line, widths, char_pos, byte_pos);
+                            }
+                        }
+                    }
+                } else if (code == 0x1B) {
                    // Discard the rest of the escape sequence
                    while ((code = getchar32()) != (char32_t) WEOF) {
                        if ((code >= 'A' && code <= 'Z') || (code >= 'a' && code <= 'z') || code == '~') {
@@ -392,32 +868,110 @@ namespace console {
                        }
                    }
                }
+#if defined(_WIN32)
+            } else if (input_char == KEY_ARROW_LEFT) {
+                if (char_pos > 0) {
+                    int w = widths[char_pos - 1];
+                    move_cursor(-w);
+                    char_pos--;
+                    byte_pos = prev_utf8_char_pos(line, byte_pos);
+                }
+            } else if (input_char == KEY_ARROW_RIGHT) {
+                if (char_pos < widths.size()) {
+                    int w = widths[char_pos];
+                    move_cursor(w);
+                    char_pos++;
+                    byte_pos = next_utf8_char_pos(line, byte_pos);
+                }
+            } else if (input_char == KEY_CTRL_ARROW_LEFT) {
+                move_word_left(char_pos, byte_pos, widths, line);
+            } else if (input_char == KEY_CTRL_ARROW_RIGHT) {
+                move_word_right(char_pos, byte_pos, widths, line);
+            } else if (input_char == KEY_HOME) {
+                move_to_line_start(char_pos, byte_pos, widths);
+            } else if (input_char == KEY_END) {
+                move_to_line_end(char_pos, byte_pos, widths, line);
+            } else if (input_char == KEY_DELETE) {
+                delete_at_cursor(line, widths, char_pos, byte_pos);
+            } else if (input_char == KEY_ARROW_UP || input_char == KEY_ARROW_DOWN) {
+                if (input_char == KEY_ARROW_UP) {
+                    history_prev();
+                    is_special_char = false;
+                } else if (input_char == KEY_ARROW_DOWN) {
+                    history_next();
+                    is_special_char = false;
+                }
+#endif
            } else if (input_char == 0x08 || input_char == 0x7F) { // Backspace
-                if (!widths.empty()) {
-                    int count;
-                    do {
-                        count = widths.back();
-                        widths.pop_back();
-                        // Move cursor back, print space, and move cursor back again
-                        for (int i = 0; i < count; i++) {
-                            replace_last(' ');
-                            pop_cursor();
-                        }
-                        pop_back_utf8_char(line);
-                    } while (count == 0 && !widths.empty());
+                if (char_pos > 0) {
+                    int w = widths[char_pos - 1];
+                    move_cursor(-w);
+                    char_pos--;
+                    size_t prev_pos = prev_utf8_char_pos(line, byte_pos);
+                    size_t char_len = byte_pos - prev_pos;
+                    byte_pos = prev_pos;
+
+                    // remove the character
+                    line.erase(byte_pos, char_len);
+                    widths.erase(widths.begin() + char_pos);
+
+                    // redraw tail
+                    size_t p = byte_pos;
+                    int tail_width = 0;
+                    for (size_t i = char_pos; i < widths.size(); ++i) {
+                        size_t next_p = next_utf8_char_pos(line, p);
+                        put_codepoint(line.c_str() + p, next_p - p, widths[i]);
+                        tail_width += widths[i];
+                        p = next_p;
+                    }
+
+                    // clear display
+                    for (int i = 0; i < w; ++i) {
+                        fputc(' ', out);
+                    }
+                    move_cursor(-(tail_width + w));
                }
            } else {
-                int offset = line.length();
-                append_utf8(input_char, line);
-                int width = put_codepoint(line.c_str() + offset, line.length() - offset, estimateWidth(input_char));
-                if (width < 0) {
-                    width = 0;
+                // insert character
+                std::string new_char_str;
+                append_utf8(input_char, new_char_str);
+                int w = estimateWidth(input_char);
+
+                if (char_pos == widths.size()) {
+                    // insert at the end
+                    line += new_char_str;
+                    int real_w = put_codepoint(new_char_str.c_str(), new_char_str.length(), w);
+                    if (real_w < 0) real_w = 0;
+                    widths.push_back(real_w);
+                    byte_pos += new_char_str.length();
+                    char_pos++;
+                } else {
+                    // insert in middle
+                    line.insert(byte_pos, new_char_str);
+
+                    int real_w = put_codepoint(new_char_str.c_str(), new_char_str.length(), w);
+                    if (real_w < 0) real_w = 0;
+
+                    widths.insert(widths.begin() + char_pos, real_w);
+
+                    // print the tail
+                    size_t p = byte_pos + new_char_str.length();
+                    int tail_width = 0;
+                    for (size_t i = char_pos + 1; i < widths.size(); ++i) {
+                        size_t next_p = next_utf8_char_pos(line, p);
+                        put_codepoint(line.c_str() + p, next_p - p, widths[i]);
+                        tail_width += widths[i];
+                        p = next_p;
+                    }
+
+                    move_cursor(-tail_width);
+
+                    byte_pos += new_char_str.length();
+                    char_pos++;
                }
-                widths.push_back(width);
            }

            if (!line.empty() && (line.back() == '\\' || line.back() == '/')) {
-                set_display(prompt);
                replace_last(line.back());
                is_special_char = true;
            }
@@ -451,6 +1005,15 @@ namespace console {
            }
        }

+        if (!end_of_stream && !line.empty()) {
+            // remove the trailing newline for history storage
+            if (!line.empty() && line.back() == '\n') {
+                line.pop_back();
+            }
+            // TODO: maybe support multiline history entries?
+            history.add(line);
+        }
+
        fflush(out);
        return has_more;
    }
@@ -493,12 +1056,82 @@ namespace console {
    }

    bool readline(std::string & line, bool multiline_input) {
-        set_display(user_input);
-
        if (simple_io) {
            return readline_simple(line, multiline_input);
        }
        return readline_advanced(line, multiline_input);
    }

+    namespace spinner {
+        static const char LOADING_CHARS[] = {'|', '/', '-', '\\'};
+        static std::condition_variable cv_stop;
+        static std::thread th;
+        static size_t frame = 0; // index of the frame currently shown; accessed with mtx held
+        static bool running = false;
+        static std::mutex mtx;
+        static auto wait_time = std::chrono::milliseconds(100);
+        static void draw_next_frame() {
+            // called by the worker thread while it holds mtx
+            frame = (frame + 1) % sizeof(LOADING_CHARS);
+            replace_last(LOADING_CHARS[frame]);
+            fflush(out);
+        }
+        void start() {
+            std::unique_lock<std::mutex> lock(mtx);
+            if (simple_io || running) {
+                return;
+            }
+            common_log_flush(common_log_main());
+            fprintf(out, "%c", LOADING_CHARS[0]);
+            fflush(out);
+            frame = 0; // frame 0 is on screen; draw_next_frame() pre-increments, so the next one drawn is frame 1
+            running = true;
+            th = std::thread([]() {
+                std::unique_lock<std::mutex> lock(mtx);
+                while (true) {
+                    if (cv_stop.wait_for(lock, wait_time, []{ return !running; })) {
+                        break;
+                    }
+                    draw_next_frame();
+                }
+            });
+        }
+        void stop() {
+            {
+                std::unique_lock<std::mutex> lock(mtx);
+                if (simple_io || !running) {
+                    return;
+                }
+                running = false;
+                cv_stop.notify_all();
+            }
+            if (th.joinable()) { // join outside the lock so the worker can wake up and exit
+                th.join();
+            }
+            replace_last(' ');
+            pop_cursor();
+            fflush(out);
+        }
+    }
+
+    void log(const char * fmt, ...) { // printf-style write to the console stream `out`
+        va_list args;
+        va_start(args, fmt);
+        vfprintf(out, fmt, args);
+        va_end(args);
+    }
+
+    void error(const char * fmt, ...) { // printf-style write to `out` using the error display type
+        va_list args;
+        va_start(args, fmt);
+        display_type cur = current_display; // remembered so the caller's display type can be put back
+        set_display(DISPLAY_TYPE_ERROR);
+        vfprintf(out, fmt, args);
+        set_display(cur); // restore previous color
+        va_end(args);
+    }
+
+    void flush() { // flush pending output on the console stream `out`
+        fflush(out);
+    }
 }
--- a/common/console.h
+++ b/common/console.h
@@ -2,18 +2,40 @@

 #pragma once

+#include "common.h"
+
 #include <string>

-namespace console {
-    enum display_t {
-        reset = 0,
-        prompt,
-        user_input,
-        error
-    };
+enum display_type {
+    DISPLAY_TYPE_RESET = 0,  // ANSI reset
+    DISPLAY_TYPE_INFO,       // magenta
+    DISPLAY_TYPE_PROMPT,     // yellow
+    DISPLAY_TYPE_REASONING,  // gray
+    DISPLAY_TYPE_USER_INPUT, // bold green
+    DISPLAY_TYPE_ERROR       // bold red
+};

+namespace console {
    void init(bool use_simple_io, bool use_advanced_display);
    void cleanup();
-    void set_display(display_t display);
+    void set_display(display_type display);
    bool readline(std::string & line, bool multiline_input);
+
+    namespace spinner {
+        void start();
+        void stop();
+    }
+
+    // note: the logging API below outputs directly to stdout
+    // it can negatively impact performance if used on the inference thread
+    // only use it in a dedicated CLI thread
+    // for logging in the inference thread, use log.h instead
+
+    LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
+    void log(const char * fmt, ...);
+
+    LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
+    void error(const char * fmt, ...);
+
+    void flush();
 }
--- a/common/download.cpp
+++ b/common/download.cpp
@@ -12,6 +12,8 @@
 #include <filesystem>
 #include <fstream>
 #include <future>
+#include <map>
+#include <mutex>
 #include <regex>
 #include <string>
 #include <thread>
@@ -24,6 +26,7 @@
 #include "http.h"
 #endif

+#ifndef __EMSCRIPTEN__
 #ifdef __linux__
 #include <linux/limits.h>
 #elif defined(_WIN32)
@@ -35,6 +38,8 @@
 #else
 #include <sys/syslimits.h>
 #endif
+#endif
+
 #define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083

 // isatty
@@ -430,7 +435,7 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
    curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
    curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
-    curl_easy_setopt(curl.get(), CURLOPT_VERBOSE, 1L);
+    curl_easy_setopt(curl.get(), CURLOPT_VERBOSE, 0L);
    typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
    auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
        auto data_vec = static_cast<std::vector<char> *>(data);
@@ -469,36 +474,79 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &

 #elif defined(LLAMA_USE_HTTPLIB)

-static bool is_output_a_tty() {
+class ProgressBar {
+    static inline std::mutex mutex;
+    static inline std::map<const ProgressBar *, int> lines;
+    static inline int max_line = 0;
+
+    static void cleanup(const ProgressBar * line) {
+        lines.erase(line);
+        if (lines.empty()) {
+            max_line = 0;
+        }
+    }
+
+    static bool is_output_a_tty() {
 #if defined(_WIN32)
-    return _isatty(_fileno(stdout));
+        return _isatty(_fileno(stdout));
 #else
-    return isatty(1);
+        return isatty(1);
 #endif
-}
-
-static void print_progress(size_t current, size_t total) {
-    if (!is_output_a_tty()) {
-        return;
    }

-    if (!total) {
-        return;
+public:
+    ProgressBar() = default;
+
+    ~ProgressBar() {
+        std::lock_guard<std::mutex> lock(mutex);
+        cleanup(this);
    }

-    size_t width = 50;
-    size_t pct = (100 * current) / total;
-    size_t pos = (width * current) / total;
+    void update(size_t current, size_t total) {
+        if (!is_output_a_tty()) {
+            return;
+        }

-    std::cout << "["
-              << std::string(pos, '=')
-              << (pos < width ? ">" : "")
-              << std::string(width - pos, ' ')
-              << "] " << std::setw(3) << pct << "%  ("
-              << current / (1024 * 1024) << " MB / "
-              << total / (1024 * 1024) << " MB)\r";
-    std::cout.flush();
-}
+        if (!total) {
+            return;
+        }
+
+        std::lock_guard<std::mutex> lock(mutex);
+
+        if (lines.find(this) == lines.end()) {
+            lines[this] = max_line++;
+            std::cout << "\n";
+        }
+        int lines_up = max_line - lines[this];
+
+        size_t width = 50;
+        size_t pct = (100 * current) / total;
+        size_t pos = (width * current) / total;
+
+        std::cout << "\033[s";
+
+        if (lines_up > 0) {
+            std::cout << "\033[" << lines_up << "A";
+        }
+        std::cout << "\033[2K\r["
+            << std::string(pos, '=')
+            << (pos < width ? ">" : "")
+            << std::string(width - pos, ' ')
+            << "] " << std::setw(3) << pct << "%  ("
+            << current / (1024 * 1024) << " MB / "
+            << total / (1024 * 1024) << " MB) "
+            << "\033[u";
+
+        std::cout.flush();
+
+        if (current == total) {
+             cleanup(this);
+        }
+    }
+
+    ProgressBar(const ProgressBar &) = delete;
+    ProgressBar & operator=(const ProgressBar &) = delete;
+};

 static bool common_pull_file(httplib::Client & cli,
                             const std::string & resolve_path,
@@ -517,16 +565,19 @@ static bool common_pull_file(httplib::Client & cli,
        headers.emplace("Range", "bytes=" + std::to_string(existing_size) + "-");
    }

-    std::atomic<size_t> downloaded{existing_size};
+    const char * func = __func__; // avoid __func__ inside a lambda
+    size_t downloaded = existing_size;
+    size_t progress_step = 0;
+    ProgressBar bar;

    auto res = cli.Get(resolve_path, headers,
        [&](const httplib::Response &response) {
            if (existing_size > 0 && response.status != 206) {
-                LOG_WRN("%s: server did not respond with 206 Partial Content for a resume request. Status: %d\n", __func__, response.status);
+                LOG_WRN("%s: server did not respond with 206 Partial Content for a resume request. Status: %d\n", func, response.status);
                return false;
            }
            if (existing_size == 0 && response.status != 200) {
-                LOG_WRN("%s: download received non-successful status code: %d\n", __func__, response.status);
+                LOG_WRN("%s: download received non-successful status code: %d\n", func, response.status);
                return false;
            }
            if (total_size == 0 && response.has_header("Content-Length")) {
@@ -534,7 +585,7 @@ static bool common_pull_file(httplib::Client & cli,
                    size_t content_length = std::stoull(response.get_header_value("Content-Length"));
                    total_size = existing_size + content_length;
                } catch (const std::exception &e) {
-                    LOG_WRN("%s: invalid Content-Length header: %s\n", __func__, e.what());
+                    LOG_WRN("%s: invalid Content-Length header: %s\n", func, e.what());
                }
            }
            return true;
@@ -542,18 +593,21 @@ static bool common_pull_file(httplib::Client & cli,
        [&](const char *data, size_t len) {
            ofs.write(data, len);
            if (!ofs) {
-                LOG_ERR("%s: error writing to file: %s\n", __func__, path_tmp.c_str());
+                LOG_ERR("%s: error writing to file: %s\n", func, path_tmp.c_str());
                return false;
            }
            downloaded += len;
-            print_progress(downloaded, total_size);
+            progress_step += len;
+
+            if (progress_step >= total_size / 1000 || downloaded == total_size) {
+                bar.update(downloaded, total_size);
+                progress_step = 0;
+            }
            return true;
        },
        nullptr
    );

-    std::cout << "\n";
-
    if (!res) {
        LOG_ERR("%s: error during download. Status: %d\n", __func__, res ? res->status : -1);
        return false;
@@ -1047,7 +1101,7 @@ std::string common_docker_resolve_model(const std::string &) {
 std::vector<common_cached_model_info> common_list_cached_models() {
    std::vector<common_cached_model_info> models;
    const std::string cache_dir = fs_get_cache_directory();
-    const std::vector<common_file_info> files = fs_list_files(cache_dir);
+    const std::vector<common_file_info> files = fs_list(cache_dir, false);
    for (const auto & file : files) {
        if (string_starts_with(file.name, "manifest=") && string_ends_with(file.name, ".json")) {
            common_cached_model_info model_info;
--- a/common/download.h
+++ b/common/download.h
@@ -14,8 +14,10 @@ struct common_cached_model_info {
    std::string model;
    std::string tag;
    size_t      size = 0; // GGUF size in bytes
+    // return string representation like "user/model:tag"
+    // if tag is "latest", it will be omitted
    std::string to_string() const {
-        return user + "/" + model + ":" + tag;
+        return user + "/" + model + (tag == "latest" ? "" : ":" + tag);
    }
 };

--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -268,10 +268,10 @@ static bool is_reserved_name(const std::string & name) {
 }

 std::regex INVALID_RULE_CHARS_RE("[^a-zA-Z0-9-]+");
-std::regex GRAMMAR_LITERAL_ESCAPE_RE("[\r\n\"]");
+std::regex GRAMMAR_LITERAL_ESCAPE_RE("[\r\n\"\\\\]");
 std::regex GRAMMAR_RANGE_LITERAL_ESCAPE_RE("[\r\n\"\\]\\-\\\\]");
 std::unordered_map<char, std::string> GRAMMAR_LITERAL_ESCAPES = {
-    {'\r', "\\r"}, {'\n', "\\n"}, {'"', "\\\""}, {'-', "\\-"}, {']', "\\]"}
+    {'\r', "\\r"}, {'\n', "\\n"}, {'"', "\\\""}, {'-', "\\-"}, {']', "\\]"}, {'\\', "\\\\"}
 };

 std::unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
@@ -305,8 +305,9 @@ static std::string format_literal(const std::string & literal) {

 std::string gbnf_format_literal(const std::string & literal) { return format_literal(literal); }

-class SchemaConverter {
+class common_schema_converter {
 private:
+    friend class common_schema_info;
    friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
    std::function<json(const std::string &)> _fetch_json;
    bool _dotall;
@@ -729,7 +730,7 @@ private:
    }

 public:
-    SchemaConverter(
+    common_schema_converter(
        const std::function<json(const std::string &)> & fetch_json,
        bool dotall)
          : _fetch_json(fetch_json), _dotall(dotall)
@@ -974,7 +975,7 @@ public:

    void check_errors() {
        if (!_errors.empty()) {
-            throw std::runtime_error("JSON schema conversion failed:\n" + string_join(_errors, "\n"));
+            throw std::invalid_argument("JSON schema conversion failed:\n" + string_join(_errors, "\n"));
        }
        if (!_warnings.empty()) {
            fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n", string_join(_warnings, "; ").c_str());
@@ -990,6 +991,134 @@ public:
    }
 };

+// common_schema_info implementation (pimpl)
+
+common_schema_info::common_schema_info()
+    : impl_(std::make_unique<common_schema_converter>(
+        [](const std::string &) { return json(); },
+        false)) {}
+
+common_schema_info::~common_schema_info() = default;
+
+common_schema_info::common_schema_info(common_schema_info &&) noexcept = default;
+common_schema_info & common_schema_info::operator=(common_schema_info &&) noexcept = default;
+
+void common_schema_info::resolve_refs(nlohmann::ordered_json & schema) {
+    impl_->resolve_refs(schema, "");
+}
+
+// Determines if a JSON schema can resolve to a string type through any path.
+// Some models emit raw string values rather than JSON-encoded strings for string parameters.
+// If any branch of the schema (via oneOf, anyOf, $ref, etc.) permits a string, this returns
+// true, allowing callers to handle the value as a raw string for simplicity.
+bool common_schema_info::resolves_to_string(const nlohmann::ordered_json & schema) {
+    std::unordered_set<std::string> visited_refs;
+
+    std::function<bool(const json &)> check = [&](const json & s) -> bool {
+        if (!s.is_object()) {
+            return false;
+        }
+
+        // Handle $ref
+        if (s.contains("$ref")) {
+            const std::string & ref = s["$ref"];
+            if (visited_refs.find(ref) != visited_refs.end()) {
+                // Circular reference, assume not a string to be safe
+                return false;
+            }
+            visited_refs.insert(ref);
+            auto it = impl_->_refs.find(ref);
+            if (it != impl_->_refs.end()) {
+                return check(it->second);
+            }
+            return false;
+        }
+
+        // Check type field
+        if (s.contains("type")) {
+            const json & schema_type = s["type"];
+            if (schema_type.is_string()) {
+                if (schema_type == "string") {
+                    return true;
+                }
+            } else if (schema_type.is_array()) {
+                // Type can be an array like ["string", "null"]
+                for (const auto & t : schema_type) {
+                    if (t == "string") {
+                        return true;
+                    }
+                }
+            }
+        }
+
+        // Check oneOf/anyOf - if any alternative can be a string
+        if (s.contains("oneOf")) {
+            for (const auto & alt : s["oneOf"]) {
+                if (check(alt)) {
+                    return true;
+                }
+            }
+        }
+        if (s.contains("anyOf")) {
+            for (const auto & alt : s["anyOf"]) {
+                if (check(alt)) {
+                    return true;
+                }
+            }
+        }
+
+        // Check allOf - all components must be compatible with string type
+        if (s.contains("allOf")) {
+            bool all_string = true;
+            for (const auto & component : s["allOf"]) {
+                if (!check(component)) {
+                    all_string = false;
+                    break;
+                }
+            }
+            if (all_string) {
+                return true;
+            }
+        }
+
+        // Check const - if the constant value is a string
+        if (s.contains("const")) {
+            if (s["const"].is_string()) {
+                return true;
+            }
+        }
+
+        // Check enum - if any enum value is a string
+        if (s.contains("enum")) {
+            for (const auto & val : s["enum"]) {
+                if (val.is_string()) {
+                    return true;
+                }
+            }
+        }
+
+        // String-specific keywords imply string type
+        if (s.contains("pattern") || s.contains("minLength") || s.contains("maxLength")) {
+            return true;
+        }
+
+        // Check format - many formats imply string
+        if (s.contains("format")) {
+            const std::string & fmt = s["format"];
+            if (fmt == "date" || fmt == "time" || fmt == "date-time" ||
+                fmt == "uri" || fmt == "email" || fmt == "hostname" ||
+                fmt == "ipv4" || fmt == "ipv6" || fmt == "uuid" ||
+                fmt.find("uuid") == 0) {
+                return true;
+            }
+        }
+
+        return false;
+    };
+
+    return check(schema);
+}
+
 std::string json_schema_to_grammar(const json & schema, bool force_gbnf) {
 #ifdef LLAMA_USE_LLGUIDANCE
    if (!force_gbnf) {
@@ -1006,7 +1135,7 @@ std::string json_schema_to_grammar(const json & schema, bool force_gbnf) {
 }

 std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options) {
-    SchemaConverter converter([&](const std::string &) { return json(); }, options.dotall);
+    common_schema_converter converter([&](const std::string &) { return json(); }, options.dotall);
    common_grammar_builder builder {
        /* .add_rule = */ [&](const std::string & name, const std::string & rule) {
            return converter._add_rule(name, rule);
--- a/common/json-schema-to-grammar.h
+++ b/common/json-schema-to-grammar.h
@@ -3,11 +3,31 @@
 #include <nlohmann/json_fwd.hpp>

 #include <functional>
+#include <memory>
 #include <string>

 std::string json_schema_to_grammar(const nlohmann::ordered_json & schema,
                                   bool force_gbnf = false);

+class common_schema_converter;
+
+// Probes a JSON schema to extract information about its structure and type constraints.
+class common_schema_info {
+    std::unique_ptr<common_schema_converter> impl_;
+
+  public:
+    common_schema_info();
+    ~common_schema_info();
+
+    common_schema_info(const common_schema_info &) = delete;
+    common_schema_info & operator=(const common_schema_info &) = delete;
+    common_schema_info(common_schema_info &&) noexcept;
+    common_schema_info & operator=(common_schema_info &&) noexcept;
+
+    void resolve_refs(nlohmann::ordered_json & schema);
+    bool resolves_to_string(const nlohmann::ordered_json & schema);
+};
+
 struct common_grammar_builder {
    std::function<std::string(const std::string &, const std::string &)> add_rule;
    std::function<std::string(const std::string &, const nlohmann::ordered_json &)> add_schema;
--- a/common/log.cpp
+++ b/common/log.cpp
@@ -1,3 +1,4 @@
+#include "common.h"
 #include "log.h"

 #include <chrono>
@@ -26,30 +27,6 @@ void common_log_set_verbosity_thold(int verbosity) {
    common_log_verbosity_thold = verbosity;
 }

-// Auto-detect if colors should be enabled based on terminal and environment
-static bool common_log_should_use_colors_auto() {
-    // Check NO_COLOR environment variable (https://no-color.org/)
-    if (const char * no_color = std::getenv("NO_COLOR")) {
-        if (no_color[0] != '\0') {
-            return false;
-        }
-    }
-
-    // Check TERM environment variable
-    if (const char * term = std::getenv("TERM")) {
-        if (std::strcmp(term, "dumb") == 0) {
-            return false;
-        }
-    }
-
-    // Check if stdout and stderr are connected to a terminal
-    // We check both because log messages can go to either
-    bool stdout_is_tty = isatty(fileno(stdout));
-    bool stderr_is_tty = isatty(fileno(stderr));
-
-    return stdout_is_tty || stderr_is_tty;
-}
-
 static int64_t t_us() {
    return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
 }
@@ -391,7 +368,7 @@ struct common_log * common_log_main() {
    static std::once_flag    init_flag;
    std::call_once(init_flag, [&]() {
        // Set default to auto-detect colors
-        log.set_colors(common_log_should_use_colors_auto());
+        log.set_colors(tty_can_use_colors());
    });

    return &log;
@@ -422,7 +399,7 @@ void common_log_set_file(struct common_log * log, const char * file) {

 void common_log_set_colors(struct common_log * log, log_colors colors) {
    if (colors == LOG_COLORS_AUTO) {
-        log->set_colors(common_log_should_use_colors_auto());
+        log->set_colors(tty_can_use_colors());
        return;
    }

@@ -443,8 +420,27 @@ void common_log_set_timestamps(struct common_log * log, bool timestamps) {
    log->set_timestamps(timestamps);
 }

+void common_log_flush(struct common_log * log) {
+    log->pause();
+    log->resume();
+}
+
+static int common_get_verbosity(enum ggml_log_level level) {
+    switch (level) {
+        case GGML_LOG_LEVEL_DEBUG: return LOG_LEVEL_DEBUG;
+        case GGML_LOG_LEVEL_INFO:  return LOG_LEVEL_INFO;
+        case GGML_LOG_LEVEL_WARN:  return LOG_LEVEL_WARN;
+        case GGML_LOG_LEVEL_ERROR: return LOG_LEVEL_ERROR;
+        case GGML_LOG_LEVEL_CONT:  return LOG_LEVEL_INFO; // same as INFO
+        case GGML_LOG_LEVEL_NONE:
+        default:
+            return LOG_LEVEL_OUTPUT;
+    }
+}
+
 void common_log_default_callback(enum ggml_log_level level, const char * text, void * /*user_data*/) {
-    if (LOG_DEFAULT_LLAMA <= common_log_verbosity_thold) {
+    auto verbosity = common_get_verbosity(level);
+    if (verbosity <= common_log_verbosity_thold) {
        common_log_add(common_log_main(), level, "%s", text);
    }
 }
--- a/common/log.h
+++ b/common/log.h
@@ -21,8 +21,14 @@
 #    define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
 #endif

-#define LOG_DEFAULT_DEBUG 1
-#define LOG_DEFAULT_LLAMA 0
+#define LOG_LEVEL_DEBUG  4
+#define LOG_LEVEL_INFO   3
+#define LOG_LEVEL_WARN   2
+#define LOG_LEVEL_ERROR  1
+#define LOG_LEVEL_OUTPUT 0 // output data from tools
+
+#define LOG_DEFAULT_DEBUG LOG_LEVEL_DEBUG
+#define LOG_DEFAULT_LLAMA LOG_LEVEL_INFO

 enum log_colors {
    LOG_COLORS_AUTO     = -1,
@@ -67,16 +73,18 @@ void common_log_add(struct common_log * log, enum ggml_log_level level, const ch
 //   0.00.090.578 I llm_load_tensors: offloading 32 repeating layers to GPU
 //   0.00.090.579 I llm_load_tensors: offloading non-repeating layers to GPU
 //
-// I - info    (stdout, V = 0)
-// W - warning (stderr, V = 0)
-// E - error   (stderr, V = 0)
 // D - debug   (stderr, V = LOG_DEFAULT_DEBUG)
+// I - info    (stdout, V = LOG_LEVEL_INFO)
+// W - warning (stderr, V = LOG_LEVEL_WARN)
+// E - error   (stderr, V = LOG_LEVEL_ERROR)
+// O - output  (stdout, V = LOG_LEVEL_OUTPUT)
 //

 void common_log_set_file      (struct common_log * log, const char * file); // not thread-safe
 void common_log_set_colors    (struct common_log * log, log_colors colors); // not thread-safe
 void common_log_set_prefix    (struct common_log * log, bool prefix);       // whether to output prefix to each log
 void common_log_set_timestamps(struct common_log * log, bool timestamps);   // whether to output timestamps in the prefix
+void common_log_flush         (struct common_log * log);                    // flush all pending log messages

 // helper macros for logging
 // use these to avoid computing log arguments if the verbosity of the log is higher than the threshold
@@ -95,14 +103,14 @@ void common_log_set_timestamps(struct common_log * log, bool timestamps);   // w
        } \
    } while (0)

-#define LOG(...)             LOG_TMPL(GGML_LOG_LEVEL_NONE, 0,         __VA_ARGS__)
-#define LOGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_NONE, verbosity, __VA_ARGS__)
+#define LOG(...)             LOG_TMPL(GGML_LOG_LEVEL_NONE, LOG_LEVEL_OUTPUT, __VA_ARGS__)
+#define LOGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_NONE, verbosity,        __VA_ARGS__)

-#define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO,  0,                 __VA_ARGS__)
-#define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN,  0,                 __VA_ARGS__)
-#define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, 0,                 __VA_ARGS__)
-#define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_DEFAULT_DEBUG, __VA_ARGS__)
-#define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT,  0,                 __VA_ARGS__)
+#define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_LEVEL_DEBUG,  __VA_ARGS__)
+#define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO,  LOG_LEVEL_INFO,   __VA_ARGS__)
+#define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN,  LOG_LEVEL_WARN,   __VA_ARGS__)
+#define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, LOG_LEVEL_ERROR,  __VA_ARGS__)
+#define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT,  LOG_LEVEL_INFO,   __VA_ARGS__) // same as INFO

 #define LOG_INFV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_INFO,  verbosity, __VA_ARGS__)
 #define LOG_WRNV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_WARN,  verbosity, __VA_ARGS__)
--- a/common/peg-parser.cpp
+++ b/common/peg-parser.cpp
--- a/common/peg-parser.h
+++ b/common/peg-parser.h
@@ -0,0 +1,459 @@
+#pragma once
+
+#include <nlohmann/json_fwd.hpp>
+
+#include <memory>
+#include <unordered_map>
+#include <string>
+#include <string_view>
+#include <functional>
+#include <vector>
+#include <variant>
+
+struct common_grammar_builder;
+
+class common_peg_parser_builder;
+
+using common_peg_parser_id = size_t;
+constexpr common_peg_parser_id COMMON_PEG_INVALID_PARSER_ID = static_cast<common_peg_parser_id>(-1);
+
+using common_peg_ast_id = size_t;
+constexpr common_peg_ast_id COMMON_PEG_INVALID_AST_ID = static_cast<common_peg_ast_id>(-1);
+
+// Lightweight wrapper around common_peg_parser_id for convenience
+class common_peg_parser {
+    common_peg_parser_id id_;
+    common_peg_parser_builder & builder_;
+
+  public:
+    common_peg_parser(const common_peg_parser & other) : id_(other.id_), builder_(other.builder_) {}
+    common_peg_parser(common_peg_parser_id id, common_peg_parser_builder & builder) : id_(id), builder_(builder) {}
+
+    common_peg_parser & operator=(const common_peg_parser & other);
+    common_peg_parser & operator+=(const common_peg_parser & other);
+    common_peg_parser & operator|=(const common_peg_parser & other);
+
+    operator common_peg_parser_id() const { return id_; }
+    common_peg_parser_id id() const { return id_; }
+
+    common_peg_parser_builder & builder() const { return builder_; }
+
+    // Creates a sequence
+    common_peg_parser operator+(const common_peg_parser & other) const;
+
+    // Creates a sequence separated by spaces.
+    common_peg_parser operator<<(const common_peg_parser & other) const;
+
+    // Creates a choice
+    common_peg_parser operator|(const common_peg_parser & other) const;
+
+    common_peg_parser operator+(const char * str) const;
+    common_peg_parser operator+(const std::string & str) const;
+    common_peg_parser operator<<(const char * str) const;
+    common_peg_parser operator<<(const std::string & str) const;
+    common_peg_parser operator|(const char * str) const;
+    common_peg_parser operator|(const std::string & str) const;
+};
+
+common_peg_parser operator+(const char * str, const common_peg_parser & p);
+common_peg_parser operator+(const std::string & str, const common_peg_parser & p);
+common_peg_parser operator<<(const char * str, const common_peg_parser & p);
+common_peg_parser operator<<(const std::string & str, const common_peg_parser & p);
+common_peg_parser operator|(const char * str, const common_peg_parser & p);
+common_peg_parser operator|(const std::string & str, const common_peg_parser & p);
+
+enum common_peg_parse_result_type {
+    COMMON_PEG_PARSE_RESULT_FAIL            = 0,
+    COMMON_PEG_PARSE_RESULT_SUCCESS         = 1,
+    COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT = 2,
+};
+
+const char * common_peg_parse_result_type_name(common_peg_parse_result_type type);
+
+struct common_peg_ast_node {
+    common_peg_ast_id id;
+    std::string rule;
+    std::string tag;
+    size_t start;
+    size_t end;
+    std::string_view text;
+    std::vector<common_peg_ast_id> children;
+
+    bool is_partial = false;
+};
+
+struct common_peg_parse_result;
+
+using common_peg_ast_visitor = std::function<void(const common_peg_ast_node & node)>;
+
+class common_peg_ast_arena {
+    std::vector<common_peg_ast_node> nodes_;
+  public:
+    common_peg_ast_id add_node(
+        const std::string & rule,
+        const std::string & tag,
+        size_t start,
+        size_t end,
+        std::string_view text,
+        std::vector<common_peg_ast_id> children,
+        bool is_partial = false
+    ) {
+        common_peg_ast_id id = nodes_.size();
+        nodes_.push_back({id, rule, tag, start, end, text, std::move(children), is_partial});
+        return id;
+    }
+
+    const common_peg_ast_node & get(common_peg_ast_id id) const { return nodes_.at(id); }
+
+    size_t size() const { return nodes_.size(); }
+
+    void clear() { nodes_.clear(); }
+
+    void visit(common_peg_ast_id id, const common_peg_ast_visitor & visitor) const;
+    void visit(const common_peg_parse_result & result, const common_peg_ast_visitor & visitor) const;
+};
+
+struct common_peg_parse_result {
+    common_peg_parse_result_type type = COMMON_PEG_PARSE_RESULT_FAIL;
+    size_t start = 0;
+    size_t end = 0;
+
+    std::vector<common_peg_ast_id> nodes;
+
+    common_peg_parse_result() = default;
+
+    common_peg_parse_result(common_peg_parse_result_type type, size_t start)
+        : type(type), start(start), end(start) {}
+
+    common_peg_parse_result(common_peg_parse_result_type type, size_t start, size_t end)
+        : type(type), start(start), end(end) {}
+
+    common_peg_parse_result(common_peg_parse_result_type type, size_t start, size_t end, std::vector<common_peg_ast_id> nodes)
+        : type(type), start(start), end(end), nodes(std::move(nodes)) {}
+
+    bool fail() const { return type == COMMON_PEG_PARSE_RESULT_FAIL; }
+    bool need_more_input() const { return type == COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT; }
+    bool success() const { return type == COMMON_PEG_PARSE_RESULT_SUCCESS; }
+};
+
+struct common_peg_parse_context {
+    std::string input;
+    bool is_partial;
+    common_peg_ast_arena ast;
+
+    int parse_depth;
+
+    common_peg_parse_context()
+        : is_partial(false), parse_depth(0) {}
+
+    common_peg_parse_context(const std::string & input)
+        : input(input), is_partial(false), parse_depth(0) {}
+
+    common_peg_parse_context(const std::string & input, bool is_partial)
+        : input(input), is_partial(is_partial), parse_depth(0) {}
+};
+
+class common_peg_arena;
+
+// Parser variants
+struct common_peg_epsilon_parser {};
+
+struct common_peg_start_parser {};
+
+struct common_peg_end_parser {};
+
+struct common_peg_literal_parser {
+    std::string literal;
+};
+
+struct common_peg_sequence_parser {
+    std::vector<common_peg_parser_id> children;
+};
+
+struct common_peg_choice_parser {
+    std::vector<common_peg_parser_id> children;
+};
+
+struct common_peg_repetition_parser {
+    common_peg_parser_id child;
+    int min_count;
+    int max_count;  // -1 for unbounded
+};
+
+struct common_peg_and_parser {
+    common_peg_parser_id child;
+};
+
+struct common_peg_not_parser {
+    common_peg_parser_id child;
+};
+
+struct common_peg_any_parser {};
+
+struct common_peg_space_parser {};
+
+struct common_peg_chars_parser {
+    struct char_range {
+        uint32_t start;
+        uint32_t end;
+        bool contains(uint32_t codepoint) const { return codepoint >= start && codepoint <= end; }
+    };
+
+    std::string pattern;
+    std::vector<char_range> ranges;
+    bool negated;
+    int min_count;
+    int max_count;  // -1 for unbounded
+};
+
+struct common_peg_json_string_parser {};
+
+struct common_peg_until_parser {
+    std::vector<std::string> delimiters;
+};
+
+struct common_peg_schema_parser {
+    common_peg_parser_id child;
+    std::string name;
+    std::shared_ptr<nlohmann::ordered_json> schema;
+
+    // Indicates if the GBNF should accept a raw string that matches the schema.
+    bool raw;
+};
+
+struct common_peg_rule_parser {
+    std::string name;
+    common_peg_parser_id child;
+    bool trigger;
+};
+
+struct common_peg_ref_parser {
+    std::string name;
+};
+
+struct common_peg_atomic_parser {
+    common_peg_parser_id child;
+};
+
+struct common_peg_tag_parser {
+    common_peg_parser_id child;
+    std::string tag;
+};
+
+// Variant holding all parser types
+using common_peg_parser_variant = std::variant<
+    common_peg_epsilon_parser,
+    common_peg_start_parser,
+    common_peg_end_parser,
+    common_peg_literal_parser,
+    common_peg_sequence_parser,
+    common_peg_choice_parser,
+    common_peg_repetition_parser,
+    common_peg_and_parser,
+    common_peg_not_parser,
+    common_peg_any_parser,
+    common_peg_space_parser,
+    common_peg_chars_parser,
+    common_peg_json_string_parser,
+    common_peg_until_parser,
+    common_peg_schema_parser,
+    common_peg_rule_parser,
+    common_peg_ref_parser,
+    common_peg_atomic_parser,
+    common_peg_tag_parser
+>;
+
+class common_peg_arena {
+    std::vector<common_peg_parser_variant> parsers_;
+    std::unordered_map<std::string, common_peg_parser_id> rules_;
+    common_peg_parser_id root_ = COMMON_PEG_INVALID_PARSER_ID;
+
+  public:
+    const common_peg_parser_variant & get(common_peg_parser_id id) const { return parsers_.at(id); }
+    common_peg_parser_variant & get(common_peg_parser_id id) { return parsers_.at(id); }
+
+    size_t size() const { return parsers_.size(); }
+    bool empty() const { return parsers_.empty(); }
+
+    common_peg_parser_id get_rule(const std::string & name) const;
+    bool has_rule(const std::string & name) const { return rules_.find(name) != rules_.end(); }
+
+    common_peg_parser_id root() const { return root_; }
+    void set_root(common_peg_parser_id id) { root_ = id; }
+
+    common_peg_parse_result parse(common_peg_parse_context & ctx, size_t start = 0) const;
+    common_peg_parse_result parse(common_peg_parser_id id, common_peg_parse_context & ctx, size_t start) const;
+
+    void resolve_refs();
+
+    void build_grammar(const common_grammar_builder & builder, bool lazy = false) const;
+
+    std::string dump(common_peg_parser_id id) const;
+
+    nlohmann::json to_json() const;
+    static common_peg_arena from_json(const nlohmann::json & j);
+
+    std::string save() const;
+    void load(const std::string & data);
+
+    friend class common_peg_parser_builder;
+
+  private:
+    common_peg_parser_id add_parser(common_peg_parser_variant parser);
+    void add_rule(const std::string & name, common_peg_parser_id id);
+
+    common_peg_parser_id resolve_ref(common_peg_parser_id id);
+};
+
+class common_peg_parser_builder {
+    common_peg_arena arena_;
+
+    common_peg_parser wrap(common_peg_parser_id id) { return common_peg_parser(id, *this); }
+    common_peg_parser add(const common_peg_parser_variant & p) { return wrap(arena_.add_parser(p)); }
+
+  public:
+    common_peg_parser_builder();
+
+    // Matches nothing, always succeeds.
+    //   S -> ε
+    common_peg_parser eps() { return add(common_peg_epsilon_parser{}); }
+
+    // Matches the start of the input.
+    //   S -> ^
+    common_peg_parser start() { return add(common_peg_start_parser{}); }
+
+    // Matches the end of the input.
+    //   S -> $
+    common_peg_parser end() { return add(common_peg_end_parser{}); }
+
+    // Matches an exact literal string.
+    //   S -> "hello"
+    common_peg_parser literal(const std::string & literal) { return add(common_peg_literal_parser{literal}); }
+
+    // Matches a sequence of parsers in order, all must succeed.
+    //   S -> A B C
+    common_peg_parser sequence() { return add(common_peg_sequence_parser{}); }
+    common_peg_parser sequence(const std::vector<common_peg_parser_id> & parsers);
+    common_peg_parser sequence(const std::vector<common_peg_parser> & parsers);
+    common_peg_parser sequence(std::initializer_list<common_peg_parser> parsers);
+
+    // Matches the first parser that succeeds from a list of alternatives.
+    //   S -> A | B | C
+    common_peg_parser choice() { return add(common_peg_choice_parser{}); }
+    common_peg_parser choice(const std::vector<common_peg_parser_id> & parsers);
+    common_peg_parser choice(const std::vector<common_peg_parser> & parsers);
+    common_peg_parser choice(std::initializer_list<common_peg_parser> parsers);
+
+    // Matches one or more repetitions of a parser.
+    //   S -> A+
+    common_peg_parser one_or_more(const common_peg_parser & p) { return repeat(p, 1, -1); }
+
+    // Matches zero or more repetitions of a parser, always succeeds.
+    //   S -> A*
+    common_peg_parser zero_or_more(const common_peg_parser & p) { return repeat(p, 0, -1); }
+
+    // Matches zero or one occurrence of a parser, always succeeds.
+    //   S -> A?
+    common_peg_parser optional(const common_peg_parser & p) { return repeat(p, 0, 1); }
+
+    // Positive lookahead: succeeds if child parser succeeds, consumes no input.
+    //   S -> &A
+    common_peg_parser peek(const common_peg_parser & p) { return add(common_peg_and_parser{p}); }
+
+    // Negative lookahead: succeeds if child parser fails, consumes no input.
+    //   S -> !A
+    common_peg_parser negate(const common_peg_parser & p) { return add(common_peg_not_parser{p}); }
+
+    // Matches any single character.
+    //   S -> .
+    common_peg_parser any() { return add(common_peg_any_parser{}); }
+
+    // Matches between min and max repetitions of characters from a character class.
+    //   S -> [a-z]{m,n}
+    //
+    // Use -1 for max to represent unbounded repetition (equivalent to {m,})
+    common_peg_parser chars(const std::string & classes, int min = 1, int max = -1);
+
+    // Creates a lightweight reference to a named rule (resolved during build()).
+    // Use this for forward references in recursive grammars.
+    //   expr_ref -> expr
+    common_peg_parser ref(const std::string & name) { return add(common_peg_ref_parser{name}); }
+
+    // Matches zero or more whitespace characters (space, tab, newline).
+    //   S -> [ \t\n]*
+    common_peg_parser space() { return add(common_peg_space_parser{}); }
+
+    // Matches all characters until a delimiter is found (delimiter not consumed).
+    //   S -> (!delim .)*
+    common_peg_parser until(const std::string & delimiter) { return add(common_peg_until_parser{{delimiter}}); }
+
+    // Matches all characters until one of the delimiters in the list is found (delimiter not consumed).
+    //   S -> (!delim .)*
+    common_peg_parser until_one_of(const std::vector<std::string> & delimiters) { return add(common_peg_until_parser{delimiters}); }
+
+    // Matches everything
+    //   S -> .*
+    common_peg_parser rest() { return until_one_of({}); }
+
+    // Matches between min and max repetitions of a parser (inclusive).
+    //   S -> A{m,n}
+    // Use -1 for max to represent unbounded repetition (equivalent to {m,})
+    common_peg_parser repeat(const common_peg_parser & p, int min, int max) { return add(common_peg_repetition_parser{p, min,max}); }
+
+    // Matches exactly n repetitions of a parser.
+    //   S -> A{n}
+    common_peg_parser repeat(const common_peg_parser & p, int n) { return repeat(p, n, n); }
+
+    // Creates a complete JSON parser supporting objects, arrays, strings, numbers, booleans, and null.
+    //   value -> object | array | string | number | true | false | null
+    common_peg_parser json();
+    common_peg_parser json_object();
+    common_peg_parser json_string();
+    common_peg_parser json_array();
+    common_peg_parser json_number();
+    common_peg_parser json_bool();
+    common_peg_parser json_null();
+
+    // Matches JSON string content without the surrounding quotes.
+    // Useful for extracting content within a JSON string.
+    common_peg_parser json_string_content();
+
+    // Matches a JSON object member with a key and associated parser as the
+    // value.
+    common_peg_parser json_member(const std::string & key, const common_peg_parser & p);
+
+    // Wraps a parser with JSON schema metadata for grammar generation.
+    // Used internally to convert JSON schemas to GBNF grammar rules.
+    common_peg_parser schema(const common_peg_parser & p, const std::string & name, const nlohmann::ordered_json & schema, bool raw = false);
+
+    // Creates a named rule, stores it in the grammar, and returns a ref.
+    // If trigger=true, marks this rule as an entry point for lazy grammar generation.
+    //   auto json = p.rule("json", json_obj | json_arr | ...)
+    common_peg_parser rule(const std::string & name, const common_peg_parser & p, bool trigger = false);
+
+    // Creates a named rule using a builder function, and returns a ref.
+    // If trigger=true, marks this rule as an entry point for lazy grammar generation.
+    //   auto json = p.rule("json", [&]() { return json_object() | json_array() | ... })
+    common_peg_parser rule(const std::string & name, const std::function<common_peg_parser()> & builder, bool trigger = false);
+
+    // Creates a trigger rule. When generating a lazy grammar from the parser,
+    // only trigger rules and descendents are emitted.
+    common_peg_parser trigger_rule(const std::string & name, const common_peg_parser & p) { return rule(name, p, true); }
+    common_peg_parser trigger_rule(const std::string & name, const std::function<common_peg_parser()> & builder) { return rule(name, builder, true); }
+
+    // Creates an atomic parser. Atomic parsers do not create an AST node if
+    // the child results in a partial parse, i.e. NEEDS_MORE_INPUT. This is
+    // intended for situations where partial output is undesirable.
+    common_peg_parser atomic(const common_peg_parser & p) { return add(common_peg_atomic_parser{p}); }
+
+    // Tags create nodes in the generated AST for semantic purposes.
+    // Unlike rules, you can tag multiple nodes with the same tag.
+    common_peg_parser tag(const std::string & tag, const common_peg_parser & p) { return add(common_peg_tag_parser{p.id(), tag}); }
+
+    void set_root(const common_peg_parser & p);
+
+    common_peg_arena build();
+};
+
+// Helper function for building parsers: `fn` receives a builder, describes the
+// grammar with it and returns a parser handle; the finished arena is returned.
+// NOTE(review): presumably the handle returned by `fn` becomes the arena root
+// (callers invoke parse() without an explicit id) - confirm in peg-parser.cpp.
+common_peg_arena build_peg_parser(const std::function<common_peg_parser(common_peg_parser_builder & builder)> & fn);
--- a/common/preset.cpp
+++ b/common/preset.cpp
@@ -0,0 +1,398 @@
+#include "arg.h"
+#include "preset.h"
+#include "peg-parser.h"
+#include "log.h"
+#include "download.h"
+
+#include <fstream>
+#include <sstream>
+#include <filesystem>
+
+// Returns a copy of `str` without its leading '-' characters ("--foo" -> "foo").
+// An empty string, or one made only of dashes, yields an empty string.
+static std::string rm_leading_dashes(const std::string & str) {
+    const size_t first_kept = str.find_first_not_of('-');
+    if (first_kept == std::string::npos) {
+        return std::string();
+    }
+    return str.substr(first_kept);
+}
+
+// Converts the preset into a CLI argument vector.
+//   bin_path: when non-empty, emitted first (argv[0] style).
+// Preset-only options are skipped. The last spelling in opt.args is used as
+// the argument name. Flags with a falsey value are emitted through their last
+// negated spelling, or dropped when no negated spelling exists.
+// Throws std::runtime_error for options that take two values.
+std::vector<std::string> common_preset::to_args(const std::string & bin_path) const {
+    std::vector<std::string> result;
+
+    if (!bin_path.empty()) {
+        result.push_back(bin_path);
+    }
+
+    for (const auto & [opt, value] : options) {
+        if (opt.is_preset_only) {
+            // preset-only options have no CLI counterpart
+            continue;
+        }
+
+        const bool takes_value_1 = opt.value_hint   != nullptr;
+        const bool takes_value_2 = opt.value_hint_2 != nullptr;
+
+        if (takes_value_1 && takes_value_2) {
+            throw std::runtime_error(string_format(
+                "common_preset::to_args(): option '%s' has two values, which is not supported yet",
+                opt.args.back()
+            ));
+        }
+
+        if (takes_value_1) {
+            // "--name value"
+            result.push_back(opt.args.back());
+            result.push_back(value);
+            continue;
+        }
+
+        if (takes_value_2) {
+            // only the second hint is set: the name is emitted without a value
+            result.push_back(opt.args.back());
+            continue;
+        }
+
+        // plain flag
+        if (!common_arg_utils::is_falsey(value)) {
+            result.push_back(opt.args.back());
+        } else if (!opt.args_neg.empty()) {
+            result.push_back(opt.args_neg.back());
+        }
+        // TODO: maybe throw an error instead of silently dropping a falsey flag
+        //       that has no negated spelling?
+    }
+
+    return result;
+}
+
+// Serializes the preset as one INI section: a "[name]" header, one
+// "key = value" line per option (key = last spelling in opt.args without its
+// leading dashes), then a blank line. Newlines inside a value are written as
+// a backslash followed by the newline.
+std::string common_preset::to_ini() const {
+    std::ostringstream out;
+    out << "[" << name << "]\n";
+
+    for (const auto & entry : options) {
+        std::string escaped = entry.second;
+        string_replace_all(escaped, "\n", "\\\n");
+        out << rm_leading_dashes(entry.first.args.back()) << " = " << escaped << "\n";
+    }
+
+    out << "\n";
+    return out.str();
+}
+
+// Sets the option identified by its environment-variable name.
+// An option already present in the preset is updated in place; otherwise the
+// option is looked up in ctx.key_to_opt and inserted.
+// Throws std::runtime_error when `env` is unknown to ctx.
+void common_preset::set_option(const common_preset_context & ctx, const std::string & env, const std::string & value) {
+    // fast path: the option is already part of this preset
+    for (auto & entry : options) {
+        const common_arg & known = entry.first;
+        if (known.env && env == known.env) {
+            entry.second = value;
+            return;
+        }
+    }
+
+    // slow path: resolve the option definition through the context
+    const auto found = ctx.key_to_opt.find(env);
+    if (found == ctx.key_to_opt.end()) {
+        throw std::runtime_error(string_format(
+            "%s: option with env '%s' not found in ctx_params",
+            __func__, env.c_str()
+        ));
+    }
+    options[found->second] = value;
+}
+
+// Removes the first option whose environment-variable name equals `env`.
+// Does nothing when no such option is stored.
+void common_preset::unset_option(const std::string & env) {
+    auto it = options.begin();
+    while (it != options.end() && !(it->first.env && env == it->first.env)) {
+        ++it;
+    }
+    if (it != options.end()) {
+        options.erase(it);
+    }
+}
+
+// Looks up an option by its environment-variable name.
+// On success copies the stored value into `value` and returns true;
+// otherwise returns false and leaves `value` untouched.
+bool common_preset::get_option(const std::string & env, std::string & value) const {
+    for (auto it = options.begin(); it != options.end(); ++it) {
+        const common_arg & candidate = it->first;
+        if (!candidate.env || !(env == candidate.env)) {
+            continue;
+        }
+        value = it->second;
+        return true;
+    }
+    return false;
+}
+
+// Copies every option of `other` into this preset; on conflict the value from
+// `other` wins. The preset name is not modified.
+void common_preset::merge(const common_preset & other) {
+    for (auto it = other.options.begin(); it != other.options.end(); ++it) {
+        options.insert_or_assign(it->first, it->second);
+    }
+}
+
+// Reads the INI file at `path` and returns its contents as
+//   section name -> (key -> value).
+// Key/value lines that appear before the first section header are stored
+// under COMMON_PRESET_DEFAULT_NAME.
+// Throws std::runtime_error when the file does not exist, cannot be opened,
+// or does not match the grammar below.
+static std::map<std::string, std::map<std::string, std::string>> parse_ini_from_file(const std::string & path) {
+    std::map<std::string, std::map<std::string, std::string>> parsed;
+
+    if (!std::filesystem::exists(path)) {
+        throw std::runtime_error("preset file does not exist: " + path);
+    }
+
+    std::ifstream file(path);
+    if (!file.good()) {
+        throw std::runtime_error("failed to open server preset file: " + path);
+    }
+
+    // slurp the whole file into memory
+    std::string contents((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
+
+    // the grammar is built once (function-local static) and reused across calls
+    static const auto parser = build_peg_parser([](auto & p) {
+        // newline ::= "\r\n" / "\n" / "\r"
+        auto newline = p.rule("newline", p.literal("\r\n") | p.literal("\n") | p.literal("\r"));
+
+        // ws ::= [ \t]*
+        auto ws = p.rule("ws", p.chars("[ \t]", 0, -1));
+
+        // comment ::= [;#] (!newline .)*
+        auto comment = p.rule("comment", p.chars("[;#]", 1, 1) + p.zero_or_more(p.negate(newline) + p.any()));
+
+        // eol ::= ws comment? (newline / EOF)
+        auto eol = p.rule("eol", ws + p.optional(comment) + (newline | p.end()));
+
+        // ident ::= [a-zA-Z_] [a-zA-Z0-9_.-]*
+        auto ident = p.rule("ident", p.chars("[a-zA-Z_]", 1, 1) + p.chars("[a-zA-Z0-9_.-]", 0, -1));
+
+        // eol-start ::= ws ([;#] / newline / EOF)
+        // value ::= (!eol-start .)*
+        auto eol_start = p.rule("eol-start", ws + (p.chars("[;#]", 1, 1) | newline | p.end()));
+        auto value = p.rule("value", p.zero_or_more(p.negate(eol_start) + p.any()));
+
+        // header-line ::= "[" ws [^\]]+ ws "]" eol
+        // (the section name is any run of characters other than ']', not an ident)
+        // NOTE(review): [^\]]+ also matches spaces/tabs, so a header written as
+        // "[ name ]" presumably yields a section name with trailing whitespace - confirm.
+        auto header_line = p.rule("header-line", "[" + ws + p.tag("section-name", p.chars("[^]]")) + ws + "]" + eol);
+
+        // kv-line ::= ident ws "=" ws value eol
+        auto kv_line = p.rule("kv-line", p.tag("key", ident) + ws + "=" + ws + p.tag("value", value) + eol);
+
+        // comment-line ::= ws comment (newline / EOF)
+        auto comment_line = p.rule("comment-line", ws + comment + (newline | p.end()));
+
+        // blank-line ::= ws (newline / EOF)
+        auto blank_line = p.rule("blank-line", ws + (newline | p.end()));
+
+        // line ::= header-line / kv-line / comment-line / blank-line
+        auto line = p.rule("line", header_line | kv_line | comment_line | blank_line);
+
+        // ini ::= line* EOF
+        auto ini = p.rule("ini", p.zero_or_more(line) + p.end());
+
+        return ini;
+    });
+
+    common_peg_parse_context ctx(contents);
+    const auto result = parser.parse(ctx);
+    if (!result.success()) {
+        throw std::runtime_error("failed to parse server config file: " + path);
+    }
+
+    // state carried across the tagged AST nodes while visiting
+    std::string current_section = COMMON_PRESET_DEFAULT_NAME;
+    std::string current_key;
+
+    ctx.ast.visit(result, [&](const auto & node) {
+        if (node.tag == "section-name") {
+            const std::string section = std::string(node.text);
+            current_section = section;
+            // a header always (re)starts its section empty, so declaring the
+            // same section twice discards the keys collected for it so far
+            parsed[current_section] = {};
+        } else if (node.tag == "key") {
+            const std::string key = std::string(node.text);
+            current_key = key;
+        } else if (node.tag == "value" && !current_key.empty() && !current_section.empty()) {
+            // pair the value with the most recent key, then consume the key
+            parsed[current_section][current_key] = std::string(node.text);
+            current_key.clear();
+        }
+    });
+
+    return parsed;
+}
+
+// Builds the lookup table used to resolve preset keys: every env name and
+// every argument spelling (without leading dashes) of each option in
+// ctx_params maps to a copy of that option. Later options overwrite earlier
+// ones when keys collide.
+static std::map<std::string, common_arg> get_map_key_opt(common_params_context & ctx_params) {
+    std::map<std::string, common_arg> table;
+    for (const auto & option : ctx_params.options) {
+        const auto bind = [&table, &option](const std::string & key) {
+            table[key] = option;
+        };
+        for (const auto & env_name : option.get_env()) {
+            bind(env_name);
+        }
+        for (const auto & arg_name : option.get_args()) {
+            bind(rm_leading_dashes(arg_name));
+        }
+    }
+    return table;
+}
+
+// An option is treated as boolean when it declares at least one negated spelling.
+static bool is_bool_arg(const common_arg & arg) {
+    const bool has_negated_spelling = !arg.args_neg.empty();
+    return has_negated_spelling;
+}
+
+// Normalizes the value of a boolean option.
+// When `key` is one of the option's negated spellings the meaning is flipped:
+// a truthy value becomes "false", anything else becomes "true".
+// For any other key the value is returned unchanged.
+static std::string parse_bool_arg(const common_arg & arg, const std::string & key, const std::string & value) {
+    bool key_is_negated = false;
+    for (auto it = arg.args_neg.begin(); it != arg.args_neg.end() && !key_is_negated; ++it) {
+        key_is_negated = rm_leading_dashes(*it) == key;
+    }
+
+    if (!key_is_negated) {
+        return value;
+    }
+    if (common_arg_utils::is_truthy(value)) {
+        return "false";
+    }
+    return "true";
+}
+
+// Builds the option tables for the given example/tool.
+// ctx_params is initialized from the default_params member, so default_params
+// must be declared before ctx_params in the struct (members are initialized
+// in declaration order).
+common_preset_context::common_preset_context(llama_example ex)
+        : ctx_params(common_params_parser_init(default_params, ex)) {
+    // extend the regular option list with the preset options
+    common_params_add_preset_options(ctx_params.options);
+    // index every option by env name and by dash-less argument name
+    key_to_opt = get_map_key_opt(ctx_params);
+}
+
+// Loads presets from the INI file at `path`.
+//   path:   INI file to read (see parse_ini_from_file for the accepted syntax)
+//   global: receives the preset of the "*" section, if the file has one;
+//           left untouched otherwise
+// Returns one preset per remaining section, keyed by section name. Keys that
+// do not map to a known option are ignored. Boolean options given through a
+// negated spelling are stored with the flipped value.
+// Propagates std::runtime_error from parse_ini_from_file.
+common_presets common_preset_context::load_from_ini(const std::string & path, common_preset & global) const {
+    common_presets out;
+    const auto ini_data = parse_ini_from_file(path);
+
+    // iterate by reference: copying each section would duplicate its whole key/value map
+    for (const auto & [section_name, entries] : ini_data) {
+        common_preset preset;
+        preset.name = section_name.empty() ? std::string(COMMON_PRESET_DEFAULT_NAME) : section_name;
+        LOG_DBG("loading preset: %s\n", preset.name.c_str());
+
+        for (const auto & [key, value] : entries) {
+            LOG_DBG("option: %s = %s\n", key.c_str(), value.c_str());
+
+            // single lookup instead of find() followed by at()
+            const auto it = key_to_opt.find(key);
+            if (it == key_to_opt.end()) {
+                // TODO: maybe warn about unknown key?
+                continue;
+            }
+
+            const common_arg & opt = it->second;
+            std::string & stored = preset.options[opt];
+            stored = is_bool_arg(opt) ? parse_bool_arg(opt, key, value) : value;
+            LOG_DBG("accepted option: %s = %s\n", key.c_str(), stored.c_str());
+        }
+
+        if (preset.name == "*") {
+            // handle global preset
+            global = std::move(preset);
+        } else {
+            // the key is looked up before the preset is moved into the mapped slot
+            out[preset.name] = std::move(preset);
+        }
+    }
+
+    return out;
+}
+
+// Creates one preset per model reported by common_list_cached_models().
+// Each preset is named after the model's string form and carries that same
+// string as its LLAMA_ARG_HF_REPO option.
+common_presets common_preset_context::load_from_cache() const {
+    common_presets presets;
+
+    for (const auto & cached : common_list_cached_models()) {
+        const std::string repo = cached.to_string();
+
+        common_preset preset;
+        preset.name = repo;
+        preset.set_option(*this, "LLAMA_ARG_HF_REPO", repo);
+        presets[repo] = std::move(preset);
+    }
+
+    return presets;
+}
+
+// A model discovered on disk by load_from_models_dir().
+struct local_model {
+    std::string name;        // preset name (sub-directory name, or file name without ".gguf")
+    std::string path;        // model file; the "-00001-of-" shard is preferred when one is present
+    std::string path_mmproj; // mmproj file found next to the model; empty when none
+};
+
+// Generates presets from a local models directory.
+//   - every "*.gguf" file directly inside models_dir becomes a preset named
+//     after the file without its ".gguf" extension
+//   - every sub-directory becomes a preset named after the directory; its
+//     model is the "-00001-of-" shard when present, otherwise a non-mmproj
+//     ".gguf" file, plus an optional mmproj file
+// Each preset gets LLAMA_ARG_MODEL and, when available, LLAMA_ARG_MMPROJ.
+// Throws std::runtime_error when models_dir is missing or not a directory.
+common_presets common_preset_context::load_from_models_dir(const std::string & models_dir) const {
+    if (!std::filesystem::exists(models_dir) || !std::filesystem::is_directory(models_dir)) {
+        throw std::runtime_error(string_format("error: '%s' does not exist or is not a directory\n", models_dir.c_str()));
+    }
+
+    static const std::string gguf_ext = ".gguf";
+
+    std::vector<local_model> models;
+    auto scan_subdir = [&models](const std::string & subdir_path, const std::string & name) {
+        auto files = fs_list(subdir_path, false);
+        common_file_info model_file;
+        common_file_info first_shard_file;
+        common_file_info mmproj_file;
+        for (const auto & file : files) {
+            if (string_ends_with(file.name, gguf_ext)) {
+                if (file.name.find("mmproj") != std::string::npos) {
+                    mmproj_file = file;
+                } else if (file.name.find("-00001-of-") != std::string::npos) {
+                    first_shard_file = file;
+                } else {
+                    model_file = file;
+                }
+            }
+        }
+        // multi-shard model: point at the first shard; otherwise use the plain model file
+        local_model model{
+            /* name        */ name,
+            /* path        */ first_shard_file.path.empty() ? model_file.path : first_shard_file.path,
+            /* path_mmproj */ mmproj_file.path // can be empty
+        };
+        // directories without any usable model file are skipped
+        if (!model.path.empty()) {
+            models.push_back(model);
+        }
+    };
+
+    auto files = fs_list(models_dir, true);
+    for (const auto & file : files) {
+        if (file.is_dir) {
+            scan_subdir(file.path, file.name);
+        } else if (string_ends_with(file.name, gguf_ext)) {
+            // single file model
+            // strip only the trailing extension: replacing every ".gguf"
+            // occurrence would also mangle names such as "a.gguf.b.gguf"
+            std::string name = file.name;
+            name.erase(name.size() - gguf_ext.size());
+            local_model model{
+                /* name        */ name,
+                /* path        */ file.path,
+                /* path_mmproj */ ""
+            };
+            models.push_back(model);
+        }
+    }
+
+    // convert local models to presets
+    common_presets out;
+    for (const auto & model : models) {
+        common_preset preset;
+        preset.name = model.name;
+        preset.set_option(*this, "LLAMA_ARG_MODEL", model.path);
+        if (!model.path_mmproj.empty()) {
+            preset.set_option(*this, "LLAMA_ARG_MMPROJ", model.path_mmproj);
+        }
+        out[preset.name] = preset;
+    }
+
+    return out;
+}
+
+// Builds a single preset, named COMMON_PRESET_DEFAULT_NAME, from CLI arguments.
+// Throws std::runtime_error when common_params_to_map() reports failure.
+common_preset common_preset_context::load_from_args(int argc, char ** argv) const {
+    common_preset from_cli;
+    from_cli.name = COMMON_PRESET_DEFAULT_NAME;
+
+    if (!common_params_to_map(argc, argv, ctx_params.ex, from_cli.options)) {
+        throw std::runtime_error("failed to parse CLI arguments into preset");
+    }
+
+    return from_cli;
+}
+
+// Layers `added` on top of `base`, preset by preset.
+// Presets present in both are merged (options from `added` win); presets that
+// only exist in `added` are copied over unchanged.
+common_presets common_preset_context::cascade(const common_presets & base, const common_presets & added) const {
+    common_presets merged(base);
+
+    for (const auto & entry : added) {
+        const auto existing = merged.find(entry.first);
+        if (existing == merged.end()) {
+            // new preset: take it as is
+            merged.emplace(entry.first, entry.second);
+            continue;
+        }
+        // known preset: overlay the added options
+        existing->second.merge(entry.second);
+    }
+
+    return merged;
+}
+
+// Applies every preset in `presets` on top of a copy of `base`.
+// Each resulting preset keeps the name of the overlaying preset's map key and
+// holds the options of `base` overridden by that preset's own options.
+common_presets common_preset_context::cascade(const common_preset & base, const common_presets & presets) const {
+    common_presets result;
+
+    for (auto it = presets.begin(); it != presets.end(); ++it) {
+        common_preset layered(base);
+        layered.name = it->first;
+        layered.merge(it->second);
+        result[it->first] = std::move(layered);
+    }
+
+    return result;
+}
--- a/common/preset.h
+++ b/common/preset.h
@@ -0,0 +1,74 @@
+#pragma once
+
+#include "common.h"
+#include "arg.h"
+
+#include <string>
+#include <vector>
+#include <map>
+
+//
+// INI preset parser and writer
+//
+
+constexpr const char * COMMON_PRESET_DEFAULT_NAME = "default";
+
+struct common_preset_context;
+
+// A named set of option values that can be rendered as CLI arguments or as an
+// INI section.
+struct common_preset {
+    std::string name;
+
+    // options are stored as common_arg to string mapping, representing CLI arg and its value
+    std::map<common_arg, std::string> options;
+
+    // convert preset to CLI argument list
+    // bin_path, when non-empty, is emitted as the first element
+    // preset-only options are skipped; throws std::runtime_error for options taking two values
+    std::vector<std::string> to_args(const std::string & bin_path = "") const;
+
+    // convert preset to INI format string ("[name]" header followed by "key = value" lines)
+    std::string to_ini() const;
+
+    // TODO: maybe implement to_env() if needed
+
+    // modify preset options where argument is identified by its env variable
+    // throws std::runtime_error if the env variable is not known to ctx
+    void set_option(const common_preset_context & ctx, const std::string & env, const std::string & value);
+
+    // unset option by its env variable (no-op if the option is not set)
+    void unset_option(const std::string & env);
+
+    // get option value by its env variable, return false if not found
+    // (value is left unmodified in that case)
+    bool get_option(const std::string & env, std::string & value) const;
+
+    // merge another preset into this one, overwriting existing options
+    void merge(const common_preset & other);
+};
+
+// interface for multiple presets in one file
+using common_presets = std::map<std::string, common_preset>;
+
+// context for loading and editing presets
+struct common_preset_context {
+    // handed to common_params_parser_init() when ctx_params is constructed;
+    // must stay declared before ctx_params (members are initialized in declaration order)
+    common_params default_params;
+    common_params_context ctx_params;
+    // lookup table: env variable name or dash-less argument name -> option
+    std::map<std::string, common_arg> key_to_opt;
+    common_preset_context(llama_example ex);
+
+    // load presets from INI file
+    // the "*" section, if present, is written to `global` instead of being returned
+    common_presets load_from_ini(const std::string & path, common_preset & global) const;
+
+    // generate presets from cached models
+    common_presets load_from_cache() const;
+
+    // generate presets from local models directory
+    // for the directory structure, see "Using multiple models" in server/README.md
+    // throws std::runtime_error if models_dir does not exist or is not a directory
+    common_presets load_from_models_dir(const std::string & models_dir) const;
+
+    // generate one preset from CLI arguments
+    // throws std::runtime_error if the arguments cannot be parsed
+    common_preset load_from_args(int argc, char ** argv) const;
+
+    // cascade multiple presets if exist on both: base < added
+    // if preset does not exist in base, it will be added without modification
+    common_presets cascade(const common_presets & base, const common_presets & added) const;
+
+    // apply presets over a base preset (same idea as CSS cascading)
+    // every returned preset keeps the name (map key) of the overlaying preset
+    common_presets cascade(const common_preset & base, const common_presets & presets) const;
+};
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -116,7 +116,6 @@ struct common_sampler {
    void reset() {
        prev.clear();

-        llama_sampler_reset(grmr);
        llama_sampler_reset(chain);
    }

@@ -167,7 +166,11 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co

    lparams.no_perf = params.no_perf;

-    struct llama_sampler * grmr;
+    llama_sampler * grmr = nullptr;
+    llama_sampler * chain = llama_sampler_chain_init(lparams);
+
+    std::vector<llama_sampler *> samplers;
+
    if (params.grammar.compare(0, 11, "%llguidance") == 0) {
 #ifdef LLAMA_USE_LLGUIDANCE
        grmr = llama_sampler_init_llg(vocab, "lark", params.grammar.c_str());
@@ -217,30 +220,20 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
            trigger_patterns_c.push_back(regex.c_str());
        }

-        grmr = params.grammar_lazy
-             ? llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
-                                                        trigger_patterns_c.data(), trigger_patterns_c.size(),
-                                                        trigger_tokens.data(), trigger_tokens.size())
-             :      llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
-        if (!grmr) {
-            return nullptr;
+        if (!params.grammar.empty()) {
+             if (params.grammar_lazy) {
+                 grmr = llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
+                         trigger_patterns_c.data(), trigger_patterns_c.size(),
+                         trigger_tokens.data(), trigger_tokens.size());
+             } else {
+                 grmr = llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
+             }
        }
    }

-    auto * result = new common_sampler {
-        /* .params = */ params,
-        /* .grmr   = */ grmr,
-        /* .chain  = */ llama_sampler_chain_init(lparams),
-        /* .prev   = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
-        /* .cur    = */ {},
-        /* .cur_p  = */ {},
-    };
-
-    llama_sampler_chain_add(result->chain,
-            llama_sampler_init_logit_bias(
-                llama_vocab_n_tokens(vocab),
-                params.logit_bias.size(),
-                params.logit_bias.data()));
+    if (params.has_logit_bias()) {
+        samplers.push_back(llama_sampler_init_logit_bias(llama_vocab_n_tokens(vocab), params.logit_bias.size(), params.logit_bias.data()));
+    }

    if (params.mirostat == 0) {
        for (const auto & cnstr : params.samplers) {
@@ -253,58 +246,71 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                            c_breakers.push_back(str.c_str());
                        }

-                        llama_sampler_chain_add(result->chain, llama_sampler_init_dry      (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
+                        samplers.push_back(llama_sampler_init_dry    (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
                    }
                    break;
                case COMMON_SAMPLER_TYPE_TOP_K:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_k       (params.top_k));
+                    samplers.push_back(llama_sampler_init_top_k      (params.top_k));
                    break;
                case COMMON_SAMPLER_TYPE_TOP_P:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_p       (params.top_p, params.min_keep));
+                    samplers.push_back(llama_sampler_init_top_p      (params.top_p, params.min_keep));
                    break;
                case COMMON_SAMPLER_TYPE_TOP_N_SIGMA:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
+                    samplers.push_back(llama_sampler_init_top_n_sigma(params.top_n_sigma));
                    break;
                case COMMON_SAMPLER_TYPE_MIN_P:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_min_p       (params.min_p, params.min_keep));
+                    samplers.push_back(llama_sampler_init_min_p      (params.min_p, params.min_keep));
                    break;
                case COMMON_SAMPLER_TYPE_XTC:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_xtc         (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
+                    samplers.push_back(llama_sampler_init_xtc        (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
                    break;
                case COMMON_SAMPLER_TYPE_TYPICAL_P:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_typical     (params.typ_p, params.min_keep));
+                    samplers.push_back(llama_sampler_init_typical    (params.typ_p, params.min_keep));
                    break;
                case COMMON_SAMPLER_TYPE_TEMPERATURE:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext    (params.temp, params.dynatemp_range, params.dynatemp_exponent));
+                    samplers.push_back(llama_sampler_init_temp_ext   (params.temp, params.dynatemp_range, params.dynatemp_exponent));
                    break;
                case COMMON_SAMPLER_TYPE_INFILL:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_infill      (vocab));
+                    samplers.push_back(llama_sampler_init_infill     (vocab));
                    break;
                case COMMON_SAMPLER_TYPE_PENALTIES:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_penalties   (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
+                    samplers.push_back(llama_sampler_init_penalties  (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
                    break;
                default:
                    GGML_ASSERT(false && "unknown sampler type");
            }
        }
-        llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
+
+        samplers.push_back(llama_sampler_init_dist(params.seed));
    } else if (params.mirostat == 1) {
-        llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
-        llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
+        samplers.push_back(llama_sampler_init_temp(params.temp));
+        samplers.push_back(llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
    } else if (params.mirostat == 2) {
-        llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
-        llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
+        samplers.push_back(llama_sampler_init_temp(params.temp));
+        samplers.push_back(llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
    } else {
        GGML_ASSERT(false && "unknown mirostat version");
    }

+    for (auto * smpl : samplers) {
+        llama_sampler_chain_add(chain, smpl);
+    }
+
+    auto * result = new common_sampler {
+        /* .params  = */ params,
+        /* .grmr    = */ grmr,
+        /* .chain   = */ chain,
+        /* .prev    = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
+        /* .cur     = */ {},
+        /* .cur_p   = */ {},
+    };
+
    return result;
 }

 void common_sampler_free(struct common_sampler * gsmpl) {
    if (gsmpl) {
        llama_sampler_free(gsmpl->grmr);
-
        llama_sampler_free(gsmpl->chain);

        delete gsmpl;
@@ -314,7 +320,7 @@ void common_sampler_free(struct common_sampler * gsmpl) {
 void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
    const auto tm = gsmpl->tm();

-    if (accept_grammar) {
+    if (gsmpl->grmr && accept_grammar) {
        llama_sampler_accept(gsmpl->grmr, token);
    }

@@ -329,12 +335,12 @@ void common_sampler_reset(struct common_sampler * gsmpl) {

 struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
    return new common_sampler {
-        /* .params = */ gsmpl->params,
-        /* .grmr   = */ llama_sampler_clone(gsmpl->grmr),
-        /* .chain  = */ llama_sampler_clone(gsmpl->chain),
-        /* .prev   = */ gsmpl->prev,
-        /* .cur    = */ gsmpl->cur,
-        /* .cur_p  = */ gsmpl->cur_p,
+        /* .params  = */ gsmpl->params,
+        /* .grmr    = */ llama_sampler_clone(gsmpl->grmr),
+        /* .chain   = */ llama_sampler_clone(gsmpl->chain),
+        /* .prev    = */ gsmpl->prev,
+        /* .cur     = */ gsmpl->cur,
+        /* .cur_p   = */ gsmpl->cur_p,
    };
 }

@@ -383,33 +389,37 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
    }
 }

+struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl) { // accessor for the underlying sampler chain
+    return gsmpl->chain; // same pointer that common_sampler_free() releases, so the caller must not free it; gsmpl must be non-null
+}
+
 llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
    llama_synchronize(ctx);

    // start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations
    const auto tm = gsmpl->tm();

-    gsmpl->set_logits(ctx, idx);
+    llama_token id = LLAMA_TOKEN_NULL;

    auto & grmr  = gsmpl->grmr;
    auto & chain = gsmpl->chain;
    auto & cur_p = gsmpl->cur_p; // initialized by set_logits

+    gsmpl->set_logits(ctx, idx);
+
    if (grammar_first) {
        llama_sampler_apply(grmr, &cur_p);
    }

    llama_sampler_apply(chain, &cur_p);

-    GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");
-
-    const llama_token id = cur_p.data[cur_p.selected].id;
+    id = cur_p.data[cur_p.selected].id;

    if (grammar_first) {
        return id;
    }

-    // check if it the sampled token fits the grammar
+    // check if the sampled token fits the grammar (grammar-based rejection sampling)
    {
        llama_token_data       single_token_data       = { id, 1.0f, 0.0f };
        llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };
@@ -429,9 +439,11 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
    llama_sampler_apply(grmr,  &cur_p);
    llama_sampler_apply(chain, &cur_p);

-    GGML_ASSERT(cur_p.selected != -1 && "no selected token during re-sampling - check your sampling configuration");
+    GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");

-    return cur_p.data[cur_p.selected].id;
+    id = cur_p.data[cur_p.selected].id;
+
+    return id;
 }

 std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first) {
@@ -515,7 +527,8 @@ std::string common_sampler_print(const struct common_sampler * gsmpl) {

    for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
        const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
-        result += std::string("-> ") + llama_sampler_name(smpl) + " ";
+        result += std::string("-> ");
+        result += std::string(llama_sampler_name(smpl)) + " ";
    }

    return result;
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -48,6 +48,8 @@ struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
 // arguments can be nullptr to skip printing
 void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl);

+struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl);
+
 // extended sampling implementation:
 //
 // - set logits
@@ -107,3 +109,9 @@ std::vector<enum common_sampler_type> common_sampler_types_from_chars(const std:

 llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab,
                const char * grammar_kind, const char * grammar_data);
+
+struct common_sampler_deleter { // deleter functor that lets a common_sampler be owned by std::unique_ptr
+    void operator()(common_sampler * s) { common_sampler_free(s); } // common_sampler_free() is a no-op for nullptr
+};
+
+typedef std::unique_ptr<common_sampler, common_sampler_deleter> common_sampler_ptr;
--- a/common/unicode.cpp
+++ b/common/unicode.cpp
@@ -0,0 +1,64 @@
+#include "unicode.h"
+
+// implementation adopted from src/unicode.cpp
+
+size_t utf8_sequence_length(unsigned char first_byte) { // sequence length implied by the first byte; the byte itself is not validated
+    const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 }; // indexed by the high nibble: 0x0-0xB -> 1 (includes continuation bytes), 0xC-0xD -> 2, 0xE -> 3, 0xF -> 4
+    uint8_t highbits = static_cast<uint8_t>(first_byte) >> 4;
+    return lookup[highbits]; // always in 1..4, never 0
+}
+
+utf8_parse_result parse_utf8_codepoint(std::string_view input, size_t offset) { // decodes one codepoint starting at input[offset]; structural check only (overlong forms, surrogates and values > 0x10FFFF are not rejected)
+    if (offset >= input.size()) {
+        return utf8_parse_result(utf8_parse_result::INCOMPLETE); // nothing left to read: reported as INCOMPLETE, not INVALID
+    }
+
+    // ASCII fast path: high bit clear (0xxxxxxx), the byte is the codepoint
+    if (!(input[offset] & 0x80)) {
+        return utf8_parse_result(utf8_parse_result::SUCCESS, input[offset], 1);
+    }
+
+    // Invalid: continuation byte (10xxxxxx) as first byte
+    if (!(input[offset] & 0x40)) {
+        return utf8_parse_result(utf8_parse_result::INVALID);
+    }
+
+    // 2-byte sequence (110xxxxx 10xxxxxx)
+    if (!(input[offset] & 0x20)) {
+        if (offset + 1 >= input.size()) {
+            return utf8_parse_result(utf8_parse_result::INCOMPLETE); // input ends before the continuation byte
+        }
+        if ((input[offset + 1] & 0xc0) != 0x80) {
+            return utf8_parse_result(utf8_parse_result::INVALID);
+        }
+        auto result = ((input[offset] & 0x1f) << 6) | (input[offset + 1] & 0x3f); // 5 payload bits + 6 payload bits
+        return utf8_parse_result(utf8_parse_result::SUCCESS, result, 2);
+    }
+
+    // 3-byte sequence (1110xxxx 10xxxxxx 10xxxxxx)
+    if (!(input[offset] & 0x10)) {
+        if (offset + 2 >= input.size()) {
+            return utf8_parse_result(utf8_parse_result::INCOMPLETE); // fewer than 3 bytes available; continuation bytes present so far are not inspected
+        }
+        if ((input[offset + 1] & 0xc0) != 0x80 || (input[offset + 2] & 0xc0) != 0x80) {
+            return utf8_parse_result(utf8_parse_result::INVALID);
+        }
+        auto result = ((input[offset] & 0x0f) << 12) | ((input[offset + 1] & 0x3f) << 6) | (input[offset + 2] & 0x3f); // 4 + 6 + 6 payload bits
+        return utf8_parse_result(utf8_parse_result::SUCCESS, result, 3);
+    }
+
+    // 4-byte sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
+    if (!(input[offset] & 0x08)) {
+        if (offset + 3 >= input.size()) {
+            return utf8_parse_result(utf8_parse_result::INCOMPLETE); // fewer than 4 bytes available; continuation bytes present so far are not inspected
+        }
+        if ((input[offset + 1] & 0xc0) != 0x80 || (input[offset + 2] & 0xc0) != 0x80 || (input[offset + 3] & 0xc0) != 0x80) {
+            return utf8_parse_result(utf8_parse_result::INVALID);
+        }
+        auto result = ((input[offset] & 0x07) << 18) | ((input[offset + 1] & 0x3f) << 12) | ((input[offset + 2] & 0x3f) << 6) | (input[offset + 3] & 0x3f); // 3 + 6 + 6 + 6 payload bits
+        return utf8_parse_result(utf8_parse_result::SUCCESS, result, 4);
+    }
+
+    // Invalid first byte (0xF8-0xFF: all five top bits set)
+    return utf8_parse_result(utf8_parse_result::INVALID);
+}
--- a/common/unicode.h
+++ b/common/unicode.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include <cstdint>
+#include <string_view>
+
+// UTF-8 parsing utilities for streaming-aware unicode support
+
+struct utf8_parse_result { // outcome of decoding a single UTF-8 codepoint
+    uint32_t codepoint;      // Decoded codepoint (only valid if status == SUCCESS); 0 when not supplied
+    size_t bytes_consumed;   // How many bytes this codepoint uses (1-4); 0 when not supplied
+    enum status { SUCCESS, INCOMPLETE, INVALID } status; // INCOMPLETE: input ended before the sequence was complete; INVALID: malformed byte(s)
+
+    utf8_parse_result(enum status s, uint32_t cp = 0, size_t bytes = 0) // status is mandatory; codepoint and byte count default to 0
+        : codepoint(cp), bytes_consumed(bytes), status(s) {}
+};
+
+// Determine the expected length of a UTF-8 sequence from its first byte
+// Never returns 0 and does not validate: continuation bytes (0x80-0xBF) yield 1, 0xF8-0xFF yield 4
+size_t utf8_sequence_length(unsigned char first_byte);
+
+// Parse a single UTF-8 codepoint from input
+utf8_parse_result parse_utf8_codepoint(std::string_view input, size_t offset);
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -143,6 +143,7 @@ models = [
    {"name": "bailingmoe2",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-mini-base-2.0", },
    {"name": "granite-docling",  "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ibm-granite/granite-docling-258M", },
    {"name": "minimax-m2",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/MiniMaxAI/MiniMax-M2", },
+    {"name": "kormo",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/KORMo-Team/KORMo-tokenizer", },
 ]

 # some models are known to be broken upstream, so we will skip them as exceptions
--- a/docs/android.md
+++ b/docs/android.md
@@ -1,7 +1,27 @@

 # Android

-## Build on Android using Termux
+## Build GUI binding using Android Studio
+
+Import the `examples/llama.android` directory into Android Studio, then perform a Gradle sync and build the project.
+![Project imported into Android Studio](./android/imported-into-android-studio.jpg)
+
+This Android binding supports hardware acceleration up to `SME2` for **Arm** and `AMX` for **x86-64** CPUs on Android and ChromeOS devices.
+It automatically detects the host's hardware to load compatible kernels. As a result, it runs seamlessly on both the latest premium devices and older devices that may lack modern CPU features or have limited RAM, without requiring any manual configuration.
+
+A minimal Android app frontend is included to showcase the binding’s core functionalities:
+1.	**Parse GGUF metadata** via `GgufMetadataReader` from either a `ContentResolver` provided `Uri` from shared storage, or a local `File` from your app's private storage.
+2.	**Obtain an `InferenceEngine`** instance through the `AiChat` facade and load your selected model via its app-private file path.
+3.	**Send a raw user prompt** for automatic template formatting, prefill, and batch decoding. Then collect the generated tokens in a Kotlin `Flow`.
+
+For a production-ready experience that leverages advanced features such as system prompts and benchmarks, plus friendly UI features such as model management and Arm feature visualizer, check out [Arm AI Chat](https://play.google.com/store/apps/details?id=com.arm.aichat) on Google Play.
+This project is made possible through a collaborative effort by Arm's **CT-ML**, **CE-ML** and **STE** groups:
+
+| ![Home screen](https://naco-siren.github.io/ai-chat/policy/index/1-llm-starter-pack.png)  | ![System prompt](https://naco-siren.github.io/ai-chat/policy/index/5-system-prompt.png)  | !["Haiku"](https://naco-siren.github.io/ai-chat/policy/index/4-metrics.png)  |
+|:------------------------------------------------------:|:----------------------------------------------------:|:--------------------------------------------------------:|
+|                      Home screen                       |                    System prompt                     |                         "Haiku"                          |
+
+## Build CLI on Android using Termux

 [Termux](https://termux.dev/en/) is an Android terminal emulator and Linux environment app (no root required). As of writing, Termux is available experimentally in the Google Play Store; otherwise, it may be obtained directly from the project repo or on F-Droid.

@@ -32,7 +52,7 @@ To see what it might look like visually, here's an old demo of an interactive se

 https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4

-## Cross-compile using Android NDK
+## Cross-compile CLI using Android NDK
 It's possible to build `llama.cpp` for Android on your host system via CMake and the Android NDK. If you are interested in this path, ensure you already have an environment prepared to cross-compile programs for Android (i.e., install the Android SDK). Note that, unlike desktop environments, the Android environment ships with a limited set of native libraries, and so only those libraries are available to CMake when building with the Android NDK (see: https://developer.android.com/ndk/guides/stable_apis.)

 Once you're ready and have cloned `llama.cpp`, invoke the following in the project directory:
--- a/docs/android/imported-into-android-studio.jpg
+++ b/docs/android/imported-into-android-studio.jpg
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -42,6 +42,9 @@ The following releases are verified and recommended:

 ## News

+- 2025.11
+  - Support allocating more than 4GB of memory on the device.
+
 - 2025.2
  - Optimize MUL_MAT Q4_0 on Intel GPU for all dGPUs and built-in GPUs since MTL. Increase the performance of LLM (llama-2-7b.Q4_0.gguf) 21%-87% on Intel GPUs (MTL, ARL-H, Arc, Flex, PVC).
    |GPU|Base tokens/s|Increased tokens/s|Percent|
@@ -100,6 +103,8 @@ SYCL backend supports Intel GPU Family:
 - Intel Built-in Arc GPU
 - Intel iGPU in Core CPU (11th Generation Core CPU and newer, refer to [oneAPI supported GPU](https://www.intel.com/content/www/us/en/developer/articles/system-requirements/intel-oneapi-base-toolkit-system-requirements.html#inpage-nav-1-1)).

+On older Intel GPUs, you may try [OpenCL](/docs/backend/OPENCL.md) although the performance is not optimal, and some GPUs may not support OpenCL nor have any GPGPU capabilities.
+
 #### Verified devices

 | Intel GPU                     | Status  | Verified Model                        |
@@ -789,6 +794,8 @@ use 1 SYCL GPUs: [0] with Max compute units:512
 | GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because graph performance isn't yet better than non-graph performance. |
 | GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. |
 | ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
+| UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS | 0 (default) or 1 | Allow device memory allocations larger than 4GB.|
+


 ## Known Issues
@@ -835,6 +842,14 @@ use 1 SYCL GPUs: [0] with Max compute units:512
  | The default context is too big. It leads to excessive memory usage.|Set `-c 8192` or a smaller value.|
  | The model is too big and requires more memory than what is available.|Choose a smaller model or change to a smaller quantization, like Q5 -> Q4;<br>Alternatively, use more than one device to load model.|

+- `ggml_backend_sycl_buffer_type_alloc_buffer: can't allocate 5000000000 Bytes of memory on device`
+
+  You need to enable support for memory allocations larger than 4GB by setting the following variable (`export` on Linux, `set` on Windows):
+  ```
+    export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
+    set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
+  ```
+
 ### **GitHub contribution**:
 Please add the `SYCL :` prefix/tag in issues/PRs titles to help the SYCL contributors to check/address them without delay.

--- a/docs/backend/ZenDNN.md
+++ b/docs/backend/ZenDNN.md
@@ -0,0 +1,258 @@
+# llama.cpp for AMD ZenDNN
+
+> [!WARNING]
+> **Note:** ZenDNN is **not** the same as zDNN.
+> - **ZenDNN** (this page): AMD's deep learning library for AMD EPYC CPUs
+> - **zDNN**: IBM's Deep Neural Network acceleration library for IBM Z & LinuxONE Mainframes ([see zDNN documentation](zDNN.md))
+
+- [Background](#background)
+- [OS](#os)
+- [Hardware](#hardware)
+- [Supported Operations](#supported-operations)
+- [DataType Supports](#datatype-supports)
+- [Linux](#linux)
+- [Environment Variable](#environment-variable)
+- [Performance Optimization](#performance-optimization)
+- [Known Issues](#known-issues)
+- [TODO](#todo)
+
+## Background
+
+**ZenDNN** (Zen Deep Neural Network Library) is AMD's high-performance deep learning inference library optimized for AMD EPYC™ CPUs. It provides optimized implementations of key deep learning primitives and operations, delivering significant performance improvements for neural network workloads on AMD Zen-based processor architectures.
+
+**Llama.cpp + ZenDNN**
+
+The llama.cpp ZenDNN backend leverages AMD's optimized matrix multiplication primitives to accelerate inference on AMD CPUs. It utilizes ZenDNN's **LowOHA (Low Overhead Hardware Accelerated)** MatMul operator for efficient GEMM operations with minimal execution overhead, built-in weight caching, and direct access to backend libraries (AOCL BLIS, LibXSMM, OneDNN).
+
+For more information about ZenDNN, visit: https://www.amd.com/en/developer/zendnn.html
+
+## OS
+
+| OS      | Status  | Verified                                       |
+|:-------:|:-------:|:----------------------------------------------:|
+| Linux   | Support | Ubuntu 20.04, 22.04, 24.04                     |
+
+For the latest list of supported operating systems, see the [ZenDNN Supported OS](https://github.com/amd/ZenDNN/blob/zendnnl/README.md#15-supported-os).
+
+## Hardware
+
+### AMD CPUs
+
+**Recommended Processors**
+
+ZenDNN is optimized for AMD EPYC™ processors and AMD Ryzen™ processors based on "Zen" microarchitecture and newer.
+
+| CPU Family                    | Status  | Notes                              |
+|:-----------------------------:|:-------:|:----------------------------------:|
+| AMD EPYC™ 9005 Series (Turin)| Support | 5th Gen - Zen 5 architecture       |
+| AMD EPYC™ 9004 Series (Genoa)| Support | 4th Gen - Zen 4 architecture       |
+| AMD EPYC™ 7003 Series (Milan)| Support | 3rd Gen - Zen 3 architecture       |
+| AMD Ryzen™ AI MAX (Strix Halo)| Support | High-performance mobile processors |
+
+*Notes:*
+
+- Best performance is achieved on AMD EPYC™ processors with high core counts (e.g., EPYC 9005 series).
+- ZenDNN leverages AMD's advanced CPU features including AVX2 and AVX-512 instruction sets.
+- For optimal performance, ensure your system has sufficient memory bandwidth.
+
+## Supported Operations
+
+The ZenDNN backend currently accelerates **matrix multiplication (MUL_MAT)** operations only. Other operations are handled by the standard CPU backend.
+
+| Operation    | Status  | Notes                                          |
+|:-------------|:-------:|:----------------------------------------------:|
+| MUL_MAT      |    ✓    | Accelerated via ZenDNN LowOHA MatMul           |
+
+*Note:* Since only MUL_MAT is accelerated, models will benefit most from ZenDNN when matrix multiplications dominate the computational workload (which is typical for transformer-based LLMs).
+
+## DataType Supports
+
+| DataType               | Status  | Notes                                         |
+|:----------------------:|:-------:|:---------------------------------------------:|
+| FP32                   | Support | Full precision floating point                 |
+| BF16                   | Support | BFloat16 (best performance on Zen 4/Zen 5)    |
+
+*Notes:*
+
+- **BF16** provides best performance on Zen 4 and Zen 5 EPYC™ processors (Genoa, Turin).
+
+## Linux
+
+### I. Setup Environment
+
+You have two options to set up ZenDNN:
+
+#### Option 1: Automatic Download and Build (Recommended)
+
+CMake will automatically download and build ZenDNN for you:
+
+```sh
+# Build llama.cpp - ZenDNN will be automatically downloaded and built
+cmake -B build -DGGML_ZENDNN=ON -DCMAKE_BUILD_TYPE=Release
+cmake --build build --config Release -j $(nproc)
+```
+
+No manual ZenDNN installation required. CMake will handle everything automatically.
+
+#### Option 2: Use Custom ZenDNN Installation
+
+If you want to build ZenDNN yourself or use a specific version:
+
+**Step 1: Build ZenDNN from source**
+
+```sh
+# Clone ZenDNN repository
+git clone https://github.com/amd/ZenDNN.git
+cd ZenDNN
+git checkout zendnnl
+
+# Build and install (requires CMake >= 3.25)
+mkdir build && cd build
+cmake ..
+cmake --build . --target all
+```
+
+Default installation path: `ZenDNN/build/install`
+
+**For detailed build instructions**, refer to the [ZenDNN README](https://github.com/amd/ZenDNN/blob/zendnnl/README.md).
+
+**Step 2: Build llama.cpp with custom ZenDNN path**
+
+```sh
+# Using environment variable
+export ZENDNN_ROOT=/path/to/ZenDNN/build/install
+cmake -B build -DGGML_ZENDNN=ON -DCMAKE_BUILD_TYPE=Release
+cmake --build build --config Release -j $(nproc)
+
+# OR specify path directly in CMake
+cmake -B build -DGGML_ZENDNN=ON -DZENDNN_ROOT=/path/to/ZenDNN/build/install -DCMAKE_BUILD_TYPE=Release
+cmake --build build --config Release -j $(nproc)
+```
+
+### II. Run the Server
+
+#### 1. Download Model
+
+Download LLaMA 3.1 8B Instruct BF16 model:
+
+```sh
+# Download from Hugging Face
+huggingface-cli download meta-llama/Llama-3.1-8B-Instruct-GGUF --local-dir models/
+```
+
+#### 2. Start Server
+
+Run llama.cpp server with ZenDNN acceleration:
+
+```sh
+# Set optimal configuration
+export OMP_NUM_THREADS=64  # Adjust to your CPU core count
+export ZENDNNL_MATMUL_ALGO=2  # Blocked AOCL BLIS for best performance
+
+# Start server
+./build/bin/llama-server \
+    -m models/Llama-3.1-8B-Instruct.BF16.gguf \
+    --host 0.0.0.0 \
+    --port 8080 \
+    -t 64
+```
+
+Access the server at `http://localhost:8080`.
+
+**Performance tips**:
+- Set `OMP_NUM_THREADS` to match your physical core count
+- Use `ZENDNNL_MATMUL_ALGO=2` for optimal performance
+- For NUMA systems: `numactl --cpunodebind=0 --membind=0 ./build/bin/llama-server ...`
+
+## Environment Variable
+
+### Build Time
+
+| Name               | Value                                 | Function                                    |
+|--------------------|---------------------------------------|---------------------------------------------|
+| GGML_ZENDNN        | ON/OFF                                | Enable ZenDNN backend support               |
+| ZENDNN_ROOT        | Path to ZenDNN installation           | Set ZenDNN installation directory           |
+| GGML_OPENMP        | ON/OFF (recommended: ON)              | Enable OpenMP for multi-threading           |
+
+### Runtime
+
+| Name                    | Value                    | Function                                                          |
+|-------------------------|--------------------------|-------------------------------------------------------------------|
+| OMP_NUM_THREADS         | Number (e.g., 64)        | Set number of OpenMP threads (recommended: physical core count)   |
+| ZENDNNL_MATMUL_ALGO     | 0-5                      | Select MatMul backend algorithm (see Performance Optimization)    |
+| ZENDNNL_PROFILE_LOG_LEVEL | 0-4                    | Profiling log level (0=disabled, 4=verbose)                       |
+| ZENDNNL_ENABLE_PROFILER | 0 or 1                   | Enable detailed profiling (1=enabled)                             |
+| ZENDNNL_API_LOG_LEVEL   | 0-4                      | API log level (0=disabled, 4=verbose)                             |
+
+**Example**:
+
+```sh
+export OMP_NUM_THREADS=64
+export ZENDNNL_MATMUL_ALGO=2  # Use Blocked AOCL BLIS for best performance
+./build/bin/llama-cli -m models/Llama-3.1-8B-Instruct.BF16.gguf -p "Test" -n 100
+```
+
+## Performance Optimization
+
+### MatMul Algorithm Selection
+
+ZenDNN's LowOHA MatMul supports multiple backend algorithms. For **best performance**, use the **Blocked AOCL BLIS** algorithm:
+
+```sh
+export ZENDNNL_MATMUL_ALGO=2  # Blocked AOCL BLIS (recommended)
+```
+
+**Available algorithms**:
+
+| Value | Algorithm              | Description                                    |
+|:-----:|:-----------------------|:----------------------------------------------|
+| 0     | Dynamic Dispatch       | Automatic backend selection (default)         |
+| 1     | AOCL BLIS              | AOCL BLIS backend                             |
+| 2     | AOCL BLIS Blocked      | **Blocked AOCL BLIS (recommended)**           |
+| 3     | OneDNN                 | OneDNN backend                                |
+| 4     | OneDNN Blocked         | Blocked OneDNN                                |
+| 5     | LibXSMM                | LibXSMM backend                               |
+
+### Profiling and Debugging
+
+For detailed profiling and logging options, refer to the [ZenDNN Logging Documentation](https://github.com/amd/ZenDNN/blob/zendnnl/docs/logging.md).
+
+## Known Issues
+
+- **Limited operation support**: Currently only matrix multiplication (MUL_MAT) is accelerated via ZenDNN. Other operations fall back to the standard CPU backend.
+- **BF16 support**: BF16 operations require AMD Zen 4 or Zen 5 architecture (EPYC 9004/9005 series). On older CPUs, operations will use FP32.
+- **NUMA awareness**: For multi-socket systems, manual NUMA binding may be required for optimal performance.
+
+## Q&A
+
+**Q: How do I verify that ZenDNN backend is being used?**
+
+A: Check the log output when running llama.cpp. You should see messages indicating the ZenDNN backend is initialized. You can also check the backend name in the output.
+
+**Q: What performance improvement can I expect?**
+
+A: Performance gains vary depending on the model size, batch size, and CPU architecture. On AMD EPYC processors, you can typically expect 1.1x-2x speedup compared to standard CPU inference for matrix multiplication operations.
+
+**Q: Can I use ZenDNN on non-AMD processors?**
+
+A: ZenDNN is optimized specifically for AMD processors. While it may work on other x86-64 CPUs, performance benefits are only guaranteed on AMD Zen-based architectures.
+
+**Q: Does ZenDNN support quantized models?**
+
+A: Currently, ZenDNN primarily supports FP32 and BF16 data types. Quantized model support is not available at this time.
+
+**Q: Why is my inference not faster with ZenDNN?**
+
+A: Ensure:
+1. You're using an AMD EPYC or Ryzen processor (Zen 2 or newer)
+2. `OMP_NUM_THREADS` is set appropriately (physical core count)
+3. `ZENDNNL_MATMUL_ALGO=2` is set for best performance (Blocked AOCL BLIS)
+4. You're using a sufficiently large model (small models may not benefit as much)
+5. Enable profiling to verify ZenDNN MatMul is being called
+
+### **GitHub Contribution**:
+Please add the **[ZenDNN]** prefix/tag in issues/PRs titles to help the ZenDNN-team check/address them without delay.
+
+## TODO
+
+- Expand operation support beyond MUL_MAT (attention operations, activations, etc.)
--- a/docs/backend/hexagon/CMakeUserPresets.json
+++ b/docs/backend/hexagon/CMakeUserPresets.json
@@ -22,6 +22,7 @@
            "GGML_LLAMAFILE":   "OFF",
            "GGML_OPENCL":      "ON",
            "GGML_HEXAGON":     "ON",
+            "GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE": "128",
            "LLAMA_CURL":       "OFF"
        }
    },
@@ -36,6 +37,7 @@
            "GGML_LLAMAFILE":   "OFF",
            "GGML_OPENCL":      "ON",
            "GGML_HEXAGON":     "ON",
+            "GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE": "128",
            "LLAMA_CURL":       "OFF"
        }
    },
--- a/docs/backend/zDNN.md
+++ b/docs/backend/zDNN.md
@@ -1,5 +1,10 @@
 # llama.cpp for IBM zDNN Accelerator

+> [!WARNING]
+> **Note:** zDNN is **not** the same as ZenDNN.
+> - **zDNN** (this page): IBM's Deep Neural Network acceleration library for IBM Z & LinuxONE Mainframes
+> - **ZenDNN**: AMD's deep learning library for AMD EPYC CPUs ([see ZenDNN documentation](ZenDNN.md))
+
 ## Background

 IBM zDNN (Z Deep Neural Network) is a hardware acceleration library designed specifically to leverage the IBM NNPA (Neural Network Processor Assist) accelerator located within IBM Telum I and II processors. It provides significant performance improvements for neural network inference operations.
--- a/docs/build-riscv64-spacemit.md
+++ b/docs/build-riscv64-spacemit.md
@@ -19,6 +19,7 @@ cmake -B build \
    -DGGML_RVV=ON \
    -DGGML_RV_ZFH=ON \
    -DGGML_RV_ZICBOP=ON \
+    -DGGML_RV_ZIHINTPAUSE=ON \
    -DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1 \
    -DCMAKE_TOOLCHAIN_FILE=${PWD}/cmake/riscv64-spacemit-linux-gnu-gcc.cmake \
    -DCMAKE_INSTALL_PREFIX=build/installed
--- a/docs/build.md
+++ b/docs/build.md
@@ -431,11 +431,22 @@ docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/ren

 ### For Linux users:

+#### Using the LunarG Vulkan SDK
+
 First, follow the official LunarG instructions for the installation and setup of the Vulkan SDK in the [Getting Started with the Linux Tarball Vulkan SDK](https://vulkan.lunarg.com/doc/sdk/latest/linux/getting_started.html) guide.

 > [!IMPORTANT]
 > After completing the first step, ensure that you have used the `source` command on the `setup_env.sh` file inside of the Vulkan SDK in your current terminal session. Otherwise, the build won't work. Additionally, if you close out of your terminal, you must perform this step again if you intend to perform a build. However, there are ways to make this persistent. Refer to the Vulkan SDK guide linked in the first step for more information about any of this.

+#### Using system packages
+
+On Debian / Ubuntu, you can install the required dependencies using:
+```sh
+sudo apt-get install libvulkan-dev glslc
+```
+
+#### Common steps
+
 Second, after verifying that you have followed all of the SDK installation/setup steps, use this command to make sure before proceeding:
 ```bash
 vulkaninfo
@@ -484,6 +495,38 @@ llama_new_context_with_model:       CANN compute buffer size =  1260.81 MiB

 For detailed info, such as model/device supports, CANN install, please refer to [llama.cpp for CANN](./backend/CANN.md).

+## ZenDNN
+
+ZenDNN provides optimized deep learning primitives for AMD EPYC™ CPUs. It accelerates matrix multiplication operations for inference workloads.
+
+### Compilation
+
+- Using `CMake` on Linux (automatic build):
+
+    ```bash
+    cmake -B build -DGGML_ZENDNN=ON
+    cmake --build build --config Release
+    ```
+
+    The first build will automatically download and build ZenDNN, which may take 5-10 minutes. Subsequent builds will be much faster.
+
+- Using `CMake` with custom ZenDNN installation:
+
+    ```bash
+    cmake -B build -DGGML_ZENDNN=ON -DZENDNN_ROOT=/path/to/zendnn/install
+    cmake --build build --config Release
+    ```
+
+### Testing
+
+You can test with:
+
+```bash
+./build/bin/llama-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -n 50
+```
+
+For detailed information about hardware support, setup instructions, and performance optimization, refer to [llama.cpp for ZenDNN](./backend/ZenDNN.md).
+
 ## Arm® KleidiAI™
 KleidiAI is a library of optimized microkernels for AI workloads, specifically designed for Arm CPUs. These microkernels enhance performance and can be enabled for use by the CPU backend.

--- a/docs/development/HOWTO-add-model.md
+++ b/docs/development/HOWTO-add-model.md
@@ -9,7 +9,8 @@ Adding a model requires few steps:
 After following these steps, you can open PR.

 Also, it is important to check that the examples and main ggml backends (CUDA, METAL, CPU) are working with the new architecture, especially:
- [main](/tools/main/)
+- [cli](/tools/cli/)
+- [completion](/tools/completion/)
 - [imatrix](/tools/imatrix/)
 - [quantize](/tools/quantize/)
 - [server](/tools/server/)
@@ -96,7 +97,7 @@ The model params and tensors layout must be defined in `llama.cpp` source files:
 1. Define a new `llm_arch` enum value in `src/llama-arch.h`.
 2. In `src/llama-arch.cpp`:
    - Add the architecture name to the `LLM_ARCH_NAMES` map.
-    - Add the tensor mappings to the `LLM_TENSOR_NAMES` map.
+    - Add the list of model tensors to `llm_get_tensor_names` (you may also need to update `LLM_TENSOR_NAMES`)
 3. Add any non-standard metadata loading in the `llama_model_loader` constructor in `src/llama-model-loader.cpp`.
 4. If the model has a RoPE operation, add a case for the architecture in `llama_model_rope_type` function in `src/llama-model.cpp`.

--- a/docs/development/parsing.md
+++ b/docs/development/parsing.md
@@ -0,0 +1,288 @@
+# Parsing Model Output
+
+The `common` library contains a PEG parser implementation suitable for parsing
+model output.
+
+Types with the prefix `common_peg_*` are intended for general use and may have
+applications beyond parsing model output, such as parsing user-provided regex
+patterns.
+
+Types with the prefix `common_chat_peg_*` are specialized helpers for model
+output.
+
+The parser features:
+
+- Partial parsing of streaming input
+- Built-in JSON parsers
+- AST generation with semantics via "tagged" nodes
+
+## Example
+
+Below is a contrived example demonstrating how to use the PEG parser to parse
+output from a model that emits arguments as JSON.
+
+```cpp
+auto parser = build_chat_peg_native_parser([&](common_chat_peg_native_builder & p) {
+    // Build a choice of all available tools
+    auto tool_choice = p.choice();
+    for (const auto & tool : tools) {
+        const auto & function = tool.at("function");
+        std::string name = function.at("name");
+        const auto & schema = function.at("parameters");
+
+        auto tool_name = p.json_member("name", "\"" + p.literal(name) + "\"");
+        auto tool_args = p.json_member("arguments", p.schema(p.json(), "tool-" + name + "-schema", schema));
+
+        tool_choice |= p.rule("tool-" + name, "{" << tool_name << "," << tool_args << "}");
+    }
+
+    // Define the tool call structure: <tool_call>[{tool}]</tool_call>
+    auto tool_call = p.trigger_rule("tool-call",
+        p.sequence({
+            p.literal("<tool_call>["),
+            tool_choice,
+            p.literal("]</tool_call>")
+        })
+    );
+
+    // Parser accepts content, optionally followed by a tool call
+    return p.sequence({
+        p.content(p.until("<tool_call>")),
+        p.optional(tool_call),
+        p.end()
+    });
+});
+```
+
+For a more complete example, see `test_example_native()` in
+[tests/test-chat-peg-parser.cpp](/tests/test-chat-peg-parser.cpp).
+
+## Parsers/Combinators
+
+### Basic Matchers
+
+- **`eps()`** - Matches nothing and always succeeds (epsilon/empty match)
+- **`start()`** - Matches the start of input (anchor `^`)
+- **`end()`** - Matches the end of input (anchor `$`)
+- **`literal(string)`** - Matches an exact literal string
+- **`any()`** - Matches any single character (`.`)
+
+### Combinators
+
+- **`sequence(...)`** - Matches parsers in order; all must succeed
+- **`choice(...)`** - Matches the first parser that succeeds from alternatives (ordered choice)
+- **`one_or_more(p)`** - Matches one or more repetitions (`+`)
+- **`zero_or_more(p)`** - Matches zero or more repetitions (`*`)
+- **`optional(p)`** - Matches zero or one occurrence (`?`)
+- **`repeat(p, min, max)`** - Matches between min and max repetitions (use `-1` for unbounded)
+- **`repeat(p, n)`** - Matches exactly n repetitions
+
+### Lookahead
+
+- **`peek(p)`** - Positive lookahead: succeeds if parser succeeds without consuming input (`&`)
+- **`negate(p)`** - Negative lookahead: succeeds if parser fails without consuming input (`!`)
+
+### Character Classes & Utilities
+
+- **`chars(classes, min, max)`** - Matches repetitions of characters from a character class
+- **`space()`** - Matches zero or more whitespace characters (space, tab, newline)
+- **`until(delimiter)`** - Matches characters until delimiter is found (delimiter not consumed)
+- **`until_one_of(delimiters)`** - Matches characters until any delimiter in the list is found
+- **`rest()`** - Matches everything remaining (`.*`)
+
+### JSON Parsers
+
+- **`json()`** - Complete JSON parser (objects, arrays, strings, numbers, booleans, null)
+- **`json_object()`** - JSON object parser
+- **`json_array()`** - JSON array parser
+- **`json_string()`** - JSON string parser
+- **`json_number()`** - JSON number parser
+- **`json_bool()`** - JSON boolean parser
+- **`json_null()`** - JSON null parser
+- **`json_string_content()`** - JSON string content without surrounding quotes
+- **`json_member(key, p)`** - JSON object member with specific key and value parser
+
+### Grammar Building
+
+- **`ref(name)`** - Creates a lightweight reference to a named rule (for recursive grammars)
+- **`rule(name, p, trigger)`** - Creates a named rule and returns a reference
+- **`trigger_rule(name, p)`** - Creates a trigger rule (entry point for lazy grammar generation)
+- **`schema(p, name, schema, raw)`** - Wraps parser with JSON schema metadata for grammar generation
+
+### AST Control
+
+- **`atomic(p)`** - Prevents AST node creation for partial parses
+- **`tag(tag, p)`** - Creates AST nodes with semantic tags (multiple nodes can share tags)
+
+## GBNF Grammar Generation
+
+The PEG parser also acts as a convenient DSL for generating GBNF grammars, with
+some exceptions.
+
+```cpp
+data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+    foreach_function(params.tools, [&](const json & fn) {
+        builder.resolve_refs(fn.at("parameters"));
+    });
+    parser.build_grammar(builder, data.grammar_lazy);
+});
+```
+
+The notable exception is the `negate(p)` lookahead parser, which cannot be
+expressed in a context-free grammar and therefore does not produce a rule. Its
+usage should be limited and preferably hidden behind a `schema()` parser. In
+many cases, `until(delimiter)` or `until_one_of(delimiters)` is a better choice.
+
+Another limitation is that the PEG parser requires an unambiguous grammar. In
+contrast, the `llama-grammar` implementation can support ambiguous grammars,
+though they are difficult to parse.
+
+### Lazy Grammars
+
+During lazy grammar generation, only rules reachable from a `trigger_rule(name, p)`
+are emitted in the grammar. All trigger rules are added as alternations in the
+root rule. It is still necessary to define trigger patterns, as the parser has
+no interaction with the grammar sampling.
+
+### JSON Schema
+
+The `schema(p, name, schema, raw)` parser will use the `json-schema-to-grammar`
+implementation to generate the grammar instead of the underlying parser.
+
+The `raw` option emits a grammar suitable for a raw string instead of a JSON
+string. In other words, it won't be wrapped in quotes or require escaping
+quotes. It should only be used when `type == "string"`.
+
+The downside is that it can potentially lead to ambiguous grammars. For
+example, if a user provides the pattern `^.*$`, the following grammar may be
+generated:
+
+```
+root ::= "<arg>" .* "</arg>"
+```
+
+This creates an ambiguous grammar that cannot be parsed by the PEG parser. To
+help mitigate this, if `.*` is found in the pattern, the grammar from the
+underlying parser will be emitted instead.
+
+## Common AST Shapes for Chat Parsing
+
+Most model output can be placed in one of the following categories:
+
+- Content only
+- Tool calling with arguments emitted as a single JSON object
+- Tool calling with arguments emitted as separate entities, either XML
+  (Qwen3-Coder, MiniMax M2) or pseudo-function calls (LFM2)
+
+To provide broad coverage,
+[`common/chat-peg-parser.h`](/common/chat-peg-parser.h) contains builders and
+mappers that help create parsers and visitors/extractors for these types. They
+require parsers to tag nodes to conform to an AST "shape". This normalization
+makes it easy to extract information and generalize parsing.
+
+### Simple
+
+The `common_chat_peg_builder` builds a `simple` parser that supports
+content-only models with optional reasoning.
+
+- **`reasoning(p)`** - Tag node for extracting `reasoning_content`
+- **`content(p)`** - Tag node for extracting `content`
+
+```cpp
+build_chat_peg_parser([&](common_chat_peg_builder & p) {
+    return p.sequence({
+        p.optional("<think>" + p.reasoning(p.until("</think>")) + "</think>"),
+        p.content(p.until("<tool_call>")),
+        p.end()
+    });
+});
+```
+
+Use `common_chat_peg_mapper` to extract the content. Note that this is already
+done for you in `common_chat_peg_parser` when
+`chat_format == COMMON_CHAT_FORMAT_PEG_SIMPLE`.
+
+```cpp
+auto result = parser.parse(ctx);
+
+common_chat_msg msg;
+auto mapper = common_chat_peg_mapper(msg);
+mapper.from_ast(ctx.ast, result);
+```
+
+### Native
+
+The `common_chat_peg_native_builder` builds a `native` parser suitable for
+models that emit tool arguments as a direct JSON object.
+
+- **`reasoning(p)`** - Tag node for `reasoning_content`
+- **`content(p)`** - Tag node for `content`
+- **`tool(p)`** - Tag entirety of a single tool call
+- **`tool_open(p)`** - Tag start of a tool call
+- **`tool_close(p)`** - Tag end of a tool call
+- **`tool_id(p)`** - Tag the tool call ID (optional)
+- **`tool_name(p)`** - Tag the tool name
+- **`tool_args(p)`** - Tag the tool arguments
+
+```cpp
+build_chat_peg_native_parser([&](common_chat_peg_native_builder & p) {
+    auto get_weather_tool = p.tool(p.sequence({
+        p.tool_open(p.literal("{")),
+        p.json_member("name", "\"" + p.tool_name(p.literal("get_weather")) + "\""),
+        p.literal(","),
+        p.json_member("arguments", p.tool_args(p.json())),
+        p.tool_close(p.literal("}"))
+    }));
+
+    return p.sequence({
+        p.content(p.until("<tool_call>")),
+        p.literal("<tool_call>"),
+        get_weather_tool,
+        p.literal("</tool_call>"),
+        p.end()
+    });
+});
+```
+
+### Constructed
+
+The `common_chat_peg_constructed_builder` builds a `constructed` parser
+suitable for models that emit tool arguments as separate entities, such as XML
+tags.
+
+- **`reasoning(p)`** - Tag node for `reasoning_content`
+- **`content(p)`** - Tag node for `content`
+- **`tool(p)`** - Tag entirety of a single tool call
+- **`tool_open(p)`** - Tag start of a tool call
+- **`tool_close(p)`** - Tag end of a tool call
+- **`tool_name(p)`** - Tag the tool name
+- **`tool_arg(p)`** - Tag a complete tool argument (name + value)
+- **`tool_arg_open(p)`** - Tag start of a tool argument
+- **`tool_arg_close(p)`** - Tag end of a tool argument
+- **`tool_arg_name(p)`** - Tag the argument name
+- **`tool_arg_string_value(p)`** - Tag string value for the argument
+- **`tool_arg_json_value(p)`** - Tag JSON value for the argument
+
+```cpp
+build_chat_peg_constructed_parser([&](common_chat_peg_constructed_builder & p) {
+    auto location_arg = p.tool_arg(p.sequence({
+        p.tool_arg_open("<parameter name=\"" + p.tool_arg_name(p.literal("location")) + "\">"),
+        p.tool_arg_string_value(p.until("</parameter>")),
+        p.tool_arg_close(p.literal("</parameter>"))
+    }));
+
+    auto get_weather_tool = p.tool(p.sequence({
+        p.tool_open("<function name=\"" + p.tool_name(p.literal("get_weather")) + "\">"),
+        location_arg,
+        p.tool_close(p.literal("</function>"))
+    }));
+
+    return p.sequence({
+        p.content(p.until("<tool_call>")),
+        p.literal("<tool_call>"),
+        get_weather_tool,
+        p.literal("</tool_call>"),
+        p.end()
+    });
+});
+```
--- a/docs/docker.md
+++ b/docs/docker.md
@@ -7,9 +7,9 @@
 ## Images
 We have three Docker images available for this project:

-1. `ghcr.io/ggml-org/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
-2. `ghcr.io/ggml-org/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
-3. `ghcr.io/ggml-org/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
+1. `ghcr.io/ggml-org/llama.cpp:full`: This image includes both the `llama-cli` and `llama-completion` executables and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
+2. `ghcr.io/ggml-org/llama.cpp:light`: This image only includes the `llama-cli` and `llama-completion` executables. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
+3. `ghcr.io/ggml-org/llama.cpp:server`: This image only includes the `llama-server` executable. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)

 Additionally, there the following images, similar to the above:

@@ -44,21 +44,25 @@ docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --all-in-o
 On completion, you are ready to play!

 ```bash
-docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
+docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf
+docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --run-legacy -m /models/32B/ggml-model-q8_0.gguf -no-cnv -p "Building a mobile app can be done in 15 steps:" -n 512
 ```

 or with a light image:

 ```bash
-docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
+docker run -v /path/to/models:/models --entrypoint /app/llama-cli ghcr.io/ggml-org/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf
+docker run -v /path/to/models:/models --entrypoint /app/llama-completion ghcr.io/ggml-org/llama.cpp:light -m /models/32B/ggml-model-q8_0.gguf -no-cnv -p "Building a mobile app can be done in 15 steps:" -n 512
 ```

 or with a server image:

 ```bash
-docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggml-org/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512
+docker run -v /path/to/models:/models -p 8080:8080 ghcr.io/ggml-org/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8080 --host 0.0.0.0 -n 512
 ```

+In the light image example above, `--entrypoint /app/llama-cli` is specified for clarity, but you can safely omit it since it's the default entrypoint of the light image.
+
 ## Docker With CUDA

 Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) properly installed on Linux, or is using a GPU enabled cloud, `cuBLAS` should be accessible inside the container.
@@ -80,9 +84,9 @@ The defaults are:

 The resulting images, are essentially the same as the non-CUDA images:

-1. `local/llama.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
-2. `local/llama.cpp:light-cuda`: This image only includes the main executable file.
-3. `local/llama.cpp:server-cuda`: This image only includes the server executable file.
+1. `local/llama.cpp:full-cuda`: This image includes both the `llama-cli` and `llama-completion` executables and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
+2. `local/llama.cpp:light-cuda`: This image only includes the `llama-cli` and `llama-completion` executables.
+3. `local/llama.cpp:server-cuda`: This image only includes the `llama-server` executable.

 ## Usage

@@ -91,7 +95,7 @@ After building locally, Usage is similar to the non-CUDA examples, but you'll ne
 ```bash
 docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
 docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
-docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
+docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8080 --host 0.0.0.0 -n 512 --n-gpu-layers 1
 ```

 ## Docker With MUSA
@@ -114,9 +118,9 @@ The defaults are:

 The resulting images, are essentially the same as the non-MUSA images:

-1. `local/llama.cpp:full-musa`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
-2. `local/llama.cpp:light-musa`: This image only includes the main executable file.
-3. `local/llama.cpp:server-musa`: This image only includes the server executable file.
+1. `local/llama.cpp:full-musa`: This image includes both the `llama-cli` and `llama-completion` executables and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
+2. `local/llama.cpp:light-musa`: This image only includes the `llama-cli` and `llama-completion` executables.
+3. `local/llama.cpp:server-musa`: This image only includes the `llama-server` executable.

 ## Usage

@@ -125,5 +129,5 @@ After building locally, Usage is similar to the non-MUSA examples, but you'll ne
 ```bash
 docker run -v /path/to/models:/models local/llama.cpp:full-musa --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
 docker run -v /path/to/models:/models local/llama.cpp:light-musa -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
-docker run -v /path/to/models:/models local/llama.cpp:server-musa -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
+docker run -v /path/to/models:/models local/llama.cpp:server-musa -m /models/7B/ggml-model-q4_0.gguf --port 8080 --host 0.0.0.0 -n 512 --n-gpu-layers 1
 ```
--- a/docs/ops.md
+++ b/docs/ops.md
@@ -12,110 +12,112 @@ Legend:
 - 🟡 Partially supported by this backend
 - ❌ Not supported by this backend

-| Operation | BLAS | CANN | CPU | CUDA | Metal | OpenCL | SYCL | Vulkan | zDNN |
-|-----------|------|------|------|------|------|------|------|------|------|
-|                              ABS | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ❌ |
-|                              ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
-|                              ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
-|                             ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
-|                           ADD_ID | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
-|                           ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
-|                           ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
-|                          ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ❌ |
-|                             CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ |
-|                            CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
-|                           CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ❌ |
-|                             CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
-|                          CONV_2D | ❌ | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ | ✅ | ❌ |
-|                       CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
-|                          CONV_3D | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
-|                CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
-|                CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
-|                              COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ |
-|                      COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
-|                              CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
-|               CROSS_ENTROPY_LOSS | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
-|          CROSS_ENTROPY_LOSS_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
-|                           CUMSUM | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
-|                    DIAG_MASK_INF | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
-|                              DIV | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
-|                              DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
-|                              ELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | ❌ | ❌ |
-|                              EXP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ❌ |
-|                            EXPM1 | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ |
-|                             FILL | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ |
-|                   FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ |
-|                            FLOOR | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ |
-|                GATED_LINEAR_ATTN | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
-|                            GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
-|                        GEGLU_ERF | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
-|                      GEGLU_QUICK | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
-|                             GELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
-|                         GELU_ERF | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
-|                       GELU_QUICK | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
-|                         GET_ROWS | ❌ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
-|                    GET_ROWS_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ |
-|                       GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
-|               GROUP_NORM_MUL_ADD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
-|                      HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ❌ |
-|                        HARDSWISH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ❌ |
-|                           IM2COL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ |
-|                        IM2COL_3D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
-|                          L2_NORM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
-|                       LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | 🟡 | ❌ |
-|                              LOG | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | ✅ | ❌ |
-|                             MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
-|                              MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
-|                          MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
-|                       MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ |
-|                              NEG | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ❌ |
-|                             NORM | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
-|                     NORM_MUL_ADD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
-|                   OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
-|                     OPT_STEP_SGD | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
-|                         OUT_PROD | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ |
-|                              PAD | ❌ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ |
-|                   PAD_REFLECT_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ |
-|                          POOL_2D | ❌ | 🟡 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
-|                            REGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
-|                             RELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
-|                           REPEAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ❌ |
-|                      REPEAT_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
-|                         RMS_NORM | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ |
-|                    RMS_NORM_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
-|                 RMS_NORM_MUL_ADD | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
-|                             ROLL | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
-|                             ROPE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
-|                        ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
-|                            ROUND | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ |
-|                        RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
-|                        RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
-|                            SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
-|                              SET | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | 🟡 | ❌ | ❌ |
-|                         SET_ROWS | ❌ | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
-|                              SGN | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | ❌ | ❌ |
-|                          SIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
-|                             SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
-|                        SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
-|                              SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ |
-|                          SOFTCAP | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
-|                         SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | 🟡 | ❌ |
-|                         SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
-|                    SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ |
-|                        SOLVE_TRI | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
-|                              SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ |
-|                             SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ |
-|                         SSM_CONV | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
-|                         SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | ❌ |
-|                             STEP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ❌ |
-|                              SUB | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
-|                              SUM | ❌ | ✅ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ |
-|                         SUM_ROWS | ❌ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ |
-|                           SWIGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
-|                       SWIGLU_OAI | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | 🟡 | ❌ |
-|                             TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | 🟡 | ❌ |
-|               TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
-|                              TRI | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
-|                            TRUNC | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ |
-|                          UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ |
-|                            XIELU | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+| Operation | BLAS | CANN | CPU | CUDA | Metal | OpenCL | SYCL | Vulkan | WebGPU | ZenDNN | zDNN |
+|-----------|------|------|------|------|------|------|------|------|------|------|------|
+|                              ABS | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                              ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
+|                              ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
+|                             ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
+|                           ADD_ID | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
+|                           ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
+|                           ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
+|                          ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ | ❌ |
+|                             CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
+|                            CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ | ❌ |
+|                           CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
+|                             CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ |
+|                          CONV_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ |
+|                       CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
+|                          CONV_3D | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
+|                CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
+|                              COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ |
+|                      COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
+|                              CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
+|               CROSS_ENTROPY_LOSS | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+|          CROSS_ENTROPY_LOSS_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                           CUMSUM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
+|                             DIAG | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                    DIAG_MASK_INF | ❌ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
+|                              DIV | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
+|                              DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
+|                              ELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ |
+|                              EXP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                            EXPM1 | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                             FILL | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
+|                   FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ | ❌ |
+|                            FLOOR | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
+|                GATED_LINEAR_ATTN | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
+|                            GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                        GEGLU_ERF | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                      GEGLU_QUICK | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                             GELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                         GELU_ERF | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                       GELU_QUICK | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                         GET_ROWS | ❌ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
+|                    GET_ROWS_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                       GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
+|               GROUP_NORM_MUL_ADD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                      HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                        HARDSWISH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                           IM2COL | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
+|                        IM2COL_3D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
+|                          L2_NORM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
+|                       LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ |
+|                              LOG | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
+|                             MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
+|                              MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
+|                          MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
+|                       MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ | ❌ |
+|                              NEG | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                             NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ❌ | ❌ | ❌ |
+|                     NORM_MUL_ADD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                   OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
+|                     OPT_STEP_SGD | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
+|                         OUT_PROD | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ | ❌ |
+|                              PAD | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ✅ | ❌ | ❌ | ❌ |
+|                   PAD_REFLECT_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
+|                          POOL_2D | ❌ | 🟡 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
+|                            REGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                             RELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                           REPEAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ❌ | ❌ | ❌ |
+|                      REPEAT_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
+|                         RMS_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
+|                    RMS_NORM_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
+|                 RMS_NORM_MUL_ADD | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                             ROLL | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
+|                             ROPE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
+|                        ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
+|                            ROUND | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
+|                        RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
+|                        RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
+|                            SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
+|                              SET | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ | ❌ |
+|                         SET_ROWS | ❌ | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
+|                              SGN | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ |
+|                          SIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                             SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                        SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
+|                              SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ |
+|                          SOFTCAP | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                         SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
+|                         SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
+|                    SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
+|                        SOLVE_TRI | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
+|                              SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ | ❌ | ❌ |
+|                             SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ | ❌ | ❌ |
+|                         SSM_CONV | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
+|                         SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
+|                             STEP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                              SUB | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
+|                              SUM | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
+|                         SUM_ROWS | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ | ❌ |
+|                           SWIGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                       SWIGLU_OAI | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                             TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|               TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
+|                            TOP_K | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
+|                              TRI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
+|                            TRUNC | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
+|                          UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ | ❌ |
+|                            XIELU | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |
--- a/docs/ops/BLAS.csv
+++ b/docs/ops/BLAS.csv
--- a/docs/ops/CPU.csv
+++ b/docs/ops/CPU.csv
@@ -4964,6 +4964,7 @@
 "CPU","CONV_TRANSPOSE_1D","ne_input=[2,1,1,1],ne_kernel=[3,1,1,1],s0=1,p0=0,d0=1","support","1","yes","CPU"
 "CPU","CONV_TRANSPOSE_2D","ne_input=[3,2,3,1],ne_kernel=[2,2,1,3],stride=1","support","1","yes","CPU"
 "CPU","CONV_TRANSPOSE_2D","ne_input=[10,10,9,1],ne_kernel=[3,3,1,9],stride=2","support","1","yes","CPU"
+"CPU","CONV_TRANSPOSE_2D","ne_input=[129,63,35,1],ne_kernel=[3,3,48,35],stride=1","support","1","yes","CPU"
 "CPU","COUNT_EQUAL","type=f32,ne=[4,500,1,1]","support","1","yes","CPU"
 "CPU","COUNT_EQUAL","type=f32,ne=[4,5000,1,1]","support","1","yes","CPU"
 "CPU","ARGMAX","type=f32,ne=[32,1,1,1]","support","1","yes","CPU"
@@ -5419,17 +5420,45 @@
 "CPU","CPY","type_src=f16,type_dst=f16,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","CPU"
 "CPU","CPY","type_src=f32,type_dst=f32,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","CPU"
 "CPU","CPY","type_src=bf16,type_dst=bf16,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","CPU"
+"CPU","CPY","type_src=i32,type_dst=i32,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","CPU"
+"CPU","CPY","type_src=i32,type_dst=i32,ne=[256,1,4,1],permute_src=[1,2,0,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","CPU"
 "CPU","CPY","type_src=f32,type_dst=f32,ne=[256,1,4,1],permute_src=[1,2,0,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","CPU"
-"CPU","CONT","type=f32,ne=[10,10,10,1]","support","1","yes","CPU"
-"CPU","CONT","type=f32,ne=[2,1,1,1]","support","1","yes","CPU"
-"CPU","CONT","type=f32,ne=[2,1,3,5]","support","1","yes","CPU"
-"CPU","CONT","type=f32,ne=[2,3,5,7]","support","1","yes","CPU"
-"CPU","CONT","type=f16,ne=[2,1,1,1]","support","1","yes","CPU"
-"CPU","CONT","type=f16,ne=[2,1,3,5]","support","1","yes","CPU"
-"CPU","CONT","type=f16,ne=[2,3,5,7]","support","1","yes","CPU"
-"CPU","CONT","type=bf16,ne=[2,1,1,1]","support","1","yes","CPU"
-"CPU","CONT","type=bf16,ne=[2,1,3,5]","support","1","yes","CPU"
-"CPU","CONT","type=bf16,ne=[2,3,5,7]","support","1","yes","CPU"
+"CPU","CONT","type=f32,ne=[2,1,1,1],use_view_slice=1","support","1","yes","CPU"
+"CPU","CONT","type=f32,ne=[2,1,3,5],use_view_slice=1","support","1","yes","CPU"
+"CPU","CONT","type=f32,ne=[2,3,5,7],use_view_slice=1","support","1","yes","CPU"
+"CPU","CONT","type=f32,ne=[1,4,4,1],use_view_slice=1","support","1","yes","CPU"
+"CPU","CONT","type=f32,ne=[1,8,17,1],use_view_slice=1","support","1","yes","CPU"
+"CPU","CONT","type=f32,ne=[10,10,10,1],use_view_slice=1","support","1","yes","CPU"
+"CPU","CONT","type=f32,ne=[2,1,1,1],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=f32,ne=[2,1,3,5],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=f32,ne=[2,3,5,7],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=f32,ne=[1,4,4,1],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=f32,ne=[1,8,17,1],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=f32,ne=[10,10,10,1],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=i32,ne=[2,1,1,1],use_view_slice=1","support","1","yes","CPU"
+"CPU","CONT","type=i32,ne=[2,1,3,5],use_view_slice=1","support","1","yes","CPU"
+"CPU","CONT","type=i32,ne=[2,3,5,7],use_view_slice=1","support","1","yes","CPU"
+"CPU","CONT","type=i32,ne=[1,4,4,1],use_view_slice=1","support","1","yes","CPU"
+"CPU","CONT","type=i32,ne=[1,8,17,1],use_view_slice=1","support","1","yes","CPU"
+"CPU","CONT","type=i32,ne=[10,10,10,1],use_view_slice=1","support","1","yes","CPU"
+"CPU","CONT","type=i32,ne=[2,1,1,1],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=i32,ne=[2,1,3,5],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=i32,ne=[2,3,5,7],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=i32,ne=[1,4,4,1],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=i32,ne=[1,8,17,1],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=i32,ne=[10,10,10,1],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=f16,ne=[2,1,1,1],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=f16,ne=[2,1,3,5],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=f16,ne=[2,3,5,7],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=f16,ne=[1,4,4,1],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=f16,ne=[1,8,17,1],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=f16,ne=[10,10,10,1],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=bf16,ne=[2,1,1,1],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=bf16,ne=[2,1,3,5],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=bf16,ne=[2,3,5,7],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=bf16,ne=[1,4,4,1],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=bf16,ne=[1,8,17,1],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=bf16,ne=[10,10,10,1],use_view_slice=0","support","1","yes","CPU"
 "CPU","ADD","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1","support","1","yes","CPU"
 "CPU","SUB","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1","support","1","yes","CPU"
 "CPU","MUL","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1","support","1","yes","CPU"
@@ -5655,6 +5684,7 @@
 "CPU","MUL","type=f32,ne=[64,262144,1,1],nr=[1,1,1,1],nf=1","support","1","yes","CPU"
 "CPU","DIV","type=f32,ne=[64,262144,1,1],nr=[1,1,1,1],nf=1","support","1","yes","CPU"
 "CPU","ADD1","type=f32,ne=[10,5,4,3]","support","1","yes","CPU"
+"CPU","ADD1","type=f32,ne=[1024,1024,1,1]","support","1","yes","CPU"
 "CPU","SCALE","type=f32,ne=[10,10,10,10],scale=2.000000,bias=0.000000,inplace=0","support","1","yes","CPU"
 "CPU","SCALE","type=f32,ne=[10,10,10,10],scale=2.000000,bias=1.000000,inplace=0","support","1","yes","CPU"
 "CPU","SCALE","type=f32,ne=[10,10,10,10],scale=2.000000,bias=1.000000,inplace=1","support","1","yes","CPU"
@@ -8644,9 +8674,13 @@
 "CPU","CLAMP","type=f16,ne=[7,1,5,3],min=-0.500000,max=0.500000","support","1","yes","CPU"
 "CPU","LEAKY_RELU","type=f16,ne_a=[7,1,5,3],negative_slope=0.100000","support","1","yes","CPU"
 "CPU","FLOOR","type=f16,ne=[7,1,5,3]","support","1","yes","CPU"
+"CPU","FLOOR","type=f16,ne=[1024,1024,1,1]","support","1","yes","CPU"
 "CPU","CEIL","type=f16,ne=[7,1,5,3]","support","1","yes","CPU"
+"CPU","CEIL","type=f16,ne=[1024,1024,1,1]","support","1","yes","CPU"
 "CPU","ROUND","type=f16,ne=[7,1,5,3]","support","1","yes","CPU"
+"CPU","ROUND","type=f16,ne=[1024,1024,1,1]","support","1","yes","CPU"
 "CPU","TRUNC","type=f16,ne=[7,1,5,3]","support","1","yes","CPU"
+"CPU","TRUNC","type=f16,ne=[1024,1024,1,1]","support","1","yes","CPU"
 "CPU","SQR","type=f32,ne=[10,5,4,3]","support","1","yes","CPU"
 "CPU","SQRT","type=f32,ne=[10,3,3,2]","support","1","yes","CPU"
 "CPU","LOG","type=f32,ne=[10,5,4,3]","support","1","yes","CPU"
@@ -8666,9 +8700,13 @@
 "CPU","CLAMP","type=f32,ne=[7,1,5,3],min=-0.500000,max=0.500000","support","1","yes","CPU"
 "CPU","LEAKY_RELU","type=f32,ne_a=[7,1,5,3],negative_slope=0.100000","support","1","yes","CPU"
 "CPU","FLOOR","type=f32,ne=[7,1,5,3]","support","1","yes","CPU"
+"CPU","FLOOR","type=f32,ne=[1024,1024,1,1]","support","1","yes","CPU"
 "CPU","CEIL","type=f32,ne=[7,1,5,3]","support","1","yes","CPU"
+"CPU","CEIL","type=f32,ne=[1024,1024,1,1]","support","1","yes","CPU"
 "CPU","ROUND","type=f32,ne=[7,1,5,3]","support","1","yes","CPU"
+"CPU","ROUND","type=f32,ne=[1024,1024,1,1]","support","1","yes","CPU"
 "CPU","TRUNC","type=f32,ne=[7,1,5,3]","support","1","yes","CPU"
+"CPU","TRUNC","type=f32,ne=[1024,1024,1,1]","support","1","yes","CPU"
 "CPU","DIAG_MASK_INF","type=f32,ne=[10,10,1,1],n_past=5","support","1","yes","CPU"
 "CPU","DIAG_MASK_INF","type=f32,ne=[10,10,3,1],n_past=5","support","1","yes","CPU"
 "CPU","DIAG_MASK_INF","type=f32,ne=[10,10,3,2],n_past=5","support","1","yes","CPU"
@@ -9411,18 +9449,405 @@
 "CPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","1","yes","CPU"
 "CPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","CPU"
 "CPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[3,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[4,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[7,1,1,1],order=0","support","1","yes","CPU"
 "CPU","ARGSORT","type=f32,ne=[8,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[15,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[16,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[31,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[32,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[63,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[64,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[127,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[128,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[255,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[256,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[511,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[512,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[1023,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[1024,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[2047,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[2048,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[4095,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[4096,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[8191,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[8192,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[16383,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[16384,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[32767,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[32768,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[65535,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[65536,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[131071,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[131072,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[262143,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[262144,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[524287,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[524288,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[1048575,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[1048576,1,1,1],order=0","support","1","yes","CPU"
 "CPU","ARGSORT","type=f32,ne=[16,10,10,10],order=0","support","1","yes","CPU"
 "CPU","ARGSORT","type=f32,ne=[60,10,10,10],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[1024,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[16384,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[1023,2,1,3],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[1024,2,1,3],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[1025,2,1,3],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[2047,2,1,3],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[2048,2,1,3],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[2049,2,1,3],order=0","support","1","yes","CPU"
 "CPU","ARGSORT","type=f32,ne=[2,8,8192,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[8,1,1,1],order=1","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[3,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[4,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[7,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[8,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[15,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[16,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[31,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[32,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[63,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[64,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[127,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[128,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[255,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[256,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[511,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[512,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[1023,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[1024,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[2047,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[2048,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[4095,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[4096,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[8191,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[8192,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[16383,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[16384,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[32767,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[32768,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[65535,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[65536,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[131071,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[131072,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[262143,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[262144,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[524287,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[524288,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[1048575,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[1048576,1,1,1],order=0","support","1","yes","CPU"
 "CPU","ARGSORT","type=f32,ne=[16,10,10,10],order=1","support","1","yes","CPU"
 "CPU","ARGSORT","type=f32,ne=[60,10,10,10],order=1","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[1024,1,1,1],order=1","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[16384,1,1,1],order=1","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[1023,2,1,3],order=1","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[1024,2,1,3],order=1","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[1025,2,1,3],order=1","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[2047,2,1,3],order=1","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[2048,2,1,3],order=1","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[2049,2,1,3],order=1","support","1","yes","CPU"
 "CPU","ARGSORT","type=f32,ne=[2,8,8192,1],order=1","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[12,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[13,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[13,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[15,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[15,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[15,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[19,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[19,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[19,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[19,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[27,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[27,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[27,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[27,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[27,1,2,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[43,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[43,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[43,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[43,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[43,1,2,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[64,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[75,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[64,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[75,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[64,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[75,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[64,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[75,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[64,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[75,1,2,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[128,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[139,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[128,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[139,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[128,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[139,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[128,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[139,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[128,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[139,1,2,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[128,1,1,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[139,1,2,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[256,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[267,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[256,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[267,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[256,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[267,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[256,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[267,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[256,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[267,1,2,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[256,1,1,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[267,1,2,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[512,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[523,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[512,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[523,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[512,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[523,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[512,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[523,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[512,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[523,1,2,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[512,1,1,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[523,1,2,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[512,1,1,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[523,1,2,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1024,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1035,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1024,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1035,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1024,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1035,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1024,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1035,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1024,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1035,1,2,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1024,1,1,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1035,1,2,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1024,1,1,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1035,1,2,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1024,1,1,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1035,1,2,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2048,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2059,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2048,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2059,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2048,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2059,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2048,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2059,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2048,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2059,1,2,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2048,1,1,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2059,1,2,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2048,1,1,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2059,1,2,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2048,1,1,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2059,1,2,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4096,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4107,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4096,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4107,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4096,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4107,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4096,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4107,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4096,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4107,1,2,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4096,1,1,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4107,1,2,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4096,1,1,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4107,1,2,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4096,1,1,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4107,1,2,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8192,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8203,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8192,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8203,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8192,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8203,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8192,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8203,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8192,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8203,1,2,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8192,1,1,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8203,1,2,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8192,1,1,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8203,1,2,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8192,1,1,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8203,1,2,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16395,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16395,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16395,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16395,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16395,1,2,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16395,1,2,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16395,1,2,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16395,1,2,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=9999,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16395,1,2,1],k=9999,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32768,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32779,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32768,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32779,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32768,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32779,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32768,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32779,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32768,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32779,1,2,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32768,1,1,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32779,1,2,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32768,1,1,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32779,1,2,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32768,1,1,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32779,1,2,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32768,1,1,1],k=9999,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32779,1,2,1],k=9999,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65536,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65547,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65536,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65547,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65536,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65547,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65536,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65547,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65536,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65547,1,2,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65536,1,1,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65547,1,2,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65536,1,1,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65547,1,2,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65536,1,1,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65547,1,2,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65536,1,1,1],k=9999,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65547,1,2,1],k=9999,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131072,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131083,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131072,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131083,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131072,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131083,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131072,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131083,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131072,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131083,1,2,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131072,1,1,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131083,1,2,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131072,1,1,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131083,1,2,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131072,1,1,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131083,1,2,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131072,1,1,1],k=9999,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131083,1,2,1],k=9999,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262144,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262155,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262144,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262155,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262144,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262155,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262144,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262155,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262144,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262155,1,2,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262144,1,1,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262155,1,2,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262144,1,1,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262155,1,2,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262144,1,1,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262155,1,2,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262144,1,1,1],k=9999,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262155,1,2,1],k=9999,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524288,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524299,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524288,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524299,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524288,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524299,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524288,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524299,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524288,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524299,1,2,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524288,1,1,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524299,1,2,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524288,1,1,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524299,1,2,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524288,1,1,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524299,1,2,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524288,1,1,1],k=9999,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524299,1,2,1],k=9999,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16,10,10,10],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[60,10,10,10],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1023,2,1,3],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1024,2,1,3],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1025,2,1,3],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2047,2,1,3],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2048,2,1,3],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2049,2,1,3],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16,10,10,10],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[60,10,10,10],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1023,2,1,3],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1024,2,1,3],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1025,2,1,3],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2047,2,1,3],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2048,2,1,3],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2049,2,1,3],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16,10,10,10],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[60,10,10,10],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1023,2,1,3],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1024,2,1,3],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1025,2,1,3],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2047,2,1,3],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2048,2,1,3],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2049,2,1,3],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16,10,10,10],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[60,10,10,10],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1023,2,1,3],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1024,2,1,3],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1025,2,1,3],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2047,2,1,3],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2048,2,1,3],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2049,2,1,3],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16,10,10,10],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[60,10,10,10],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1023,2,1,3],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1024,2,1,3],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1025,2,1,3],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2047,2,1,3],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2048,2,1,3],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2049,2,1,3],k=15,ties=0","support","1","yes","CPU"
 "CPU","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=0","support","1","yes","CPU"
 "CPU","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=1","support","1","yes","CPU"
 "CPU","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=nearest,flags=none","support","1","yes","CPU"
@@ -9435,6 +9860,10 @@
 "CPU","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bicubic,transpose=1","support","1","yes","CPU"
 "CPU","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bicubic,flags=none","support","1","yes","CPU"
 "CPU","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bicubic,flags=none","support","1","yes","CPU"
+"CPU","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=513,transpose=0","support","1","yes","CPU"
+"CPU","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=513,transpose=1","support","1","yes","CPU"
+"CPU","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear,flags=none","support","1","yes","CPU"
+"CPU","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bilinear,flags=none","support","1","yes","CPU"
 "CPU","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear,flags=align_corners","support","1","yes","CPU"
 "CPU","UPSCALE","type=f32,ne=[1,4,3,2],ne_tgt=[2,8,3,2],mode=bilinear,flags=align_corners","support","1","yes","CPU"
 "CPU","UPSCALE","type=f32,ne=[4,1,3,2],ne_tgt=[1,1,3,2],mode=bilinear,flags=align_corners","support","1","yes","CPU"
@@ -9463,15 +9892,30 @@
 "CPU","GROUP_NORM","type=f32,ne=[64,64,320,1],num_groups=32,eps=0.000001","support","1","yes","CPU"
 "CPU","GROUP_NORM","type=f32,ne=[9,9,1280,1],num_groups=32,eps=0.000001","support","1","yes","CPU"
 "CPU","ACC","type=f32,ne_a=[256,17,1,1],ne_b=[256,16,1,1]","support","1","yes","CPU"
-"CPU","PAD","type=f32,ne_a=[512,512,1,1],pad_0=1,pad_1=1","support","1","yes","CPU"
-"CPU","PAD","type=f32,ne_a=[512,512,3,1],lp0=1,rp0=1,lp1=1,rp1=1,lp2=1,rp2=1,lp3=1,rp3=1,v=0","support","1","yes","CPU"
+"CPU","PAD","type=f32,ne_a=[512,512,1,1],pad_0=1,pad_1=1,circular=0","support","1","yes","CPU"
+"CPU","PAD","type=f32,ne_a=[33,17,2,1],pad_0=4,pad_1=3,circular=1","support","1","yes","CPU"
+"CPU","PAD","type=f32,ne_a=[512,512,3,1],lp0=1,rp0=1,lp1=1,rp1=1,lp2=1,rp2=1,lp3=1,rp3=1,v=0,circular=0","support","1","yes","CPU"
 "CPU","PAD_REFLECT_1D","type=f32,ne_a=[512,34,2,1],pad_0=10,pad_1=9","support","1","yes","CPU"
 "CPU","PAD_REFLECT_1D","type=f32,ne_a=[3000,384,4,1],pad_0=10,pad_1=9","support","1","yes","CPU"
 "CPU","ROLL","shift0=3,shift1=-2,shift3=1,shift4=-1","support","1","yes","CPU"
 "CPU","ARANGE","type=f32,start=0.000000,stop=10.000000,step=1.000000","support","1","yes","CPU"
+"CPU","ARANGE","type=f32,start=0.000000,stop=1048576.000000,step=1.000000","support","1","yes","CPU"
 "CPU","TIMESTEP_EMBEDDING","type=f32,ne_a=[2,1,1,1],dim=320,max_period=10000","support","1","yes","CPU"
 "CPU","LEAKY_RELU","type=f32,ne_a=[10,5,4,3],negative_slope=0.100000","support","1","yes","CPU"
 "CPU","CUMSUM","type=f32,ne=[10,5,4,3]","support","1","yes","CPU"
+"CPU","CUMSUM","type=f32,ne=[127,5,4,3]","support","1","yes","CPU"
+"CPU","CUMSUM","type=f32,ne=[128,5,4,3]","support","1","yes","CPU"
+"CPU","CUMSUM","type=f32,ne=[128,128,4,4]","support","1","yes","CPU"
+"CPU","CUMSUM","type=f32,ne=[255,5,4,3]","support","1","yes","CPU"
+"CPU","CUMSUM","type=f32,ne=[256,5,4,3]","support","1","yes","CPU"
+"CPU","CUMSUM","type=f32,ne=[511,5,4,3]","support","1","yes","CPU"
+"CPU","CUMSUM","type=f32,ne=[512,5,4,3]","support","1","yes","CPU"
+"CPU","CUMSUM","type=f32,ne=[1023,5,4,3]","support","1","yes","CPU"
+"CPU","CUMSUM","type=f32,ne=[1024,5,4,3]","support","1","yes","CPU"
+"CPU","CUMSUM","type=f32,ne=[2047,5,4,3]","support","1","yes","CPU"
+"CPU","CUMSUM","type=f32,ne=[2048,5,4,3]","support","1","yes","CPU"
+"CPU","CUMSUM","type=f32,ne=[242004,1,1,1]","support","1","yes","CPU"
+"CPU","CUMSUM","type=f32,ne=[375960,1,1,1]","support","1","yes","CPU"
 "CPU","XIELU","type=f32,ne=[10,5,4,3]","support","1","yes","CPU"
 "CPU","TRI","type=f32,ne=[10,10,4,3],tri_type=3","support","1","yes","CPU"
 "CPU","TRI","type=f32,ne=[10,10,4,3],tri_type=2","support","1","yes","CPU"
@@ -9480,6 +9924,10 @@
 "CPU","FILL","type=f32,ne=[10,10,4,3],c=0.000000","support","1","yes","CPU"
 "CPU","FILL","type=f32,ne=[303,207,11,3],c=2.000000","support","1","yes","CPU"
 "CPU","FILL","type=f32,ne=[800,600,4,4],c=-152.000000","support","1","yes","CPU"
+"CPU","FILL","type=f32,ne=[2048,512,2,2],c=3.500000","support","1","yes","CPU"
+"CPU","DIAG","type=f32,ne=[10,1,4,3]","support","1","yes","CPU"
+"CPU","DIAG","type=f32,ne=[79,1,19,13]","support","1","yes","CPU"
+"CPU","DIAG","type=f32,ne=[256,1,8,16]","support","1","yes","CPU"
 "CPU","SOLVE_TRI","type=f32,ne_lhs=[10,10,4,3],ne_rhs=[3,10,4,3]","support","1","yes","CPU"
 "CPU","SOLVE_TRI","type=f32,ne_lhs=[11,11,1,1],ne_rhs=[5,11,1,1]","support","1","yes","CPU"
 "CPU","SOLVE_TRI","type=f32,ne_lhs=[17,17,2,4],ne_rhs=[9,17,2,4]","support","1","yes","CPU"
@@ -9487,10 +9935,16 @@
 "CPU","SOLVE_TRI","type=f32,ne_lhs=[42,42,5,2],ne_rhs=[10,42,5,2]","support","1","yes","CPU"
 "CPU","SOLVE_TRI","type=f32,ne_lhs=[64,64,2,2],ne_rhs=[10,64,2,2]","support","1","yes","CPU"
 "CPU","SOLVE_TRI","type=f32,ne_lhs=[100,100,4,4],ne_rhs=[41,100,4,4]","support","1","yes","CPU"
-"CPU","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0","support","1","yes","CPU"
-"CPU","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0","support","1","yes","CPU"
-"CPU","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1","support","1","yes","CPU"
-"CPU","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1","support","1","yes","CPU"
+"CPU","SOLVE_TRI","type=f32,ne_lhs=[128,128,4,4],ne_rhs=[31,128,4,4]","support","1","yes","CPU"
+"CPU","SOLVE_TRI","type=f32,ne_lhs=[64,64,4,4],ne_rhs=[300,64,4,4]","support","1","yes","CPU"
+"CPU","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0,circular=0","support","1","yes","CPU"
+"CPU","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0,circular=0","support","1","yes","CPU"
+"CPU","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0,circular=1","support","1","yes","CPU"
+"CPU","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0,circular=1","support","1","yes","CPU"
+"CPU","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1,circular=0","support","1","yes","CPU"
+"CPU","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1,circular=0","support","1","yes","CPU"
+"CPU","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1,circular=1","support","1","yes","CPU"
+"CPU","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1,circular=1","support","1","yes","CPU"
 "CPU","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","CPU"
 "CPU","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","CPU"
 "CPU","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","1","yes","CPU"
--- a/docs/ops/CUDA.csv
+++ b/docs/ops/CUDA.csv
@@ -4964,6 +4964,7 @@
 "CUDA0","CONV_TRANSPOSE_1D","ne_input=[2,1,1,1],ne_kernel=[3,1,1,1],s0=1,p0=0,d0=1","support","1","yes","CUDA"
 "CUDA0","CONV_TRANSPOSE_2D","ne_input=[3,2,3,1],ne_kernel=[2,2,1,3],stride=1","support","1","yes","CUDA"
 "CUDA0","CONV_TRANSPOSE_2D","ne_input=[10,10,9,1],ne_kernel=[3,3,1,9],stride=2","support","1","yes","CUDA"
+"CUDA0","CONV_TRANSPOSE_2D","ne_input=[129,63,35,1],ne_kernel=[3,3,48,35],stride=1","support","1","yes","CUDA"
 "CUDA0","COUNT_EQUAL","type=f32,ne=[4,500,1,1]","support","1","yes","CUDA"
 "CUDA0","COUNT_EQUAL","type=f32,ne=[4,5000,1,1]","support","1","yes","CUDA"
 "CUDA0","ARGMAX","type=f32,ne=[32,1,1,1]","support","1","yes","CUDA"
@@ -5419,17 +5420,45 @@
 "CUDA0","CPY","type_src=f16,type_dst=f16,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","CUDA"
 "CUDA0","CPY","type_src=f32,type_dst=f32,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","CUDA"
 "CUDA0","CPY","type_src=bf16,type_dst=bf16,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","CUDA"
+"CUDA0","CPY","type_src=i32,type_dst=i32,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","CUDA"
+"CUDA0","CPY","type_src=i32,type_dst=i32,ne=[256,1,4,1],permute_src=[1,2,0,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","CUDA"
 "CUDA0","CPY","type_src=f32,type_dst=f32,ne=[256,1,4,1],permute_src=[1,2,0,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=f32,ne=[10,10,10,1]","support","1","yes","CUDA"
-"CUDA0","CONT","type=f32,ne=[2,1,1,1]","support","1","yes","CUDA"
-"CUDA0","CONT","type=f32,ne=[2,1,3,5]","support","1","yes","CUDA"
-"CUDA0","CONT","type=f32,ne=[2,3,5,7]","support","1","yes","CUDA"
-"CUDA0","CONT","type=f16,ne=[2,1,1,1]","support","1","yes","CUDA"
-"CUDA0","CONT","type=f16,ne=[2,1,3,5]","support","1","yes","CUDA"
-"CUDA0","CONT","type=f16,ne=[2,3,5,7]","support","1","yes","CUDA"
-"CUDA0","CONT","type=bf16,ne=[2,1,1,1]","support","1","yes","CUDA"
-"CUDA0","CONT","type=bf16,ne=[2,1,3,5]","support","1","yes","CUDA"
-"CUDA0","CONT","type=bf16,ne=[2,3,5,7]","support","1","yes","CUDA"
+"CUDA0","CONT","type=f32,ne=[2,1,1,1],use_view_slice=1","support","1","yes","CUDA"
+"CUDA0","CONT","type=f32,ne=[2,1,3,5],use_view_slice=1","support","1","yes","CUDA"
+"CUDA0","CONT","type=f32,ne=[2,3,5,7],use_view_slice=1","support","1","yes","CUDA"
+"CUDA0","CONT","type=f32,ne=[1,4,4,1],use_view_slice=1","support","1","yes","CUDA"
+"CUDA0","CONT","type=f32,ne=[1,8,17,1],use_view_slice=1","support","1","yes","CUDA"
+"CUDA0","CONT","type=f32,ne=[10,10,10,1],use_view_slice=1","support","1","yes","CUDA"
+"CUDA0","CONT","type=f32,ne=[2,1,1,1],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=f32,ne=[2,1,3,5],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=f32,ne=[2,3,5,7],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=f32,ne=[1,4,4,1],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=f32,ne=[1,8,17,1],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=f32,ne=[10,10,10,1],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=i32,ne=[2,1,1,1],use_view_slice=1","support","1","yes","CUDA"
+"CUDA0","CONT","type=i32,ne=[2,1,3,5],use_view_slice=1","support","1","yes","CUDA"
+"CUDA0","CONT","type=i32,ne=[2,3,5,7],use_view_slice=1","support","1","yes","CUDA"
+"CUDA0","CONT","type=i32,ne=[1,4,4,1],use_view_slice=1","support","1","yes","CUDA"
+"CUDA0","CONT","type=i32,ne=[1,8,17,1],use_view_slice=1","support","1","yes","CUDA"
+"CUDA0","CONT","type=i32,ne=[10,10,10,1],use_view_slice=1","support","1","yes","CUDA"
+"CUDA0","CONT","type=i32,ne=[2,1,1,1],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=i32,ne=[2,1,3,5],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=i32,ne=[2,3,5,7],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=i32,ne=[1,4,4,1],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=i32,ne=[1,8,17,1],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=i32,ne=[10,10,10,1],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=f16,ne=[2,1,1,1],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=f16,ne=[2,1,3,5],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=f16,ne=[2,3,5,7],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=f16,ne=[1,4,4,1],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=f16,ne=[1,8,17,1],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=f16,ne=[10,10,10,1],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=bf16,ne=[2,1,1,1],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=bf16,ne=[2,1,3,5],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=bf16,ne=[2,3,5,7],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=bf16,ne=[1,4,4,1],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=bf16,ne=[1,8,17,1],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=bf16,ne=[10,10,10,1],use_view_slice=0","support","1","yes","CUDA"
 "CUDA0","ADD","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1","support","1","yes","CUDA"
 "CUDA0","SUB","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1","support","1","yes","CUDA"
 "CUDA0","MUL","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1","support","1","yes","CUDA"
@@ -5655,6 +5684,7 @@
 "CUDA0","MUL","type=f32,ne=[64,262144,1,1],nr=[1,1,1,1],nf=1","support","1","yes","CUDA"
 "CUDA0","DIV","type=f32,ne=[64,262144,1,1],nr=[1,1,1,1],nf=1","support","1","yes","CUDA"
 "CUDA0","ADD1","type=f32,ne=[10,5,4,3]","support","1","yes","CUDA"
+"CUDA0","ADD1","type=f32,ne=[1024,1024,1,1]","support","1","yes","CUDA"
 "CUDA0","SCALE","type=f32,ne=[10,10,10,10],scale=2.000000,bias=0.000000,inplace=0","support","1","yes","CUDA"
 "CUDA0","SCALE","type=f32,ne=[10,10,10,10],scale=2.000000,bias=1.000000,inplace=0","support","1","yes","CUDA"
 "CUDA0","SCALE","type=f32,ne=[10,10,10,10],scale=2.000000,bias=1.000000,inplace=1","support","1","yes","CUDA"
@@ -8644,9 +8674,13 @@
 "CUDA0","CLAMP","type=f16,ne=[7,1,5,3],min=-0.500000,max=0.500000","support","1","yes","CUDA"
 "CUDA0","LEAKY_RELU","type=f16,ne_a=[7,1,5,3],negative_slope=0.100000","support","1","yes","CUDA"
 "CUDA0","FLOOR","type=f16,ne=[7,1,5,3]","support","1","yes","CUDA"
+"CUDA0","FLOOR","type=f16,ne=[1024,1024,1,1]","support","1","yes","CUDA"
 "CUDA0","CEIL","type=f16,ne=[7,1,5,3]","support","1","yes","CUDA"
+"CUDA0","CEIL","type=f16,ne=[1024,1024,1,1]","support","1","yes","CUDA"
 "CUDA0","ROUND","type=f16,ne=[7,1,5,3]","support","1","yes","CUDA"
+"CUDA0","ROUND","type=f16,ne=[1024,1024,1,1]","support","1","yes","CUDA"
 "CUDA0","TRUNC","type=f16,ne=[7,1,5,3]","support","1","yes","CUDA"
+"CUDA0","TRUNC","type=f16,ne=[1024,1024,1,1]","support","1","yes","CUDA"
 "CUDA0","SQR","type=f32,ne=[10,5,4,3]","support","1","yes","CUDA"
 "CUDA0","SQRT","type=f32,ne=[10,3,3,2]","support","1","yes","CUDA"
 "CUDA0","LOG","type=f32,ne=[10,5,4,3]","support","1","yes","CUDA"
@@ -8666,9 +8700,13 @@
 "CUDA0","CLAMP","type=f32,ne=[7,1,5,3],min=-0.500000,max=0.500000","support","1","yes","CUDA"
 "CUDA0","LEAKY_RELU","type=f32,ne_a=[7,1,5,3],negative_slope=0.100000","support","1","yes","CUDA"
 "CUDA0","FLOOR","type=f32,ne=[7,1,5,3]","support","1","yes","CUDA"
+"CUDA0","FLOOR","type=f32,ne=[1024,1024,1,1]","support","1","yes","CUDA"
 "CUDA0","CEIL","type=f32,ne=[7,1,5,3]","support","1","yes","CUDA"
+"CUDA0","CEIL","type=f32,ne=[1024,1024,1,1]","support","1","yes","CUDA"
 "CUDA0","ROUND","type=f32,ne=[7,1,5,3]","support","1","yes","CUDA"
+"CUDA0","ROUND","type=f32,ne=[1024,1024,1,1]","support","1","yes","CUDA"
 "CUDA0","TRUNC","type=f32,ne=[7,1,5,3]","support","1","yes","CUDA"
+"CUDA0","TRUNC","type=f32,ne=[1024,1024,1,1]","support","1","yes","CUDA"
 "CUDA0","DIAG_MASK_INF","type=f32,ne=[10,10,1,1],n_past=5","support","1","yes","CUDA"
 "CUDA0","DIAG_MASK_INF","type=f32,ne=[10,10,3,1],n_past=5","support","1","yes","CUDA"
 "CUDA0","DIAG_MASK_INF","type=f32,ne=[10,10,3,2],n_past=5","support","1","yes","CUDA"
@@ -9411,18 +9449,405 @@
 "CUDA0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","0","no","CUDA"
 "CUDA0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","CUDA"
 "CUDA0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","0","no","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[3,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[4,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[7,1,1,1],order=0","support","1","yes","CUDA"
 "CUDA0","ARGSORT","type=f32,ne=[8,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[15,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[16,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[31,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[32,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[63,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[64,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[127,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[128,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[255,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[256,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[511,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[512,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[1023,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[1024,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[2047,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[2048,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[4095,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[4096,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[8191,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[8192,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[16383,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[16384,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[32767,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[32768,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[65535,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[65536,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[131071,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[131072,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[262143,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[262144,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[524287,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[524288,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[1048575,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[1048576,1,1,1],order=0","support","1","yes","CUDA"
 "CUDA0","ARGSORT","type=f32,ne=[16,10,10,10],order=0","support","1","yes","CUDA"
 "CUDA0","ARGSORT","type=f32,ne=[60,10,10,10],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[1024,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[16384,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[1023,2,1,3],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[1024,2,1,3],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[1025,2,1,3],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[2047,2,1,3],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[2048,2,1,3],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[2049,2,1,3],order=0","support","1","yes","CUDA"
 "CUDA0","ARGSORT","type=f32,ne=[2,8,8192,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[8,1,1,1],order=1","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[3,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[4,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[7,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[8,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[15,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[16,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[31,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[32,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[63,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[64,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[127,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[128,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[255,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[256,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[511,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[512,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[1023,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[1024,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[2047,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[2048,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[4095,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[4096,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[8191,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[8192,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[16383,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[16384,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[32767,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[32768,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[65535,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[65536,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[131071,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[131072,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[262143,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[262144,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[524287,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[524288,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[1048575,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[1048576,1,1,1],order=0","support","1","yes","CUDA"
 "CUDA0","ARGSORT","type=f32,ne=[16,10,10,10],order=1","support","1","yes","CUDA"
 "CUDA0","ARGSORT","type=f32,ne=[60,10,10,10],order=1","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[1024,1,1,1],order=1","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[16384,1,1,1],order=1","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[1023,2,1,3],order=1","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[1024,2,1,3],order=1","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[1025,2,1,3],order=1","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[2047,2,1,3],order=1","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[2048,2,1,3],order=1","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[2049,2,1,3],order=1","support","1","yes","CUDA"
 "CUDA0","ARGSORT","type=f32,ne=[2,8,8192,1],order=1","support","1","yes","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[12,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[13,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[13,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[15,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[15,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[15,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[19,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[19,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[19,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[19,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[27,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[27,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[27,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[27,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[27,1,2,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[43,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[43,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[43,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[43,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[43,1,2,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[64,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[75,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[64,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[75,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[64,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[75,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[64,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[75,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[64,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[75,1,2,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[128,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[139,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[128,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[139,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[128,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[139,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[128,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[139,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[128,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[139,1,2,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[128,1,1,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[139,1,2,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[256,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[267,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[256,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[267,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[256,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[267,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[256,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[267,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[256,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[267,1,2,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[256,1,1,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[267,1,2,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[512,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[523,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[512,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[523,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[512,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[523,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[512,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[523,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[512,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[523,1,2,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[512,1,1,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[523,1,2,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[512,1,1,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[523,1,2,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1024,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1035,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1024,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1035,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1024,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1035,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1024,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1035,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1024,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1035,1,2,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1024,1,1,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1035,1,2,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1024,1,1,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1035,1,2,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1024,1,1,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1035,1,2,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2048,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2059,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2048,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2059,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2048,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2059,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2048,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2059,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2048,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2059,1,2,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2048,1,1,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2059,1,2,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2048,1,1,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2059,1,2,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2048,1,1,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2059,1,2,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4096,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4107,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4096,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4107,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4096,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4107,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4096,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4107,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4096,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4107,1,2,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4096,1,1,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4107,1,2,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4096,1,1,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4107,1,2,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4096,1,1,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4107,1,2,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8192,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8203,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8192,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8203,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8192,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8203,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8192,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8203,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8192,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8203,1,2,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8192,1,1,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8203,1,2,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8192,1,1,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8203,1,2,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8192,1,1,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8203,1,2,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16395,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16395,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16395,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16395,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16395,1,2,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16395,1,2,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16395,1,2,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16395,1,2,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=9999,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16395,1,2,1],k=9999,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32768,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32779,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32768,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32779,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32768,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32779,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32768,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32779,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32768,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32779,1,2,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32768,1,1,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32779,1,2,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32768,1,1,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32779,1,2,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32768,1,1,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32779,1,2,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32768,1,1,1],k=9999,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32779,1,2,1],k=9999,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65536,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65547,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65536,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65547,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65536,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65547,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65536,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65547,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65536,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65547,1,2,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65536,1,1,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65547,1,2,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65536,1,1,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65547,1,2,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65536,1,1,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65547,1,2,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65536,1,1,1],k=9999,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65547,1,2,1],k=9999,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131072,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131083,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131072,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131083,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131072,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131083,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131072,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131083,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131072,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131083,1,2,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131072,1,1,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131083,1,2,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131072,1,1,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131083,1,2,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131072,1,1,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131083,1,2,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131072,1,1,1],k=9999,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131083,1,2,1],k=9999,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262144,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262155,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262144,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262155,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262144,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262155,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262144,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262155,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262144,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262155,1,2,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262144,1,1,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262155,1,2,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262144,1,1,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262155,1,2,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262144,1,1,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262155,1,2,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262144,1,1,1],k=9999,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262155,1,2,1],k=9999,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524288,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524299,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524288,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524299,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524288,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524299,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524288,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524299,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524288,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524299,1,2,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524288,1,1,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524299,1,2,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524288,1,1,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524299,1,2,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524288,1,1,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524299,1,2,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524288,1,1,1],k=9999,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524299,1,2,1],k=9999,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16,10,10,10],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[60,10,10,10],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1023,2,1,3],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1024,2,1,3],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1025,2,1,3],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2047,2,1,3],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2048,2,1,3],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2049,2,1,3],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16,10,10,10],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[60,10,10,10],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1023,2,1,3],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1024,2,1,3],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1025,2,1,3],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2047,2,1,3],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2048,2,1,3],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2049,2,1,3],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16,10,10,10],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[60,10,10,10],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1023,2,1,3],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1024,2,1,3],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1025,2,1,3],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2047,2,1,3],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2048,2,1,3],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2049,2,1,3],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16,10,10,10],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[60,10,10,10],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1023,2,1,3],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1024,2,1,3],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1025,2,1,3],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2047,2,1,3],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2048,2,1,3],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2049,2,1,3],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16,10,10,10],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[60,10,10,10],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1023,2,1,3],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1024,2,1,3],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1025,2,1,3],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2047,2,1,3],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2048,2,1,3],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2049,2,1,3],k=15,ties=0","support","0","no","CUDA"
 "CUDA0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=0","support","1","yes","CUDA"
 "CUDA0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=1","support","1","yes","CUDA"
 "CUDA0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=nearest,flags=none","support","1","yes","CUDA"
@@ -9435,6 +9860,10 @@
 "CUDA0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bicubic,transpose=1","support","1","yes","CUDA"
 "CUDA0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bicubic,flags=none","support","1","yes","CUDA"
 "CUDA0","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bicubic,flags=none","support","1","yes","CUDA"
+"CUDA0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=513,transpose=0","support","1","yes","CUDA"
+"CUDA0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=513,transpose=1","support","1","yes","CUDA"
+"CUDA0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear,flags=none","support","1","yes","CUDA"
+"CUDA0","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bilinear,flags=none","support","1","yes","CUDA"
 "CUDA0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear,flags=align_corners","support","1","yes","CUDA"
 "CUDA0","UPSCALE","type=f32,ne=[1,4,3,2],ne_tgt=[2,8,3,2],mode=bilinear,flags=align_corners","support","1","yes","CUDA"
 "CUDA0","UPSCALE","type=f32,ne=[4,1,3,2],ne_tgt=[1,1,3,2],mode=bilinear,flags=align_corners","support","1","yes","CUDA"
@@ -9463,34 +9892,59 @@
 "CUDA0","GROUP_NORM","type=f32,ne=[64,64,320,1],num_groups=32,eps=0.000001","support","1","yes","CUDA"
 "CUDA0","GROUP_NORM","type=f32,ne=[9,9,1280,1],num_groups=32,eps=0.000001","support","1","yes","CUDA"
 "CUDA0","ACC","type=f32,ne_a=[256,17,1,1],ne_b=[256,16,1,1]","support","1","yes","CUDA"
-"CUDA0","PAD","type=f32,ne_a=[512,512,1,1],pad_0=1,pad_1=1","support","1","yes","CUDA"
-"CUDA0","PAD","type=f32,ne_a=[512,512,3,1],lp0=1,rp0=1,lp1=1,rp1=1,lp2=1,rp2=1,lp3=1,rp3=1,v=0","support","1","yes","CUDA"
+"CUDA0","PAD","type=f32,ne_a=[512,512,1,1],pad_0=1,pad_1=1,circular=0","support","1","yes","CUDA"
+"CUDA0","PAD","type=f32,ne_a=[33,17,2,1],pad_0=4,pad_1=3,circular=1","support","1","yes","CUDA"
+"CUDA0","PAD","type=f32,ne_a=[512,512,3,1],lp0=1,rp0=1,lp1=1,rp1=1,lp2=1,rp2=1,lp3=1,rp3=1,v=0,circular=0","support","1","yes","CUDA"
 "CUDA0","PAD_REFLECT_1D","type=f32,ne_a=[512,34,2,1],pad_0=10,pad_1=9","support","1","yes","CUDA"
 "CUDA0","PAD_REFLECT_1D","type=f32,ne_a=[3000,384,4,1],pad_0=10,pad_1=9","support","1","yes","CUDA"
 "CUDA0","ROLL","shift0=3,shift1=-2,shift3=1,shift4=-1","support","1","yes","CUDA"
 "CUDA0","ARANGE","type=f32,start=0.000000,stop=10.000000,step=1.000000","support","1","yes","CUDA"
+"CUDA0","ARANGE","type=f32,start=0.000000,stop=1048576.000000,step=1.000000","support","1","yes","CUDA"
 "CUDA0","TIMESTEP_EMBEDDING","type=f32,ne_a=[2,1,1,1],dim=320,max_period=10000","support","1","yes","CUDA"
 "CUDA0","LEAKY_RELU","type=f32,ne_a=[10,5,4,3],negative_slope=0.100000","support","1","yes","CUDA"
-"CUDA0","CUMSUM","type=f32,ne=[10,5,4,3]","support","0","no","CUDA"
+"CUDA0","CUMSUM","type=f32,ne=[10,5,4,3]","support","1","yes","CUDA"
+"CUDA0","CUMSUM","type=f32,ne=[127,5,4,3]","support","1","yes","CUDA"
+"CUDA0","CUMSUM","type=f32,ne=[128,5,4,3]","support","1","yes","CUDA"
+"CUDA0","CUMSUM","type=f32,ne=[128,128,4,4]","support","1","yes","CUDA"
+"CUDA0","CUMSUM","type=f32,ne=[255,5,4,3]","support","1","yes","CUDA"
+"CUDA0","CUMSUM","type=f32,ne=[256,5,4,3]","support","1","yes","CUDA"
+"CUDA0","CUMSUM","type=f32,ne=[511,5,4,3]","support","1","yes","CUDA"
+"CUDA0","CUMSUM","type=f32,ne=[512,5,4,3]","support","1","yes","CUDA"
+"CUDA0","CUMSUM","type=f32,ne=[1023,5,4,3]","support","1","yes","CUDA"
+"CUDA0","CUMSUM","type=f32,ne=[1024,5,4,3]","support","1","yes","CUDA"
+"CUDA0","CUMSUM","type=f32,ne=[2047,5,4,3]","support","1","yes","CUDA"
+"CUDA0","CUMSUM","type=f32,ne=[2048,5,4,3]","support","1","yes","CUDA"
+"CUDA0","CUMSUM","type=f32,ne=[242004,1,1,1]","support","1","yes","CUDA"
+"CUDA0","CUMSUM","type=f32,ne=[375960,1,1,1]","support","1","yes","CUDA"
 "CUDA0","XIELU","type=f32,ne=[10,5,4,3]","support","0","no","CUDA"
-"CUDA0","TRI","type=f32,ne=[10,10,4,3],tri_type=3","support","0","no","CUDA"
-"CUDA0","TRI","type=f32,ne=[10,10,4,3],tri_type=2","support","0","no","CUDA"
-"CUDA0","TRI","type=f32,ne=[10,10,4,3],tri_type=1","support","0","no","CUDA"
-"CUDA0","TRI","type=f32,ne=[10,10,4,3],tri_type=0","support","0","no","CUDA"
-"CUDA0","FILL","type=f32,ne=[10,10,4,3],c=0.000000","support","0","no","CUDA"
-"CUDA0","FILL","type=f32,ne=[303,207,11,3],c=2.000000","support","0","no","CUDA"
-"CUDA0","FILL","type=f32,ne=[800,600,4,4],c=-152.000000","support","0","no","CUDA"
-"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[10,10,4,3],ne_rhs=[3,10,4,3]","support","0","no","CUDA"
-"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[11,11,1,1],ne_rhs=[5,11,1,1]","support","0","no","CUDA"
-"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[17,17,2,4],ne_rhs=[9,17,2,4]","support","0","no","CUDA"
-"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[30,30,7,1],ne_rhs=[8,30,7,1]","support","0","no","CUDA"
-"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[42,42,5,2],ne_rhs=[10,42,5,2]","support","0","no","CUDA"
-"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[64,64,2,2],ne_rhs=[10,64,2,2]","support","0","no","CUDA"
+"CUDA0","TRI","type=f32,ne=[10,10,4,3],tri_type=3","support","1","yes","CUDA"
+"CUDA0","TRI","type=f32,ne=[10,10,4,3],tri_type=2","support","1","yes","CUDA"
+"CUDA0","TRI","type=f32,ne=[10,10,4,3],tri_type=1","support","1","yes","CUDA"
+"CUDA0","TRI","type=f32,ne=[10,10,4,3],tri_type=0","support","1","yes","CUDA"
+"CUDA0","FILL","type=f32,ne=[10,10,4,3],c=0.000000","support","1","yes","CUDA"
+"CUDA0","FILL","type=f32,ne=[303,207,11,3],c=2.000000","support","1","yes","CUDA"
+"CUDA0","FILL","type=f32,ne=[800,600,4,4],c=-152.000000","support","1","yes","CUDA"
+"CUDA0","FILL","type=f32,ne=[2048,512,2,2],c=3.500000","support","1","yes","CUDA"
+"CUDA0","DIAG","type=f32,ne=[10,1,4,3]","support","1","yes","CUDA"
+"CUDA0","DIAG","type=f32,ne=[79,1,19,13]","support","1","yes","CUDA"
+"CUDA0","DIAG","type=f32,ne=[256,1,8,16]","support","1","yes","CUDA"
+"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[10,10,4,3],ne_rhs=[3,10,4,3]","support","1","yes","CUDA"
+"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[11,11,1,1],ne_rhs=[5,11,1,1]","support","1","yes","CUDA"
+"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[17,17,2,4],ne_rhs=[9,17,2,4]","support","1","yes","CUDA"
+"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[30,30,7,1],ne_rhs=[8,30,7,1]","support","1","yes","CUDA"
+"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[42,42,5,2],ne_rhs=[10,42,5,2]","support","1","yes","CUDA"
+"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[64,64,2,2],ne_rhs=[10,64,2,2]","support","1","yes","CUDA"
 "CUDA0","SOLVE_TRI","type=f32,ne_lhs=[100,100,4,4],ne_rhs=[41,100,4,4]","support","0","no","CUDA"
-"CUDA0","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0","support","1","yes","CUDA"
-"CUDA0","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0","support","1","yes","CUDA"
-"CUDA0","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1","support","0","no","CUDA"
-"CUDA0","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1","support","0","no","CUDA"
+"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[128,128,4,4],ne_rhs=[31,128,4,4]","support","0","no","CUDA"
+"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[64,64,4,4],ne_rhs=[300,64,4,4]","support","0","no","CUDA"
+"CUDA0","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0,circular=0","support","1","yes","CUDA"
+"CUDA0","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0,circular=0","support","1","yes","CUDA"
+"CUDA0","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0,circular=1","support","1","yes","CUDA"
+"CUDA0","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0,circular=1","support","1","yes","CUDA"
+"CUDA0","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1,circular=0","support","0","no","CUDA"
+"CUDA0","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1,circular=0","support","0","no","CUDA"
+"CUDA0","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1,circular=1","support","0","no","CUDA"
+"CUDA0","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1,circular=1","support","0","no","CUDA"
 "CUDA0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","CUDA"
 "CUDA0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","CUDA"
 "CUDA0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","CUDA"
--- a/docs/ops/Metal.csv
+++ b/docs/ops/Metal.csv
--- a/docs/ops/OpenCL.csv
+++ b/docs/ops/OpenCL.csv
--- a/docs/ops/SYCL.csv
+++ b/docs/ops/SYCL.csv
--- a/docs/ops/Vulkan.csv
+++ b/docs/ops/Vulkan.csv
@@ -5005,8 +5005,8 @@
 "Vulkan0","DUP","type=f16,ne=[10,10,5,1],permute=[0,2,1,3]","support","1","yes","Vulkan"
 "Vulkan0","DUP","type=f32,ne=[10,10,5,1],permute=[1,0,2,3]","support","1","yes","Vulkan"
 "Vulkan0","DUP","type=f16,ne=[10,10,5,1],permute=[1,0,2,3]","support","1","yes","Vulkan"
-"Vulkan0","DUP","type=i16,ne=[10,8,3,1],permute=[0,2,1,3]","support","0","no","Vulkan"
-"Vulkan0","DUP","type=i16,ne=[10,8,3,1],permute=[1,2,0,3]","support","0","no","Vulkan"
+"Vulkan0","DUP","type=i16,ne=[10,8,3,1],permute=[0,2,1,3]","support","1","yes","Vulkan"
+"Vulkan0","DUP","type=i16,ne=[10,8,3,1],permute=[1,2,0,3]","support","1","yes","Vulkan"
 "Vulkan0","SET","type_src=f32,type_dst=f32,ne=[6,5,4,3],dim=1","support","0","no","Vulkan"
 "Vulkan0","SET","type_src=f32,type_dst=f32,ne=[6,5,4,3],dim=2","support","0","no","Vulkan"
 "Vulkan0","SET","type_src=f32,type_dst=f32,ne=[6,5,4,3],dim=3","support","0","no","Vulkan"
@@ -5032,14 +5032,14 @@
 "Vulkan0","CPY","type_src=f16,type_dst=f16,ne=[3,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan"
 "Vulkan0","CPY","type_src=f16,type_dst=f16,ne=[3,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","Vulkan"
 "Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[1,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan"
-"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[1,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan"
-"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[1,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan"
+"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[1,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan"
+"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[1,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","Vulkan"
 "Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[2,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan"
-"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[2,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan"
-"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[2,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan"
+"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[2,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan"
+"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[2,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","Vulkan"
 "Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[3,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan"
-"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[3,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan"
-"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[3,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan"
+"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[3,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan"
+"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[3,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","Vulkan"
 "Vulkan0","CPY","type_src=q4_0,type_dst=q4_0,ne=[32,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan"
 "Vulkan0","CPY","type_src=q4_0,type_dst=q4_0,ne=[32,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan"
 "Vulkan0","CPY","type_src=q4_0,type_dst=q4_0,ne=[32,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan"
@@ -5271,7 +5271,7 @@
 "Vulkan0","CPY","type_src=bf16,type_dst=f16,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan"
 "Vulkan0","CPY","type_src=bf16,type_dst=f16,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan"
 "Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan"
-"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan"
+"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan"
 "Vulkan0","CPY","type_src=bf16,type_dst=q4_0,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan"
 "Vulkan0","CPY","type_src=bf16,type_dst=q4_0,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan"
 "Vulkan0","CPY","type_src=bf16,type_dst=q4_1,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan"
@@ -5415,21 +5415,49 @@
 "Vulkan0","CPY","type_src=f16,type_dst=f16,ne=[256,4,3,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","Vulkan"
 "Vulkan0","CPY","type_src=f32,type_dst=f32,ne=[256,4,3,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","Vulkan"
 "Vulkan0","CPY","type_src=f32,type_dst=f32,ne=[256,4,3,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","Vulkan"
-"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[256,4,3,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","0","no","Vulkan"
+"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[256,4,3,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","Vulkan"
 "Vulkan0","CPY","type_src=f16,type_dst=f16,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","Vulkan"
 "Vulkan0","CPY","type_src=f32,type_dst=f32,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","Vulkan"
-"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","0","no","Vulkan"
+"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","Vulkan"
+"Vulkan0","CPY","type_src=i32,type_dst=i32,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","Vulkan"
+"Vulkan0","CPY","type_src=i32,type_dst=i32,ne=[256,1,4,1],permute_src=[1,2,0,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan"
 "Vulkan0","CPY","type_src=f32,type_dst=f32,ne=[256,1,4,1],permute_src=[1,2,0,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan"
-"Vulkan0","CONT","type=f32,ne=[10,10,10,1]","support","1","yes","Vulkan"
-"Vulkan0","CONT","type=f32,ne=[2,1,1,1]","support","1","yes","Vulkan"
-"Vulkan0","CONT","type=f32,ne=[2,1,3,5]","support","1","yes","Vulkan"
-"Vulkan0","CONT","type=f32,ne=[2,3,5,7]","support","1","yes","Vulkan"
-"Vulkan0","CONT","type=f16,ne=[2,1,1,1]","support","1","yes","Vulkan"
-"Vulkan0","CONT","type=f16,ne=[2,1,3,5]","support","1","yes","Vulkan"
-"Vulkan0","CONT","type=f16,ne=[2,3,5,7]","support","1","yes","Vulkan"
-"Vulkan0","CONT","type=bf16,ne=[2,1,1,1]","support","1","yes","Vulkan"
-"Vulkan0","CONT","type=bf16,ne=[2,1,3,5]","support","1","yes","Vulkan"
-"Vulkan0","CONT","type=bf16,ne=[2,3,5,7]","support","0","no","Vulkan"
+"Vulkan0","CONT","type=f32,ne=[2,1,1,1],use_view_slice=1","support","1","yes","Vulkan"
+"Vulkan0","CONT","type=f32,ne=[2,1,3,5],use_view_slice=1","support","1","yes","Vulkan"
+"Vulkan0","CONT","type=f32,ne=[2,3,5,7],use_view_slice=1","support","1","yes","Vulkan"
+"Vulkan0","CONT","type=f32,ne=[1,4,4,1],use_view_slice=1","support","1","yes","Vulkan"
+"Vulkan0","CONT","type=f32,ne=[1,8,17,1],use_view_slice=1","support","1","yes","Vulkan"
+"Vulkan0","CONT","type=f32,ne=[10,10,10,1],use_view_slice=1","support","1","yes","Vulkan"
+"Vulkan0","CONT","type=f32,ne=[2,1,1,1],use_view_slice=0","support","1","yes","Vulkan"
+"Vulkan0","CONT","type=f32,ne=[2,1,3,5],use_view_slice=0","support","1","yes","Vulkan"
+"Vulkan0","CONT","type=f32,ne=[2,3,5,7],use_view_slice=0","support","1","yes","Vulkan"
+"Vulkan0","CONT","type=f32,ne=[1,4,4,1],use_view_slice=0","support","1","yes","Vulkan"
+"Vulkan0","CONT","type=f32,ne=[1,8,17,1],use_view_slice=0","support","1","yes","Vulkan"
+"Vulkan0","CONT","type=f32,ne=[10,10,10,1],use_view_slice=0","support","1","yes","Vulkan"
+"Vulkan0","CONT","type=i32,ne=[2,1,1,1],use_view_slice=1","support","1","yes","Vulkan"
+"Vulkan0","CONT","type=i32,ne=[2,1,3,5],use_view_slice=1","support","1","yes","Vulkan"
+"Vulkan0","CONT","type=i32,ne=[2,3,5,7],use_view_slice=1","support","1","yes","Vulkan"
+"Vulkan0","CONT","type=i32,ne=[1,4,4,1],use_view_slice=1","support","1","yes","Vulkan"
+"Vulkan0","CONT","type=i32,ne=[1,8,17,1],use_view_slice=1","support","1","yes","Vulkan"
+"Vulkan0","CONT","type=i32,ne=[10,10,10,1],use_view_slice=1","support","1","yes","Vulkan"
+"Vulkan0","CONT","type=i32,ne=[2,1,1,1],use_view_slice=0","support","1","yes","Vulkan"
+"Vulkan0","CONT","type=i32,ne=[2,1,3,5],use_view_slice=0","support","1","yes","Vulkan"
+"Vulkan0","CONT","type=i32,ne=[2,3,5,7],use_view_slice=0","support","1","yes","Vulkan"
+"Vulkan0","CONT","type=i32,ne=[1,4,4,1],use_view_slice=0","support","1","yes","Vulkan"
+"Vulkan0","CONT","type=i32,ne=[1,8,17,1],use_view_slice=0","support","1","yes","Vulkan"
+"Vulkan0","CONT","type=i32,ne=[10,10,10,1],use_view_slice=0","support","1","yes","Vulkan"
+"Vulkan0","CONT","type=f16,ne=[2,1,1,1],use_view_slice=0","support","1","yes","Vulkan"
+"Vulkan0","CONT","type=f16,ne=[2,1,3,5],use_view_slice=0","support","1","yes","Vulkan"
+"Vulkan0","CONT","type=f16,ne=[2,3,5,7],use_view_slice=0","support","1","yes","Vulkan"
+"Vulkan0","CONT","type=f16,ne=[1,4,4,1],use_view_slice=0","support","1","yes","Vulkan"
+"Vulkan0","CONT","type=f16,ne=[1,8,17,1],use_view_slice=0","support","1","yes","Vulkan"
+"Vulkan0","CONT","type=f16,ne=[10,10,10,1],use_view_slice=0","support","1","yes","Vulkan"
+"Vulkan0","CONT","type=bf16,ne=[2,1,1,1],use_view_slice=0","support","1","yes","Vulkan"
+"Vulkan0","CONT","type=bf16,ne=[2,1,3,5],use_view_slice=0","support","1","yes","Vulkan"
+"Vulkan0","CONT","type=bf16,ne=[2,3,5,7],use_view_slice=0","support","1","yes","Vulkan"
+"Vulkan0","CONT","type=bf16,ne=[1,4,4,1],use_view_slice=0","support","1","yes","Vulkan"
+"Vulkan0","CONT","type=bf16,ne=[1,8,17,1],use_view_slice=0","support","1","yes","Vulkan"
+"Vulkan0","CONT","type=bf16,ne=[10,10,10,1],use_view_slice=0","support","1","yes","Vulkan"
 "Vulkan0","ADD","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1","support","1","yes","Vulkan"
 "Vulkan0","SUB","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1","support","1","yes","Vulkan"
 "Vulkan0","MUL","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1","support","1","yes","Vulkan"
@@ -5655,6 +5683,7 @@
 "Vulkan0","MUL","type=f32,ne=[64,262144,1,1],nr=[1,1,1,1],nf=1","support","1","yes","Vulkan"
 "Vulkan0","DIV","type=f32,ne=[64,262144,1,1],nr=[1,1,1,1],nf=1","support","1","yes","Vulkan"
 "Vulkan0","ADD1","type=f32,ne=[10,5,4,3]","support","1","yes","Vulkan"
+"Vulkan0","ADD1","type=f32,ne=[1024,1024,1,1]","support","1","yes","Vulkan"
 "Vulkan0","SCALE","type=f32,ne=[10,10,10,10],scale=2.000000,bias=0.000000,inplace=0","support","1","yes","Vulkan"
 "Vulkan0","SCALE","type=f32,ne=[10,10,10,10],scale=2.000000,bias=1.000000,inplace=0","support","1","yes","Vulkan"
 "Vulkan0","SCALE","type=f32,ne=[10,10,10,10],scale=2.000000,bias=1.000000,inplace=1","support","1","yes","Vulkan"
@@ -8644,9 +8673,13 @@
 "Vulkan0","CLAMP","type=f16,ne=[7,1,5,3],min=-0.500000,max=0.500000","support","0","no","Vulkan"
 "Vulkan0","LEAKY_RELU","type=f16,ne_a=[7,1,5,3],negative_slope=0.100000","support","0","no","Vulkan"
 "Vulkan0","FLOOR","type=f16,ne=[7,1,5,3]","support","1","yes","Vulkan"
+"Vulkan0","FLOOR","type=f16,ne=[1024,1024,1,1]","support","1","yes","Vulkan"
 "Vulkan0","CEIL","type=f16,ne=[7,1,5,3]","support","1","yes","Vulkan"
+"Vulkan0","CEIL","type=f16,ne=[1024,1024,1,1]","support","1","yes","Vulkan"
 "Vulkan0","ROUND","type=f16,ne=[7,1,5,3]","support","1","yes","Vulkan"
+"Vulkan0","ROUND","type=f16,ne=[1024,1024,1,1]","support","1","yes","Vulkan"
 "Vulkan0","TRUNC","type=f16,ne=[7,1,5,3]","support","1","yes","Vulkan"
+"Vulkan0","TRUNC","type=f16,ne=[1024,1024,1,1]","support","1","yes","Vulkan"
 "Vulkan0","SQR","type=f32,ne=[10,5,4,3]","support","1","yes","Vulkan"
 "Vulkan0","SQRT","type=f32,ne=[10,3,3,2]","support","1","yes","Vulkan"
 "Vulkan0","LOG","type=f32,ne=[10,5,4,3]","support","1","yes","Vulkan"
@@ -8666,9 +8699,13 @@
 "Vulkan0","CLAMP","type=f32,ne=[7,1,5,3],min=-0.500000,max=0.500000","support","1","yes","Vulkan"
 "Vulkan0","LEAKY_RELU","type=f32,ne_a=[7,1,5,3],negative_slope=0.100000","support","1","yes","Vulkan"
 "Vulkan0","FLOOR","type=f32,ne=[7,1,5,3]","support","1","yes","Vulkan"
+"Vulkan0","FLOOR","type=f32,ne=[1024,1024,1,1]","support","1","yes","Vulkan"
 "Vulkan0","CEIL","type=f32,ne=[7,1,5,3]","support","1","yes","Vulkan"
+"Vulkan0","CEIL","type=f32,ne=[1024,1024,1,1]","support","1","yes","Vulkan"
 "Vulkan0","ROUND","type=f32,ne=[7,1,5,3]","support","1","yes","Vulkan"
+"Vulkan0","ROUND","type=f32,ne=[1024,1024,1,1]","support","1","yes","Vulkan"
 "Vulkan0","TRUNC","type=f32,ne=[7,1,5,3]","support","1","yes","Vulkan"
+"Vulkan0","TRUNC","type=f32,ne=[1024,1024,1,1]","support","1","yes","Vulkan"
 "Vulkan0","DIAG_MASK_INF","type=f32,ne=[10,10,1,1],n_past=5","support","1","yes","Vulkan"
 "Vulkan0","DIAG_MASK_INF","type=f32,ne=[10,10,3,1],n_past=5","support","1","yes","Vulkan"
 "Vulkan0","DIAG_MASK_INF","type=f32,ne=[10,10,3,2],n_past=5","support","1","yes","Vulkan"
@@ -9411,28 +9448,405 @@
 "Vulkan0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","1","yes","Vulkan"
 "Vulkan0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","Vulkan"
 "Vulkan0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[3,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[4,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[7,1,1,1],order=0","support","1","yes","Vulkan"
 "Vulkan0","ARGSORT","type=f32,ne=[8,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[15,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[16,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[31,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[32,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[63,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[64,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[127,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[128,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[255,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[256,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[511,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[512,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[1023,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[1024,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[2047,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[2048,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[4095,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[4096,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[8191,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[8192,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[16383,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[16384,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[32767,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[32768,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[65535,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[65536,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[131071,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[131072,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[262143,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[262144,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[524287,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[524288,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[1048575,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[1048576,1,1,1],order=0","support","1","yes","Vulkan"
 "Vulkan0","ARGSORT","type=f32,ne=[16,10,10,10],order=0","support","1","yes","Vulkan"
 "Vulkan0","ARGSORT","type=f32,ne=[60,10,10,10],order=0","support","1","yes","Vulkan"
 "Vulkan0","ARGSORT","type=f32,ne=[1023,2,1,3],order=0","support","1","yes","Vulkan"
 "Vulkan0","ARGSORT","type=f32,ne=[1024,2,1,3],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[1025,2,1,3],order=0","support","0","no","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[16384,1,1,1],order=0","support","0","no","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[2047,2,1,3],order=0","support","0","no","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[2048,2,1,3],order=0","support","0","no","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[2049,2,1,3],order=0","support","0","no","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[1025,2,1,3],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[2047,2,1,3],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[2048,2,1,3],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[2049,2,1,3],order=0","support","1","yes","Vulkan"
 "Vulkan0","ARGSORT","type=f32,ne=[2,8,8192,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[8,1,1,1],order=1","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[3,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[4,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[7,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[8,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[15,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[16,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[31,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[32,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[63,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[64,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[127,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[128,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[255,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[256,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[511,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[512,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[1023,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[1024,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[2047,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[2048,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[4095,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[4096,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[8191,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[8192,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[16383,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[16384,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[32767,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[32768,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[65535,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[65536,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[131071,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[131072,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[262143,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[262144,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[524287,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[524288,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[1048575,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[1048576,1,1,1],order=0","support","1","yes","Vulkan"
 "Vulkan0","ARGSORT","type=f32,ne=[16,10,10,10],order=1","support","1","yes","Vulkan"
 "Vulkan0","ARGSORT","type=f32,ne=[60,10,10,10],order=1","support","1","yes","Vulkan"
 "Vulkan0","ARGSORT","type=f32,ne=[1023,2,1,3],order=1","support","1","yes","Vulkan"
 "Vulkan0","ARGSORT","type=f32,ne=[1024,2,1,3],order=1","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[1025,2,1,3],order=1","support","0","no","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[16384,1,1,1],order=1","support","0","no","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[2047,2,1,3],order=1","support","0","no","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[2048,2,1,3],order=1","support","0","no","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[2049,2,1,3],order=1","support","0","no","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[1025,2,1,3],order=1","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[2047,2,1,3],order=1","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[2048,2,1,3],order=1","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[2049,2,1,3],order=1","support","1","yes","Vulkan"
 "Vulkan0","ARGSORT","type=f32,ne=[2,8,8192,1],order=1","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[1,1,1,1],k=1","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[12,1,2,1],k=1","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[2,1,1,1],k=1","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[13,1,2,1],k=1","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[2,1,1,1],k=2","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[13,1,2,1],k=2","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[4,1,1,1],k=1","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[15,1,2,1],k=1","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[4,1,1,1],k=2","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[15,1,2,1],k=2","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[4,1,1,1],k=3","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[15,1,2,1],k=3","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[8,1,1,1],k=1","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[19,1,2,1],k=1","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[8,1,1,1],k=2","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[19,1,2,1],k=2","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[8,1,1,1],k=3","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[19,1,2,1],k=3","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[8,1,1,1],k=7","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[19,1,2,1],k=7","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[16,1,1,1],k=1","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[27,1,2,1],k=1","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[16,1,1,1],k=2","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[27,1,2,1],k=2","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[16,1,1,1],k=3","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[27,1,2,1],k=3","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[16,1,1,1],k=7","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[27,1,2,1],k=7","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[16,1,1,1],k=15","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[27,1,2,1],k=15","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[32,1,1,1],k=1","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[43,1,2,1],k=1","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[32,1,1,1],k=2","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[43,1,2,1],k=2","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[32,1,1,1],k=3","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[43,1,2,1],k=3","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[32,1,1,1],k=7","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[43,1,2,1],k=7","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[32,1,1,1],k=15","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[43,1,2,1],k=15","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[64,1,1,1],k=1","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[75,1,2,1],k=1","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[64,1,1,1],k=2","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[75,1,2,1],k=2","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[64,1,1,1],k=3","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[75,1,2,1],k=3","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[64,1,1,1],k=7","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[75,1,2,1],k=7","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[64,1,1,1],k=15","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[75,1,2,1],k=15","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[128,1,1,1],k=1","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[139,1,2,1],k=1","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[128,1,1,1],k=2","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[139,1,2,1],k=2","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[128,1,1,1],k=3","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[139,1,2,1],k=3","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[128,1,1,1],k=7","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[139,1,2,1],k=7","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[128,1,1,1],k=15","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[139,1,2,1],k=15","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[128,1,1,1],k=100","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[139,1,2,1],k=100","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[256,1,1,1],k=1","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[267,1,2,1],k=1","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[256,1,1,1],k=2","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[267,1,2,1],k=2","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[256,1,1,1],k=3","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[267,1,2,1],k=3","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[256,1,1,1],k=7","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[267,1,2,1],k=7","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[256,1,1,1],k=15","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[267,1,2,1],k=15","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[256,1,1,1],k=100","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[267,1,2,1],k=100","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[512,1,1,1],k=1","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[523,1,2,1],k=1","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[512,1,1,1],k=2","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[523,1,2,1],k=2","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[512,1,1,1],k=3","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[523,1,2,1],k=3","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[512,1,1,1],k=7","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[523,1,2,1],k=7","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[512,1,1,1],k=15","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[523,1,2,1],k=15","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[512,1,1,1],k=100","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[523,1,2,1],k=100","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[512,1,1,1],k=500","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[523,1,2,1],k=500","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[1024,1,1,1],k=1","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[1035,1,2,1],k=1","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[1024,1,1,1],k=2","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[1035,1,2,1],k=2","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[1024,1,1,1],k=3","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[1035,1,2,1],k=3","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[1024,1,1,1],k=7","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[1035,1,2,1],k=7","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[1024,1,1,1],k=15","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[1035,1,2,1],k=15","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[1024,1,1,1],k=100","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[1035,1,2,1],k=100","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[1024,1,1,1],k=500","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[1035,1,2,1],k=500","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[1024,1,1,1],k=1023","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[1035,1,2,1],k=1023","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[2048,1,1,1],k=1","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[2059,1,2,1],k=1","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[2048,1,1,1],k=2","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[2059,1,2,1],k=2","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[2048,1,1,1],k=3","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[2059,1,2,1],k=3","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[2048,1,1,1],k=7","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[2059,1,2,1],k=7","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[2048,1,1,1],k=15","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[2059,1,2,1],k=15","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[2048,1,1,1],k=100","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[2059,1,2,1],k=100","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[2048,1,1,1],k=500","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[2059,1,2,1],k=500","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[2048,1,1,1],k=1023","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[2059,1,2,1],k=1023","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[4096,1,1,1],k=1","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[4107,1,2,1],k=1","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[4096,1,1,1],k=2","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[4107,1,2,1],k=2","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[4096,1,1,1],k=3","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[4107,1,2,1],k=3","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[4096,1,1,1],k=7","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[4107,1,2,1],k=7","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[4096,1,1,1],k=15","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[4107,1,2,1],k=15","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[4096,1,1,1],k=100","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[4107,1,2,1],k=100","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[4096,1,1,1],k=500","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[4107,1,2,1],k=500","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[4096,1,1,1],k=1023","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[4107,1,2,1],k=1023","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[8192,1,1,1],k=1","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[8203,1,2,1],k=1","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[8192,1,1,1],k=2","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[8203,1,2,1],k=2","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[8192,1,1,1],k=3","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[8203,1,2,1],k=3","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[8192,1,1,1],k=7","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[8203,1,2,1],k=7","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[8192,1,1,1],k=15","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[8203,1,2,1],k=15","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[8192,1,1,1],k=100","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[8203,1,2,1],k=100","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[8192,1,1,1],k=500","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[8203,1,2,1],k=500","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[8192,1,1,1],k=1023","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[8203,1,2,1],k=1023","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=1","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[16395,1,2,1],k=1","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=2","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[16395,1,2,1],k=2","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=3","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[16395,1,2,1],k=3","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=7","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[16395,1,2,1],k=7","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=15","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[16395,1,2,1],k=15","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=100","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[16395,1,2,1],k=100","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=500","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[16395,1,2,1],k=500","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=1023","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[16395,1,2,1],k=1023","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=9999","support","0","no","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[16395,1,2,1],k=9999","support","0","no","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[32768,1,1,1],k=1","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[32779,1,2,1],k=1","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[32768,1,1,1],k=2","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[32779,1,2,1],k=2","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[32768,1,1,1],k=3","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[32779,1,2,1],k=3","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[32768,1,1,1],k=7","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[32779,1,2,1],k=7","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[32768,1,1,1],k=15","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[32779,1,2,1],k=15","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[32768,1,1,1],k=100","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[32779,1,2,1],k=100","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[32768,1,1,1],k=500","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[32779,1,2,1],k=500","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[32768,1,1,1],k=1023","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[32779,1,2,1],k=1023","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[32768,1,1,1],k=9999","support","0","no","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[32779,1,2,1],k=9999","support","0","no","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[65536,1,1,1],k=1","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[65547,1,2,1],k=1","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[65536,1,1,1],k=2","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[65547,1,2,1],k=2","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[65536,1,1,1],k=3","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[65547,1,2,1],k=3","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[65536,1,1,1],k=7","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[65547,1,2,1],k=7","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[65536,1,1,1],k=15","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[65547,1,2,1],k=15","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[65536,1,1,1],k=100","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[65547,1,2,1],k=100","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[65536,1,1,1],k=500","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[65547,1,2,1],k=500","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[65536,1,1,1],k=1023","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[65547,1,2,1],k=1023","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[65536,1,1,1],k=9999","support","0","no","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[65547,1,2,1],k=9999","support","0","no","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[131072,1,1,1],k=1","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[131083,1,2,1],k=1","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[131072,1,1,1],k=2","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[131083,1,2,1],k=2","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[131072,1,1,1],k=3","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[131083,1,2,1],k=3","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[131072,1,1,1],k=7","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[131083,1,2,1],k=7","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[131072,1,1,1],k=15","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[131083,1,2,1],k=15","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[131072,1,1,1],k=100","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[131083,1,2,1],k=100","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[131072,1,1,1],k=500","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[131083,1,2,1],k=500","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[131072,1,1,1],k=1023","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[131083,1,2,1],k=1023","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[131072,1,1,1],k=9999","support","0","no","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[131083,1,2,1],k=9999","support","0","no","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[262144,1,1,1],k=1","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[262155,1,2,1],k=1","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[262144,1,1,1],k=2","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[262155,1,2,1],k=2","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[262144,1,1,1],k=3","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[262155,1,2,1],k=3","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[262144,1,1,1],k=7","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[262155,1,2,1],k=7","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[262144,1,1,1],k=15","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[262155,1,2,1],k=15","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[262144,1,1,1],k=100","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[262155,1,2,1],k=100","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[262144,1,1,1],k=500","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[262155,1,2,1],k=500","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[262144,1,1,1],k=1023","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[262155,1,2,1],k=1023","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[262144,1,1,1],k=9999","support","0","no","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[262155,1,2,1],k=9999","support","0","no","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[524288,1,1,1],k=1","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[524299,1,2,1],k=1","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[524288,1,1,1],k=2","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[524299,1,2,1],k=2","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[524288,1,1,1],k=3","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[524299,1,2,1],k=3","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[524288,1,1,1],k=7","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[524299,1,2,1],k=7","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[524288,1,1,1],k=15","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[524299,1,2,1],k=15","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[524288,1,1,1],k=100","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[524299,1,2,1],k=100","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[524288,1,1,1],k=500","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[524299,1,2,1],k=500","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[524288,1,1,1],k=1023","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[524299,1,2,1],k=1023","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[524288,1,1,1],k=9999","support","0","no","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[524299,1,2,1],k=9999","support","0","no","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[16,10,10,10],k=1","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[60,10,10,10],k=1","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[1023,2,1,3],k=1","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[1024,2,1,3],k=1","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[1025,2,1,3],k=1","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=1","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[2047,2,1,3],k=1","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[2048,2,1,3],k=1","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[2049,2,1,3],k=1","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[16,10,10,10],k=2","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[60,10,10,10],k=2","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[1023,2,1,3],k=2","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[1024,2,1,3],k=2","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[1025,2,1,3],k=2","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=2","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[2047,2,1,3],k=2","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[2048,2,1,3],k=2","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[2049,2,1,3],k=2","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[16,10,10,10],k=3","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[60,10,10,10],k=3","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[1023,2,1,3],k=3","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[1024,2,1,3],k=3","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[1025,2,1,3],k=3","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=3","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[2047,2,1,3],k=3","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[2048,2,1,3],k=3","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[2049,2,1,3],k=3","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[16,10,10,10],k=7","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[60,10,10,10],k=7","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[1023,2,1,3],k=7","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[1024,2,1,3],k=7","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[1025,2,1,3],k=7","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=7","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[2047,2,1,3],k=7","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[2048,2,1,3],k=7","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[2049,2,1,3],k=7","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[16,10,10,10],k=15","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[60,10,10,10],k=15","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[1023,2,1,3],k=15","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[1024,2,1,3],k=15","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[1025,2,1,3],k=15","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=15","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[2047,2,1,3],k=15","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[2048,2,1,3],k=15","support","1","yes","Vulkan"
+"Vulkan0","TOP_K","type=f32,ne=[2049,2,1,3],k=15","support","1","yes","Vulkan"
 "Vulkan0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=0","support","1","yes","Vulkan"
 "Vulkan0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=1","support","1","yes","Vulkan"
 "Vulkan0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=nearest,flags=none","support","1","yes","Vulkan"
@@ -9445,6 +9859,10 @@
 "Vulkan0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bicubic,transpose=1","support","1","yes","Vulkan"
 "Vulkan0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bicubic,flags=none","support","1","yes","Vulkan"
 "Vulkan0","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bicubic,flags=none","support","1","yes","Vulkan"
+"Vulkan0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=513,transpose=0","support","0","no","Vulkan"
+"Vulkan0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=513,transpose=1","support","0","no","Vulkan"
+"Vulkan0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear,flags=none","support","0","no","Vulkan"
+"Vulkan0","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bilinear,flags=none","support","0","no","Vulkan"
 "Vulkan0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear,flags=align_corners","support","1","yes","Vulkan"
 "Vulkan0","UPSCALE","type=f32,ne=[1,4,3,2],ne_tgt=[2,8,3,2],mode=bilinear,flags=align_corners","support","1","yes","Vulkan"
 "Vulkan0","UPSCALE","type=f32,ne=[4,1,3,2],ne_tgt=[1,1,3,2],mode=bilinear,flags=align_corners","support","1","yes","Vulkan"
@@ -9479,23 +9897,37 @@
 "Vulkan0","PAD_REFLECT_1D","type=f32,ne_a=[3000,384,4,1],pad_0=10,pad_1=9","support","0","no","Vulkan"
 "Vulkan0","ROLL","shift0=3,shift1=-2,shift3=1,shift4=-1","support","1","yes","Vulkan"
 "Vulkan0","ARANGE","type=f32,start=0.000000,stop=10.000000,step=1.000000","support","1","yes","Vulkan"
+"Vulkan0","ARANGE","type=f32,start=0.000000,stop=1048576.000000,step=1.000000","support","1","yes","Vulkan"
 "Vulkan0","TIMESTEP_EMBEDDING","type=f32,ne_a=[2,1,1,1],dim=320,max_period=10000","support","1","yes","Vulkan"
 "Vulkan0","LEAKY_RELU","type=f32,ne_a=[10,5,4,3],negative_slope=0.100000","support","1","yes","Vulkan"
-"Vulkan0","CUMSUM","type=f32,ne=[10,5,4,3]","support","0","no","Vulkan"
+"Vulkan0","CUMSUM","type=f32,ne=[10,5,4,3]","support","1","yes","Vulkan"
+"Vulkan0","CUMSUM","type=f32,ne=[127,5,4,3]","support","1","yes","Vulkan"
+"Vulkan0","CUMSUM","type=f32,ne=[128,5,4,3]","support","1","yes","Vulkan"
+"Vulkan0","CUMSUM","type=f32,ne=[255,5,4,3]","support","1","yes","Vulkan"
+"Vulkan0","CUMSUM","type=f32,ne=[256,5,4,3]","support","1","yes","Vulkan"
+"Vulkan0","CUMSUM","type=f32,ne=[511,5,4,3]","support","1","yes","Vulkan"
+"Vulkan0","CUMSUM","type=f32,ne=[512,5,4,3]","support","1","yes","Vulkan"
+"Vulkan0","CUMSUM","type=f32,ne=[1023,5,4,3]","support","1","yes","Vulkan"
+"Vulkan0","CUMSUM","type=f32,ne=[1024,5,4,3]","support","1","yes","Vulkan"
+"Vulkan0","CUMSUM","type=f32,ne=[2047,5,4,3]","support","1","yes","Vulkan"
+"Vulkan0","CUMSUM","type=f32,ne=[2048,5,4,3]","support","1","yes","Vulkan"
+"Vulkan0","CUMSUM","type=f32,ne=[242004,1,1,1]","support","1","yes","Vulkan"
+"Vulkan0","CUMSUM","type=f32,ne=[375960,1,1,1]","support","1","yes","Vulkan"
 "Vulkan0","XIELU","type=f32,ne=[10,5,4,3]","support","0","no","Vulkan"
-"Vulkan0","TRI","type=f32,ne=[10,10,4,3],tri_type=3","support","0","no","Vulkan"
-"Vulkan0","TRI","type=f32,ne=[10,10,4,3],tri_type=2","support","0","no","Vulkan"
-"Vulkan0","TRI","type=f32,ne=[10,10,4,3],tri_type=1","support","0","no","Vulkan"
-"Vulkan0","TRI","type=f32,ne=[10,10,4,3],tri_type=0","support","0","no","Vulkan"
+"Vulkan0","TRI","type=f32,ne=[10,10,4,3],tri_type=3","support","1","yes","Vulkan"
+"Vulkan0","TRI","type=f32,ne=[10,10,4,3],tri_type=2","support","1","yes","Vulkan"
+"Vulkan0","TRI","type=f32,ne=[10,10,4,3],tri_type=1","support","1","yes","Vulkan"
+"Vulkan0","TRI","type=f32,ne=[10,10,4,3],tri_type=0","support","1","yes","Vulkan"
 "Vulkan0","FILL","type=f32,ne=[10,10,4,3],c=0.000000","support","1","yes","Vulkan"
 "Vulkan0","FILL","type=f32,ne=[303,207,11,3],c=2.000000","support","1","yes","Vulkan"
 "Vulkan0","FILL","type=f32,ne=[800,600,4,4],c=-152.000000","support","1","yes","Vulkan"
-"Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[10,10,4,3],ne_rhs=[3,10,4,3]","support","0","no","Vulkan"
-"Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[11,11,1,1],ne_rhs=[5,11,1,1]","support","0","no","Vulkan"
-"Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[17,17,2,4],ne_rhs=[9,17,2,4]","support","0","no","Vulkan"
-"Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[30,30,7,1],ne_rhs=[8,30,7,1]","support","0","no","Vulkan"
-"Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[42,42,5,2],ne_rhs=[10,42,5,2]","support","0","no","Vulkan"
-"Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[64,64,2,2],ne_rhs=[10,64,2,2]","support","0","no","Vulkan"
+"Vulkan0","FILL","type=f32,ne=[2048,512,2,2],c=3.500000","support","1","yes","Vulkan"
+"Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[10,10,4,3],ne_rhs=[3,10,4,3]","support","1","yes","Vulkan"
+"Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[11,11,1,1],ne_rhs=[5,11,1,1]","support","1","yes","Vulkan"
+"Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[17,17,2,4],ne_rhs=[9,17,2,4]","support","1","yes","Vulkan"
+"Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[30,30,7,1],ne_rhs=[8,30,7,1]","support","1","yes","Vulkan"
+"Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[42,42,5,2],ne_rhs=[10,42,5,2]","support","1","yes","Vulkan"
+"Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[64,64,2,2],ne_rhs=[10,64,2,2]","support","1","yes","Vulkan"
 "Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[100,100,4,4],ne_rhs=[41,100,4,4]","support","0","no","Vulkan"
 "Vulkan0","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0","support","1","yes","Vulkan"
 "Vulkan0","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0","support","1","yes","Vulkan"
--- a/docs/ops/WebGPU.csv
+++ b/docs/ops/WebGPU.csv
--- a/docs/ops/ZenDNN.csv
+++ b/docs/ops/ZenDNN.csv
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -20,6 +20,7 @@ else()

    add_subdirectory(gguf-hash)
    add_subdirectory(gguf)
+    add_subdirectory(idle)
    add_subdirectory(lookahead)
    add_subdirectory(lookup)
    add_subdirectory(parallel)
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -2,6 +2,7 @@
 #include "common.h"
 #include "log.h"
 #include "llama.h"
+#include "sampling.h"

 #include <algorithm>
 #include <cstdio>
@@ -64,17 +65,23 @@ int main(int argc, char ** argv) {
    ctx_params.n_ctx   = n_kv_req;
    ctx_params.n_batch = std::max(n_predict, n_parallel);

-    llama_context * ctx = llama_init_from_model(model, ctx_params);
-
    auto sparams = llama_sampler_chain_default_params();
    sparams.no_perf = false;

-    llama_sampler * smpl = llama_sampler_chain_init(sparams);
+    std::vector<llama_sampler *> samplers;

-    llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sampling.top_k));
-    llama_sampler_chain_add(smpl, llama_sampler_init_top_p(params.sampling.top_p, params.sampling.min_keep));
-    llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sampling.temp));
-    llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sampling.seed));
+    for (int32_t i = 0; i < n_parallel; ++i) {
+        llama_sampler * smpl = llama_sampler_chain_init(sparams);
+
+        llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sampling.top_k));
+        llama_sampler_chain_add(smpl, llama_sampler_init_top_p(params.sampling.top_p, params.sampling.min_keep));
+        llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sampling.temp));
+        llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sampling.seed));
+
+        samplers.push_back(smpl);
+    }
+
+    llama_context * ctx = llama_init_from_model(model, ctx_params);

    if (ctx == NULL) {
        LOG_ERR("%s: error: failed to create the llama_context\n" , __func__);
@@ -173,7 +180,7 @@ int main(int argc, char ** argv) {
                continue;
            }

-            const llama_token new_token_id = llama_sampler_sample(smpl, ctx, i_batch[i]);
+            const llama_token new_token_id = llama_sampler_sample(samplers[i], ctx, i_batch[i]);

            // is it an end of generation? -> mark the stream as finished
            if (llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_predict) {
@@ -229,14 +236,17 @@ int main(int argc, char ** argv) {
            __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));

    LOG("\n");
-    llama_perf_sampler_print(smpl);
+    llama_perf_sampler_print(samplers[0]);
    llama_perf_context_print(ctx);

    fprintf(stderr, "\n");

    llama_batch_free(batch);

-    llama_sampler_free(smpl);
+    for (auto & sampler_config : samplers) {
+        llama_sampler_free(sampler_config);
+    }
+
    llama_free(ctx);
    llama_model_free(model);

--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -104,12 +104,16 @@ int main(int argc, char ** argv) {

    params.embedding = true;

+    // get max number of sequences per batch
+    const int n_seq_max = llama_max_parallel_sequences();
+
    // if the number of prompts that would be encoded is known in advance, it's more efficient to specify the
    //   --parallel argument accordingly. for convenience, if not specified, we fallback to unified KV cache
    //   in order to support any number of prompts
    if (params.n_parallel == 1) {
        LOG_INF("%s: n_parallel == 1 -> unified KV cache is enabled\n", __func__);
        params.kv_unified = true;
+        params.n_parallel = n_seq_max;
    }

    // utilize the full context
@@ -123,17 +127,14 @@ int main(int argc, char ** argv) {
        params.n_ubatch = params.n_batch;
    }

-    // get max number of sequences per batch
-    const int n_seq_max = llama_max_parallel_sequences();
-
    llama_backend_init();
    llama_numa_init(params.numa);

    // load the model
-    common_init_result llama_init = common_init_from_params(params);
+    auto llama_init = common_init_from_params(params);

-    llama_model * model = llama_init.model.get();
-    llama_context * ctx = llama_init.context.get();
+    auto * model = llama_init->model();
+    auto * ctx = llama_init->context();

    if (model == NULL) {
        LOG_ERR("%s: unable to load model\n", __func__);
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@@ -202,10 +202,10 @@ int main(int argc, char ** argv) {
    params.warmup = false;

    // init
-    common_init_result llama_init = common_init_from_params(params);
+    auto llama_init = common_init_from_params(params);

-    llama_model * model = llama_init.model.get();
-    llama_context * ctx = llama_init.context.get();
+    auto * model = llama_init->model();
+    auto * ctx   = llama_init->context();

    if (model == nullptr || ctx == nullptr) {
        LOG_ERR("%s : failed to init\n", __func__);
--- a/examples/gen-docs/gen-docs.cpp
+++ b/examples/gen-docs/gen-docs.cpp
@@ -14,12 +14,13 @@ static void write_table_header(std::ofstream & file) {
 static void write_table_entry(std::ofstream & file, const common_arg & opt) {
    file << "| `";
    // args
-    for (const auto & arg : opt.args) {
-    if (arg == opt.args.front()) {
+    auto all_args = opt.get_args();
+    for (const auto & arg : all_args) {
+    if (arg == all_args.front()) {
            file << arg;
-            if (opt.args.size() > 1) file << ", ";
+            if (all_args.size() > 1) file << ", ";
        } else {
-            file << arg << (arg != opt.args.back() ? ", " : "");
+            file << arg << (arg != all_args.back() ? ", " : "");
        }
    }
    // value hint
@@ -47,7 +48,7 @@ static void write_table(std::ofstream & file, std::vector<common_arg *> & opts)
    }
 }

-static void export_md(std::string fname, llama_example ex) {
+static void export_md(std::string fname, llama_example ex, std::string name) {
    std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc);

    common_params params;
@@ -71,13 +72,14 @@ static void export_md(std::string fname, llama_example ex) {
    write_table(file, common_options);
    file << "\n\n**Sampling params**\n\n";
    write_table(file, sparam_options);
-    file << "\n\n**Example-specific params**\n\n";
+    file << "\n\n**" << name << "-specific params**\n\n";
    write_table(file, specific_options);
 }

 int main(int, char **) {
-    export_md("autogen-main.md", LLAMA_EXAMPLE_MAIN);
-    export_md("autogen-server.md", LLAMA_EXAMPLE_SERVER);
+    // TODO: add CLI
+    export_md("autogen-completion.md", LLAMA_EXAMPLE_COMPLETION, "Tool");
+    export_md("autogen-server.md", LLAMA_EXAMPLE_SERVER, "Server");

    return 0;
 }
--- a/examples/idle/CMakeLists.txt
+++ b/examples/idle/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(TARGET llama-idle)
+add_executable(${TARGET} idle.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/idle/README.md
+++ b/examples/idle/README.md
@@ -0,0 +1,3 @@
+# llama.cpp/example/idle
+
+https://github.com/ggml-org/llama.cpp/pull/17766
--- a/examples/idle/idle.cpp
+++ b/examples/idle/idle.cpp
@@ -0,0 +1,110 @@
+#include "arg.h"
+#include "common.h"
+#include "log.h"
+#include "llama.h"
+
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <string>
+#include <thread>
+#include <vector>
+
+static void print_usage(int /*argc*/, char ** argv) {
+    // one printf call; the emitted text is the same as printing the three pieces separately
+    printf("\nexample usage:\n" "\n    %s -m model.gguf [-ngl n_gpu_layers]\n" "\n",
+           argv[0]);
+}
+
+// Measures how single-token decode latency changes after the process has been idle for 0..4000 ms.
+int main(int argc, char ** argv) {
+    common_params params;
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
+        return 1;
+    }
+
+    common_init();
+
+    // init LLM
+    llama_backend_init();
+    llama_numa_init(params.numa);
+
+    // initialize the model
+    llama_model_params model_params = common_model_params_to_llama(params);
+    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
+
+    if (model == NULL) {
+        LOG_ERR("%s: error: unable to load model\n" , __func__);
+        return 1;
+    }
+
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
+    // we need just a dummy token to evaluate
+    std::vector<llama_token> prompt_tokens(1, llama_vocab_bos(vocab));
+
+    llama_context_params ctx_params = llama_context_default_params();
+    ctx_params.n_ctx   = 512;
+    ctx_params.n_batch = 512;
+    ctx_params.no_perf = false;
+
+    llama_context * ctx = llama_init_from_model(model, ctx_params);
+    if (ctx == NULL) {
+        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+        llama_model_free(model); // the model was already loaded - release it before bailing out
+        return 1;
+    }
+
+    llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
+
+    const int n_iters = 3;
+
+    // warm-up
+    llama_decode(ctx, batch);
+    llama_memory_clear(llama_get_memory(ctx), true);
+    llama_synchronize(ctx);
+
+    for (int64_t t_pause_ms = 0; t_pause_ms <= 4000; t_pause_ms += 800) {
+        double t_sum_us  = 0.0;
+        double t_sum2_us = 0.0;
+
+        for (int i = 0; i < n_iters; i++) {
+            // this pause is important - it simulates "idle GPU"
+            std::this_thread::sleep_for(std::chrono::milliseconds(t_pause_ms));
+
+            const int64_t t_start_us = llama_time_us();
+
+            // this should take constant time
+            llama_decode(ctx, batch);
+            llama_synchronize(ctx);
+
+            const int64_t t_end_us = llama_time_us();
+            const double  t_cur_us = t_end_us - t_start_us;
+
+#if 1
+            // print individual decode times
+            printf("  - decode time: %8.2f ms\n", t_cur_us / 1000);
+#endif
+
+            t_sum_us  += t_cur_us;
+            t_sum2_us += t_cur_us * t_cur_us;
+
+            llama_memory_clear(llama_get_memory(ctx), true);
+            llama_synchronize(ctx); // just in case
+        }
+
+        const double t_avg_us = t_sum_us / n_iters;
+        // sample std deviation (n - 1 denominator); clamp at 0 since rounding can push the variance slightly negative -> NaN
+        const double t_dev_us = std::sqrt(std::fmax(0.0, (t_sum2_us / (n_iters - 1)) - (t_avg_us * t_avg_us * n_iters) / (n_iters - 1)));
+
+        printf("iters: %4d, pause: %5d ms, avg decode time: %8.2f +/- %4.2f ms\n", n_iters, (int) t_pause_ms, t_avg_us / 1000, t_dev_us / 1000);
+        fflush(stdout);
+    }
+
+    llama_free(ctx);
+    llama_model_free(model);
+    llama_backend_free();
+
+    return 0;
+}
--- a/examples/json_schema_to_grammar.py
+++ b/examples/json_schema_to_grammar.py
@@ -231,9 +231,9 @@ DOT = '[^\\x0A\\x0D]'
 RESERVED_NAMES = set(["root", "dot", *PRIMITIVE_RULES.keys(), *STRING_FORMAT_RULES.keys()])

 INVALID_RULE_CHARS_RE = re.compile(r'[^a-zA-Z0-9-]+')
-GRAMMAR_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"]')
+GRAMMAR_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"\\]')
 GRAMMAR_RANGE_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"\]\-\\]')
-GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]'}
+GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]', '\\': '\\\\'}

 NON_LITERAL_SET = set('|.()[]{}*+?')
 ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('^$.[]()|{}*+?')
--- a/examples/llama.android/app/build.gradle.kts
+++ b/examples/llama.android/app/build.gradle.kts
@@ -1,16 +1,18 @@
 plugins {
-    id("com.android.application")
-    id("org.jetbrains.kotlin.android")
+    alias(libs.plugins.android.application)
+    alias(libs.plugins.jetbrains.kotlin.android)
 }

 android {
    namespace = "com.example.llama"
-    compileSdk = 34
+    compileSdk = 36

    defaultConfig {
-        applicationId = "com.example.llama"
+        applicationId = "com.example.llama.aichat"
+
        minSdk = 33
-        targetSdk = 34
+        targetSdk = 36
+
        versionCode = 1
        versionName = "1.0"

@@ -21,8 +23,17 @@ android {
    }

    buildTypes {
+        debug {
+            isMinifyEnabled = true
+            isShrinkResources = true
+            proguardFiles(
+                getDefaultProguardFile("proguard-android.txt"),
+                "proguard-rules.pro"
+            )
+        }
        release {
-            isMinifyEnabled = false
+            isMinifyEnabled = true
+            isShrinkResources = true
            proguardFiles(
                getDefaultProguardFile("proguard-android-optimize.txt"),
                "proguard-rules.pro"
@@ -36,30 +47,15 @@ android {
    kotlinOptions {
        jvmTarget = "1.8"
    }
-    buildFeatures {
-        compose = true
-    }
-    composeOptions {
-        kotlinCompilerExtensionVersion = "1.5.1"
-    }
 }

 dependencies {
+    implementation(libs.bundles.androidx)
+    implementation(libs.material)

-    implementation("androidx.core:core-ktx:1.12.0")
-    implementation("androidx.lifecycle:lifecycle-runtime-ktx:2.6.2")
-    implementation("androidx.activity:activity-compose:1.8.2")
-    implementation(platform("androidx.compose:compose-bom:2023.08.00"))
-    implementation("androidx.compose.ui:ui")
-    implementation("androidx.compose.ui:ui-graphics")
-    implementation("androidx.compose.ui:ui-tooling-preview")
-    implementation("androidx.compose.material3:material3")
-    implementation(project(":llama"))
-    testImplementation("junit:junit:4.13.2")
-    androidTestImplementation("androidx.test.ext:junit:1.1.5")
-    androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")
-    androidTestImplementation(platform("androidx.compose:compose-bom:2023.08.00"))
-    androidTestImplementation("androidx.compose.ui:ui-test-junit4")
-    debugImplementation("androidx.compose.ui:ui-tooling")
-    debugImplementation("androidx.compose.ui:ui-test-manifest")
+    implementation(project(":lib"))
+
+    testImplementation(libs.junit)
+    androidTestImplementation(libs.androidx.junit)
+    androidTestImplementation(libs.androidx.espresso.core)
 }
--- a/examples/llama.android/app/proguard-rules.pro
+++ b/examples/llama.android/app/proguard-rules.pro
@@ -19,3 +19,11 @@
 # If you keep the line number information, uncomment this to
 # hide the original source file name.
 #-renamesourcefileattribute SourceFile
+
+-keep class com.arm.aichat.* { *; }
+-keep class com.arm.aichat.gguf.* { *; }
+
+-assumenosideeffects class android.util.Log {
+    public static int v(...);
+    public static int d(...);
+}
--- a/examples/llama.android/app/src/main/AndroidManifest.xml
+++ b/examples/llama.android/app/src/main/AndroidManifest.xml
@@ -1,24 +1,21 @@
 <?xml version="1.0" encoding="utf-8"?>
-<manifest xmlns:android="http://schemas.android.com/apk/res/android"
-    xmlns:tools="http://schemas.android.com/tools">
-
-    <uses-permission android:name="android.permission.INTERNET" />
+<manifest xmlns:android="http://schemas.android.com/apk/res/android">

    <application
        android:allowBackup="true"
        android:dataExtractionRules="@xml/data_extraction_rules"
+        android:extractNativeLibs="true"
        android:fullBackupContent="@xml/backup_rules"
-        android:icon="@mipmap/ic_launcher"
+        android:icon="@mipmap/ic_launcher_round"
        android:label="@string/app_name"
        android:roundIcon="@mipmap/ic_launcher_round"
        android:supportsRtl="true"
-        android:theme="@style/Theme.LlamaAndroid"
+        android:theme="@style/Theme.AiChatSample"
        >

        <activity
            android:name=".MainActivity"
-            android:exported="true"
-            android:theme="@style/Theme.LlamaAndroid">
+            android:exported="true">
            <intent-filter>
                <action android:name="android.intent.action.MAIN" />

--- a/examples/llama.android/app/src/main/java/com/example/llama/Downloadable.kt
+++ b/examples/llama.android/app/src/main/java/com/example/llama/Downloadable.kt
@@ -1,119 +0,0 @@
-package com.example.llama
-
-import android.app.DownloadManager
-import android.net.Uri
-import android.util.Log
-import androidx.compose.material3.Button
-import androidx.compose.material3.Text
-import androidx.compose.runtime.Composable
-import androidx.compose.runtime.getValue
-import androidx.compose.runtime.mutableDoubleStateOf
-import androidx.compose.runtime.mutableStateOf
-import androidx.compose.runtime.remember
-import androidx.compose.runtime.rememberCoroutineScope
-import androidx.compose.runtime.setValue
-import androidx.core.database.getLongOrNull
-import androidx.core.net.toUri
-import kotlinx.coroutines.delay
-import kotlinx.coroutines.launch
-import java.io.File
-
-data class Downloadable(val name: String, val source: Uri, val destination: File) {
-    companion object {
-        @JvmStatic
-        private val tag: String? = this::class.qualifiedName
-
-        sealed interface State
-        data object Ready: State
-        data class Downloading(val id: Long): State
-        data class Downloaded(val downloadable: Downloadable): State
-        data class Error(val message: String): State
-
-        @JvmStatic
-        @Composable
-        fun Button(viewModel: MainViewModel, dm: DownloadManager, item: Downloadable) {
-            var status: State by remember {
-                mutableStateOf(
-                    if (item.destination.exists()) Downloaded(item)
-                    else Ready
-                )
-            }
-            var progress by remember { mutableDoubleStateOf(0.0) }
-
-            val coroutineScope = rememberCoroutineScope()
-
-            suspend fun waitForDownload(result: Downloading, item: Downloadable): State {
-                while (true) {
-                    val cursor = dm.query(DownloadManager.Query().setFilterById(result.id))
-
-                    if (cursor == null) {
-                        Log.e(tag, "dm.query() returned null")
-                        return Error("dm.query() returned null")
-                    }
-
-                    if (!cursor.moveToFirst() || cursor.count < 1) {
-                        cursor.close()
-                        Log.i(tag, "cursor.moveToFirst() returned false or cursor.count < 1, download canceled?")
-                        return Ready
-                    }
-
-                    val pix = cursor.getColumnIndex(DownloadManager.COLUMN_BYTES_DOWNLOADED_SO_FAR)
-                    val tix = cursor.getColumnIndex(DownloadManager.COLUMN_TOTAL_SIZE_BYTES)
-                    val sofar = cursor.getLongOrNull(pix) ?: 0
-                    val total = cursor.getLongOrNull(tix) ?: 1
-                    cursor.close()
-
-                    if (sofar == total) {
-                        return Downloaded(item)
-                    }
-
-                    progress = (sofar * 1.0) / total
-
-                    delay(1000L)
-                }
-            }
-
-            fun onClick() {
-                when (val s = status) {
-                    is Downloaded -> {
-                        viewModel.load(item.destination.path)
-                    }
-
-                    is Downloading -> {
-                        coroutineScope.launch {
-                            status = waitForDownload(s, item)
-                        }
-                    }
-
-                    else -> {
-                        item.destination.delete()
-
-                        val request = DownloadManager.Request(item.source).apply {
-                            setTitle("Downloading model")
-                            setDescription("Downloading model: ${item.name}")
-                            setAllowedNetworkTypes(DownloadManager.Request.NETWORK_WIFI)
-                            setDestinationUri(item.destination.toUri())
-                        }
-
-                        viewModel.log("Saving ${item.name} to ${item.destination.path}")
-                        Log.i(tag, "Saving ${item.name} to ${item.destination.path}")
-
-                        val id = dm.enqueue(request)
-                        status = Downloading(id)
-                        onClick()
-                    }
-                }
-            }
-
-            Button(onClick = { onClick() }, enabled = status !is Downloading) {
-                when (status) {
-                    is Downloading -> Text(text = "Downloading ${(progress * 100).toInt()}%")
-                    is Downloaded -> Text("Load ${item.name}")
-                    is Ready -> Text("Download ${item.name}")
-                    is Error -> Text("Download ${item.name}")
-                }
-            }
-        }
-
-    }
-}
--- a/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt
+++ b/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt
@@ -1,154 +1,257 @@
 package com.example.llama

-import android.app.ActivityManager
-import android.app.DownloadManager
-import android.content.ClipData
-import android.content.ClipboardManager
 import android.net.Uri
 import android.os.Bundle
-import android.os.StrictMode
-import android.os.StrictMode.VmPolicy
-import android.text.format.Formatter
-import androidx.activity.ComponentActivity
-import androidx.activity.compose.setContent
-import androidx.activity.viewModels
-import androidx.compose.foundation.layout.Box
-import androidx.compose.foundation.layout.Column
-import androidx.compose.foundation.layout.Row
-import androidx.compose.foundation.layout.fillMaxSize
-import androidx.compose.foundation.layout.padding
-import androidx.compose.foundation.lazy.LazyColumn
-import androidx.compose.foundation.lazy.items
-import androidx.compose.foundation.lazy.rememberLazyListState
-import androidx.compose.material3.Button
-import androidx.compose.material3.LocalContentColor
-import androidx.compose.material3.MaterialTheme
-import androidx.compose.material3.OutlinedTextField
-import androidx.compose.material3.Surface
-import androidx.compose.material3.Text
-import androidx.compose.runtime.Composable
-import androidx.compose.ui.Modifier
-import androidx.compose.ui.unit.dp
-import androidx.core.content.getSystemService
-import com.example.llama.ui.theme.LlamaAndroidTheme
+import android.util.Log
+import android.widget.EditText
+import android.widget.TextView
+import android.widget.Toast
+import androidx.activity.enableEdgeToEdge
+import androidx.activity.result.contract.ActivityResultContracts
+import androidx.appcompat.app.AppCompatActivity
+import androidx.lifecycle.lifecycleScope
+import androidx.recyclerview.widget.LinearLayoutManager
+import androidx.recyclerview.widget.RecyclerView
+import com.arm.aichat.AiChat
+import com.arm.aichat.InferenceEngine
+import com.arm.aichat.gguf.GgufMetadata
+import com.arm.aichat.gguf.GgufMetadataReader
+import com.google.android.material.floatingactionbutton.FloatingActionButton
+import kotlinx.coroutines.Dispatchers
+import kotlinx.coroutines.flow.onCompletion
+import kotlinx.coroutines.launch
+import kotlinx.coroutines.withContext
 import java.io.File
+import java.io.FileOutputStream
+import java.io.InputStream
+import java.util.UUID

-class MainActivity(
-    activityManager: ActivityManager? = null,
-    downloadManager: DownloadManager? = null,
-    clipboardManager: ClipboardManager? = null,
-): ComponentActivity() {
-    private val tag: String? = this::class.simpleName
+class MainActivity : AppCompatActivity() {

-    private val activityManager by lazy { activityManager ?: getSystemService<ActivityManager>()!! }
-    private val downloadManager by lazy { downloadManager ?: getSystemService<DownloadManager>()!! }
-    private val clipboardManager by lazy { clipboardManager ?: getSystemService<ClipboardManager>()!! }
+    // Android views
+    private lateinit var ggufTv: TextView
+    private lateinit var messagesRv: RecyclerView
+    private lateinit var userInputEt: EditText
+    private lateinit var userActionFab: FloatingActionButton

-    private val viewModel: MainViewModel by viewModels()
+    // Arm AI Chat inference engine
+    private lateinit var engine: InferenceEngine

-    // Get a MemoryInfo object for the device's current memory status.
-    private fun availableMemory(): ActivityManager.MemoryInfo {
-        return ActivityManager.MemoryInfo().also { memoryInfo ->
-            activityManager.getMemoryInfo(memoryInfo)
-        }
-    }
+    // Conversation states
+    private var isModelReady = false
+    private val messages = mutableListOf<Message>()
+    private val lastAssistantMsg = StringBuilder()
+    private val messageAdapter = MessageAdapter(messages)

    override fun onCreate(savedInstanceState: Bundle?) {
        super.onCreate(savedInstanceState)
+        enableEdgeToEdge()
+        setContentView(R.layout.activity_main)

-        StrictMode.setVmPolicy(
-            VmPolicy.Builder(StrictMode.getVmPolicy())
-                .detectLeakedClosableObjects()
-                .build()
-        )
+        // Find views
+        ggufTv = findViewById(R.id.gguf)
+        messagesRv = findViewById(R.id.messages)
+        messagesRv.layoutManager = LinearLayoutManager(this)
+        messagesRv.adapter = messageAdapter
+        userInputEt = findViewById(R.id.user_input)
+        userActionFab = findViewById(R.id.fab)

-        val free = Formatter.formatFileSize(this, availableMemory().availMem)
-        val total = Formatter.formatFileSize(this, availableMemory().totalMem)
-
-        viewModel.log("Current memory: $free / $total")
-        viewModel.log("Downloads directory: ${getExternalFilesDir(null)}")
-
-        val extFilesDir = getExternalFilesDir(null)
-
-        val models = listOf(
-            Downloadable(
-                "Phi-2 7B (Q4_0, 1.6 GiB)",
-                Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf?download=true"),
-                File(extFilesDir, "phi-2-q4_0.gguf"),
-            ),
-            Downloadable(
-                "TinyLlama 1.1B (f16, 2.2 GiB)",
-                Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true"),
-                File(extFilesDir, "tinyllama-1.1-f16.gguf"),
-            ),
-            Downloadable(
-                "Phi 2 DPO (Q3_K_M, 1.48 GiB)",
-                Uri.parse("https://huggingface.co/TheBloke/phi-2-dpo-GGUF/resolve/main/phi-2-dpo.Q3_K_M.gguf?download=true"),
-                File(extFilesDir, "phi-2-dpo.Q3_K_M.gguf")
-            ),
-        )
-
-        setContent {
-            LlamaAndroidTheme {
-                // A surface container using the 'background' color from the theme
-                Surface(
-                    modifier = Modifier.fillMaxSize(),
-                    color = MaterialTheme.colorScheme.background
-                ) {
-                    MainCompose(
-                        viewModel,
-                        clipboardManager,
-                        downloadManager,
-                        models,
-                    )
-                }
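+        // Arm AI Chat initialization. The engine is obtained asynchronously on
+        // Dispatchers.Default, so `engine` may still be unassigned when onCreate returns.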
+        lifecycleScope.launch(Dispatchers.Default) {
+            engine = AiChat.getInferenceEngine(applicationContext)
+        }

+        // When the call-to-action button (FAB) is tapped
+        userActionFab.setOnClickListener {
+            if (isModelReady) {
+                // If model is ready, validate input and send to engine
+                handleUserInput()
+            } else {
+                // Otherwise, prompt the user to select a GGUF model file on the device
+                getContent.launch(arrayOf("*/*"))
            }
        }
    }
-}

-@Composable
-fun MainCompose(
-    viewModel: MainViewModel,
-    clipboard: ClipboardManager,
-    dm: DownloadManager,
-    models: List<Downloadable>
-) {
-    Column {
-        val scrollState = rememberLazyListState()
+    /**
+     * Launcher for the system document picker. The picked Uri is logged and,
+     * when it is not null, handed to [handleSelectedModel].
+     */
+    private val getContent =
+        registerForActivityResult(ActivityResultContracts.OpenDocument()) { selectedUri ->
+            Log.i(TAG, "Selected file uri:\n $selectedUri")
+            if (selectedUri != null) {
+                handleSelectedModel(selectedUri)
+            }
+        }

-        Box(modifier = Modifier.weight(1f)) {
-            LazyColumn(state = scrollState) {
-                items(viewModel.messages) {
-                    Text(
-                        it,
-                        style = MaterialTheme.typography.bodyLarge.copy(color = LocalContentColor.current),
-                        modifier = Modifier.padding(16.dp)
-                    )
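+    /**
+     * Handles the file Uri from the [getContent] result.
+     *
+     * Disables the action button, then on [Dispatchers.IO] parses the GGUF metadata from
+     * [uri], shows it in the metadata text view, copies the file into app storage via
+     * [ensureModelFile], loads it via [loadModel] and finally switches the UI to chat mode.
+     *
+     * NOTE(review): the action button is re-enabled only at the end of the success path.
+     * If any step of the `?.` chain yields null the UI keeps its "Parsing GGUF..." state,
+     * and exceptions thrown inside the launched coroutine are not caught here.
+     */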
+    private fun handleSelectedModel(uri: Uri) {
+        // Update UI states
+        userActionFab.isEnabled = false
+        userInputEt.hint = "Parsing GGUF..."
+        ggufTv.text = "Parsing metadata from selected file \n$uri"
+
+        lifecycleScope.launch(Dispatchers.IO) {
+            // Parse GGUF metadata
+            Log.i(TAG, "Parsing GGUF metadata...")
+            contentResolver.openInputStream(uri)?.use {
+                GgufMetadataReader.create().readStructuredMetadata(it)
+            }?.let { metadata ->
+                // Update UI to show GGUF metadata to user
+                Log.i(TAG, "GGUF parsed: \n$metadata")
+                withContext(Dispatchers.Main) {
+                    ggufTv.text = metadata.toString()
                }
-            }
-        }
-        OutlinedTextField(
-            value = viewModel.message,
-            onValueChange = { viewModel.updateMessage(it) },
-            label = { Text("Message") },
-        )
-        Row {
-            Button({ viewModel.send() }) { Text("Send") }
-            Button({ viewModel.bench(8, 4, 1) }) { Text("Bench") }
-            Button({ viewModel.clear() }) { Text("Clear") }
-            Button({
-                viewModel.messages.joinToString("\n").let {
-                    clipboard.setPrimaryClip(ClipData.newPlainText("", it))
-                }
-            }) { Text("Copy") }
-        }

-        Column {
-            for (model in models) {
-                Downloadable.Button(viewModel, dm, model)
+                // Ensure the model file is available
+                val modelName = metadata.filename() + FILE_EXTENSION_GGUF
+                contentResolver.openInputStream(uri)?.use { input ->
+                    ensureModelFile(modelName, input)
+                }?.let { modelFile ->
+                    loadModel(modelName, modelFile)
+
+                    withContext(Dispatchers.Main) {
+                        isModelReady = true
+                        userInputEt.hint = "Type and send a message!"
+                        userInputEt.isEnabled = true
+                        userActionFab.setImageResource(R.drawable.outline_send_24)
+                        userActionFab.isEnabled = true
+                    }
+                }
            }
        }
    }
+
+    /**
+     * Prepare the model file within the app's private storage.
+     *
+     * Copies [input] into the models directory under [modelName] unless a file with that
+     * name already exists. The data is first written to a sibling `.part` file and renamed
+     * to its final name only after the copy has completed, so an interrupted or failed
+     * copy is never mistaken for a complete model on a later run.
+     *
+     * @param modelName file name (including extension) to store the model under
+     * @param input stream to read the model data from; it is not closed by this function
+     * @return the model [File] inside the models directory
+     * @throws java.io.IOException if writing or renaming the file fails
+     */
+    private suspend fun ensureModelFile(modelName: String, input: InputStream) =
+        withContext(Dispatchers.IO) {
+            val modelsDir = ensureModelsDirectory()
+            File(modelsDir, modelName).also { file ->
+                // Copy the file into local storage if not yet done
+                if (!file.exists()) {
+                    Log.i(TAG, "Start copying file to $modelName")
+                    withContext(Dispatchers.Main) {
+                        userInputEt.hint = "Copying file..."
+                    }
+
+                    val partial = File(modelsDir, "$modelName.part")
+                    try {
+                        FileOutputStream(partial).use { input.copyTo(it) }
+                        if (!partial.renameTo(file)) {
+                            throw java.io.IOException("Failed to move ${partial.name} to $modelName")
+                        }
+                    } finally {
+                        // No-op after a successful rename; removes the leftover otherwise
+                        partial.delete()
+                    }
+                    Log.i(TAG, "Finished copying file to $modelName")
+                } else {
+                    Log.i(TAG, "File already exists $modelName")
+                }
+            }
+        }
+
+    /**
+     * Hands the model stored at [modelFile] to the inference engine.
+     *
+     * Runs on [Dispatchers.IO]; the input hint is updated on the main thread before the
+     * engine call. [modelName] is only used for logging.
+     */
+    private suspend fun loadModel(modelName: String, modelFile: File) =
+        withContext(Dispatchers.IO) {
+            Log.i(TAG, "Loading model $modelName")
+            val modelPath = modelFile.path
+            withContext(Dispatchers.Main) { userInputEt.hint = "Loading model..." }
+            engine.loadModel(modelPath)
+        }
+
+    /**
+     * Validate and send the user message into [InferenceEngine].
+     *
+     * An empty input only shows a toast. Otherwise the input field is cleared, the action
+     * button is disabled, and two rows are appended to the conversation: the user message
+     * and an empty assistant placeholder that is filled token by token while the engine's
+     * flow is collected. The button is re-enabled when the flow completes.
+     *
+     * The message list backs [messageAdapter], so every mutation happens on the main
+     * thread and is followed by the matching adapter notification.
+     */
+    private fun handleUserInput() {
+        val userMsg = userInputEt.text.toString()
+        if (userMsg.isEmpty()) {
+            Toast.makeText(this, "Input message is empty!", Toast.LENGTH_SHORT).show()
+            return
+        }
+
+        userInputEt.text = null
+        userActionFab.isEnabled = false
+
+        // Update message states and tell the adapter about the two new rows
+        val firstInserted = messages.size
+        messages.add(Message(UUID.randomUUID().toString(), userMsg, true))
+        lastAssistantMsg.clear()
+        messages.add(Message(UUID.randomUUID().toString(), lastAssistantMsg.toString(), false))
+        messageAdapter.notifyItemRangeInserted(firstInserted, 2)
+
+        lifecycleScope.launch(Dispatchers.Default) {
+            engine.sendUserPrompt(userMsg)
+                .onCompletion {
+                    withContext(Dispatchers.Main) {
+                        userActionFab.isEnabled = true
+                    }
+                }.collect { token ->
+                    // Touch the adapter-backed list only on the main thread
+                    withContext(Dispatchers.Main) {
+                        val lastIndex = messages.lastIndex
+                        check(lastIndex >= 0 && !messages[lastIndex].isUser)
+
+                        messages[lastIndex] = messages[lastIndex].copy(
+                            content = lastAssistantMsg.append(token).toString()
+                        )
+                        messageAdapter.notifyItemChanged(lastIndex)
+                    }
+                }
+        }
+    }
+
+    /**
+     * Run a benchmark on the inference engine and append the result to the conversation.
+     *
+     * The benchmark itself runs on [Dispatchers.Default]. The result row is added to the
+     * message list on the main thread and announced to the adapter as an insertion.
+     *
+     * @param modelName name used for logging only
+     * @param modelFile not read by this function; the engine benchmarks whatever model
+     *   it currently holds
+     */
+    private suspend fun runBenchmark(modelName: String, modelFile: File) =
+        withContext(Dispatchers.Default) {
+            Log.i(TAG, "Starts benchmarking $modelName")
+            withContext(Dispatchers.Main) {
+                userInputEt.hint = "Running benchmark..."
+            }
+            val result = engine.bench(
+                pp = BENCH_PROMPT_PROCESSING_TOKENS,
+                tg = BENCH_TOKEN_GENERATION_TOKENS,
+                pl = BENCH_SEQUENCE,
+                nr = BENCH_REPETITION
+            )
+            withContext(Dispatchers.Main) {
+                // A new row was appended, so report an insertion rather than a change
+                messages.add(Message(UUID.randomUUID().toString(), result, false))
+                messageAdapter.notifyItemInserted(messages.lastIndex)
+            }
+        }
+
+    /**
+     * Returns the `models` directory under the app's files dir, creating it if it does
+     * not exist. A non-directory entry with the same name is deleted first.
+     */
+    private fun ensureModelsDirectory(): File {
+        val modelsDir = File(filesDir, DIRECTORY_MODELS)
+        if (modelsDir.exists() && !modelsDir.isDirectory) {
+            modelsDir.delete()
+        }
+        if (!modelsDir.exists()) {
+            modelsDir.mkdir()
+        }
+        return modelsDir
+    }
+
+    companion object {
+        private val TAG = MainActivity::class.java.simpleName // Tag passed to the Log.i calls in this class
+
+        private const val DIRECTORY_MODELS = "models" // Sub-directory of filesDir that holds copied model files
+        private const val FILE_EXTENSION_GGUF = ".gguf" // Appended to GgufMetadata.filename() to form the stored file name
+
+        private const val BENCH_PROMPT_PROCESSING_TOKENS = 512 // Passed as `pp` to engine.bench
+        private const val BENCH_TOKEN_GENERATION_TOKENS = 128 // Passed as `tg` to engine.bench
+        private const val BENCH_SEQUENCE = 1 // Passed as `pl` to engine.bench
+        private const val BENCH_REPETITION = 3 // Passed as `nr` to engine.bench
+    }
+}
+
+/**
+ * Derives a file name (without extension) under which this model can be stored.
+ *
+ * Preference order:
+ * 1. `<name>-<sizeLabel>`, or just `<name>` when no size label is present;
+ * 2. `<architecture>-<uuid>`, or `<architecture>-<current millis>` when no uuid is present;
+ * 3. `model-<current millis in hex>`.
+ *
+ * The metadata comes from a user-selected file, so path separators and control
+ * characters are replaced with `_` to keep the result a single path segment. If nothing
+ * usable remains, the generic `model-...` name is returned.
+ *
+ * @return a non-empty name containing no `/`, `\` or control characters
+ */
+fun GgufMetadata.filename(): String {
+    val name = basic.name
+    val arch = architecture?.architecture
+    val candidate = when {
+        name != null -> basic.sizeLabel?.let { size -> "$name-$size" } ?: "$name"
+        arch != null -> basic.uuid?.let { uuid -> "$arch-$uuid" }
+            ?: "$arch-${System.currentTimeMillis()}"
+        else -> "model-${System.currentTimeMillis().toHexString()}"
+    }
+    val sanitized = candidate.replace(Regex("""[/\\\p{Cntrl}]"""), "_").trim()
+    return sanitized.ifEmpty { "model-${System.currentTimeMillis().toHexString()}" }
 }
--- a/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt
+++ b/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt
@@ -1,105 +0,0 @@
-package com.example.llama
-
-import android.llama.cpp.LLamaAndroid
-import android.util.Log
-import androidx.compose.runtime.getValue
-import androidx.compose.runtime.mutableStateOf
-import androidx.compose.runtime.setValue
-import androidx.lifecycle.ViewModel
-import androidx.lifecycle.viewModelScope
-import kotlinx.coroutines.flow.catch
-import kotlinx.coroutines.launch
-
-class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instance()): ViewModel() {
-    companion object {
-        @JvmStatic
-        private val NanosPerSecond = 1_000_000_000.0
-    }
-
-    private val tag: String? = this::class.simpleName
-
-    var messages by mutableStateOf(listOf("Initializing..."))
-        private set
-
-    var message by mutableStateOf("")
-        private set
-
-    override fun onCleared() {
-        super.onCleared()
-
-        viewModelScope.launch {
-            try {
-                llamaAndroid.unload()
-            } catch (exc: IllegalStateException) {
-                messages += exc.message!!
-            }
-        }
-    }
-
-    fun send() {
-        val text = message
-        message = ""
-
-        // Add to messages console.
-        messages += text
-        messages += ""
-
-        viewModelScope.launch {
-            llamaAndroid.send(text)
-                .catch {
-                    Log.e(tag, "send() failed", it)
-                    messages += it.message!!
-                }
-                .collect { messages = messages.dropLast(1) + (messages.last() + it) }
-        }
-    }
-
-    fun bench(pp: Int, tg: Int, pl: Int, nr: Int = 1) {
-        viewModelScope.launch {
-            try {
-                val start = System.nanoTime()
-                val warmupResult = llamaAndroid.bench(pp, tg, pl, nr)
-                val end = System.nanoTime()
-
-                messages += warmupResult
-
-                val warmup = (end - start).toDouble() / NanosPerSecond
-                messages += "Warm up time: $warmup seconds, please wait..."
-
-                if (warmup > 5.0) {
-                    messages += "Warm up took too long, aborting benchmark"
-                    return@launch
-                }
-
-                messages += llamaAndroid.bench(512, 128, 1, 3)
-            } catch (exc: IllegalStateException) {
-                Log.e(tag, "bench() failed", exc)
-                messages += exc.message!!
-            }
-        }
-    }
-
-    fun load(pathToModel: String) {
-        viewModelScope.launch {
-            try {
-                llamaAndroid.load(pathToModel)
-                messages += "Loaded $pathToModel"
-            } catch (exc: IllegalStateException) {
-                Log.e(tag, "load() failed", exc)
-                messages += exc.message!!
-            }
-        }
-    }
-
-    fun updateMessage(newMessage: String) {
-        message = newMessage
-    }
-
-    fun clear() {
-        messages = listOf()
-    }
-
-    fun log(message: String) {
-        messages += message
-    }
-}
--- a/examples/llama.android/app/src/main/java/com/example/llama/MessageAdapter.kt
+++ b/examples/llama.android/app/src/main/java/com/example/llama/MessageAdapter.kt
@@ -0,0 +1,51 @@
+package com.example.llama
+
+import android.view.LayoutInflater
+import android.view.View
+import android.view.ViewGroup
+import android.widget.TextView
+import androidx.recyclerview.widget.RecyclerView
+
+data class Message( // One conversation entry rendered by MessageAdapter
+    val id: String, // Caller-supplied identifier; not read by MessageAdapter
+    val content: String, // Text bound to the row's msg_content TextView
+    val isUser: Boolean // true selects the user row type, false the assistant row type
+)
+
+/**
+ * RecyclerView adapter that renders a list of [Message]s, using one row layout for user
+ * messages and another for assistant messages.
+ *
+ * The adapter keeps a reference to [messages] rather than a copy. The owner mutates that
+ * list and must call the matching `notifyItem*` method after each change.
+ */
+class MessageAdapter(
+    private val messages: List<Message>
+) : RecyclerView.Adapter<RecyclerView.ViewHolder>() {
+
+    companion object {
+        private const val VIEW_TYPE_USER = 1
+        private const val VIEW_TYPE_ASSISTANT = 2
+    }
+
+    /** Maps the message at [position] to the user or assistant view type. */
+    override fun getItemViewType(position: Int): Int =
+        if (messages[position].isUser) VIEW_TYPE_USER else VIEW_TYPE_ASSISTANT
+
+    /** Inflates the row layout matching [viewType] and wraps it in its view holder. */
+    override fun onCreateViewHolder(parent: ViewGroup, viewType: Int): RecyclerView.ViewHolder {
+        val layoutInflater = LayoutInflater.from(parent.context)
+        return if (viewType == VIEW_TYPE_USER) {
+            UserMessageViewHolder(
+                layoutInflater.inflate(R.layout.item_message_user, parent, false)
+            )
+        } else {
+            AssistantMessageViewHolder(
+                layoutInflater.inflate(R.layout.item_message_assistant, parent, false)
+            )
+        }
+    }
+
+    /** Writes the message text into the TextView cached by the holder. */
+    override fun onBindViewHolder(holder: RecyclerView.ViewHolder, position: Int) {
+        val content = messages[position].content
+        when (holder) {
+            is UserMessageViewHolder -> holder.contentView.text = content
+            is AssistantMessageViewHolder -> holder.contentView.text = content
+        }
+    }
+
+    override fun getItemCount(): Int = messages.size
+
+    /** Holder for a user row; looks up the content TextView once at creation. */
+    class UserMessageViewHolder(view: View) : RecyclerView.ViewHolder(view) {
+        val contentView: TextView = view.findViewById(R.id.msg_content)
+    }
+
+    /** Holder for an assistant row; looks up the content TextView once at creation. */
+    class AssistantMessageViewHolder(view: View) : RecyclerView.ViewHolder(view) {
+        val contentView: TextView = view.findViewById(R.id.msg_content)
+    }
+}
--- a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Color.kt
+++ b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Color.kt
@@ -1,11 +0,0 @@
-package com.example.llama.ui.theme
-
-import androidx.compose.ui.graphics.Color
-
-val Purple80 = Color(0xFFD0BCFF)
-val PurpleGrey80 = Color(0xFFCCC2DC)
-val Pink80 = Color(0xFFEFB8C8)
-
-val Purple40 = Color(0xFF6650a4)
-val PurpleGrey40 = Color(0xFF625b71)
-val Pink40 = Color(0xFF7D5260)
--- a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Theme.kt
+++ b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Theme.kt
@@ -1,70 +0,0 @@
-package com.example.llama.ui.theme
-
-import android.app.Activity
-import android.os.Build
-import androidx.compose.foundation.isSystemInDarkTheme
-import androidx.compose.material3.MaterialTheme
-import androidx.compose.material3.darkColorScheme
-import androidx.compose.material3.dynamicDarkColorScheme
-import androidx.compose.material3.dynamicLightColorScheme
-import androidx.compose.material3.lightColorScheme
-import androidx.compose.runtime.Composable
-import androidx.compose.runtime.SideEffect
-import androidx.compose.ui.graphics.toArgb
-import androidx.compose.ui.platform.LocalContext
-import androidx.compose.ui.platform.LocalView
-import androidx.core.view.WindowCompat
-
-private val DarkColorScheme = darkColorScheme(
-    primary = Purple80,
-    secondary = PurpleGrey80,
-    tertiary = Pink80
-)
-
-private val LightColorScheme = lightColorScheme(
-    primary = Purple40,
-    secondary = PurpleGrey40,
-    tertiary = Pink40
-
-    /* Other default colors to override
-    background = Color(0xFFFFFBFE),
-    surface = Color(0xFFFFFBFE),
-    onPrimary = Color.White,
-    onSecondary = Color.White,
-    onTertiary = Color.White,
-    onBackground = Color(0xFF1C1B1F),
-    onSurface = Color(0xFF1C1B1F),
-    */
-)
-
-@Composable
-fun LlamaAndroidTheme(
-    darkTheme: Boolean = isSystemInDarkTheme(),
-    // Dynamic color is available on Android 12+
-    dynamicColor: Boolean = true,
-    content: @Composable () -> Unit
-) {
-    val colorScheme = when {
-        dynamicColor && Build.VERSION.SDK_INT >= Build.VERSION_CODES.S -> {
-            val context = LocalContext.current
-            if (darkTheme) dynamicDarkColorScheme(context) else dynamicLightColorScheme(context)
-        }
-
-        darkTheme -> DarkColorScheme
-        else -> LightColorScheme
-    }
-    val view = LocalView.current
-    if (!view.isInEditMode) {
-        SideEffect {
-            val window = (view.context as Activity).window
-            window.statusBarColor = colorScheme.primary.toArgb()
-            WindowCompat.getInsetsController(window, view).isAppearanceLightStatusBars = darkTheme
-        }
-    }
-
-    MaterialTheme(
-        colorScheme = colorScheme,
-        typography = Typography,
-        content = content
-    )
-}
--- a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Type.kt
+++ b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Type.kt
@@ -1,34 +0,0 @@
-package com.example.llama.ui.theme
-
-import androidx.compose.material3.Typography
-import androidx.compose.ui.text.TextStyle
-import androidx.compose.ui.text.font.FontFamily
-import androidx.compose.ui.text.font.FontWeight
-import androidx.compose.ui.unit.sp
-
-// Set of Material typography styles to start with
-val Typography = Typography(
-    bodyLarge = TextStyle(
-        fontFamily = FontFamily.Default,
-        fontWeight = FontWeight.Normal,
-        fontSize = 16.sp,
-        lineHeight = 24.sp,
-        letterSpacing = 0.5.sp
-    )
-    /* Other default text styles to override
-    titleLarge = TextStyle(
-        fontFamily = FontFamily.Default,
-        fontWeight = FontWeight.Normal,
-        fontSize = 22.sp,
-        lineHeight = 28.sp,
-        letterSpacing = 0.sp
-    ),
-    labelSmall = TextStyle(
-        fontFamily = FontFamily.Default,
-        fontWeight = FontWeight.Medium,
-        fontSize = 11.sp,
-        lineHeight = 16.sp,
-        letterSpacing = 0.5.sp
-    )
-    */
-)
--- a/Show More
+++ b/Show More