Fixes a Qwen2.5VL segfault during inference that occurs with https://github.com/ggml-org/llama.cpp/pull/12402, because the has_qwen2vl_merger migration was incomplete.

2026-04-23 16:37:33 +03:00 · 2025-04-27 18:36:57 +08:00
326 changed files with 6363 additions and 9957 deletions
--- a/.devops/cpu.Dockerfile
+++ b/.devops/cpu.Dockerfile
@@ -14,9 +14,9 @@ WORKDIR /app
 COPY . .

 RUN if [ "$TARGETARCH" = "amd64" ]; then \
-        cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
+        cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
    elif [ "$TARGETARCH" = "arm64" ]; then \
-        cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_CPU_ARM_ARCH=${GGML_CPU_ARM_ARCH}; \
+        cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=${GGML_CPU_ARM_ARCH}; \
    else \
        echo "Unsupported architecture"; \
        exit 1; \
--- a/.devops/cuda.Dockerfile
+++ b/.devops/cuda.Dockerfile
@@ -21,7 +21,7 @@ COPY . .
 RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
    export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
    fi && \
-    cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
    cmake --build build --config Release -j$(nproc)

 RUN mkdir -p /app/lib && \
--- a/.devops/intel.Dockerfile
+++ b/.devops/intel.Dockerfile
@@ -17,7 +17,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
        && export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
    fi && \
    echo "Building with dynamic libs" && \
-    cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${OPT_SYCL_F16} && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON ${OPT_SYCL_F16} && \
    cmake --build build --config Release -j$(nproc)

 RUN mkdir -p /app/lib && \
--- a/.devops/llama-cli-cann.Dockerfile
+++ b/.devops/llama-cli-cann.Dockerfile
@@ -22,7 +22,7 @@ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH

 RUN echo "Building with static libs" && \
    source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
-    cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_TESTS=OFF  && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF  && \
    cmake --build build --config Release --target llama-cli

 # TODO: use image with NNRT
--- a/.devops/musa.Dockerfile
+++ b/.devops/musa.Dockerfile
@@ -35,7 +35,7 @@ COPY . .
 RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
        export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
    fi && \
-    cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
    cmake --build build --config Release -j$(nproc)

 RUN mkdir -p /app/lib && \
--- a/.devops/rocm.Dockerfile
+++ b/.devops/rocm.Dockerfile
@@ -40,7 +40,7 @@ WORKDIR /app
 COPY . .

 RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
-    cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_BUILD_TESTS=OFF \
+    cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON \
    && cmake --build build --config Release -j$(nproc)

 RUN mkdir -p /app/lib \
--- a/.devops/vulkan.Dockerfile
+++ b/.devops/vulkan.Dockerfile
@@ -16,7 +16,7 @@ WORKDIR /app

 COPY . .

-RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1  -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON && \
+RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_CURL=1 -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON && \
    cmake --build build --config Release -j$(nproc)

 RUN mkdir -p /app/lib && \
--- a/.editorconfig
+++ b/.editorconfig
@@ -21,15 +21,15 @@ indent_style = tab
 [prompts/*.txt]
 insert_final_newline = unset

-[tools/server/public/*]
+[examples/server/public/*]
 indent_size = 2

-[tools/server/public/deps_*]
+[examples/server/public/deps_*]
 trim_trailing_whitespace = unset
 indent_style = unset
 indent_size = unset

-[tools/server/deps_*]
+[examples/server/deps_*]
 trim_trailing_whitespace = unset
 indent_style = unset
 indent_size = unset
@@ -37,7 +37,7 @@ indent_size = unset
 [examples/llama.swiftui/llama.swiftui.xcodeproj/*]
 indent_style = tab

-[tools/cvector-generator/*.txt]
+[examples/cvector-generator/*.txt]
 trim_trailing_whitespace = unset
 insert_final_newline = unset

--- a/.flake8
+++ b/.flake8
@@ -2,9 +2,8 @@
 max-line-length = 125
 ignore = E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503
 exclude =
-    # Do not traverse examples and tools
+    # Do not traverse examples
    examples,
-    tools,
    # Do not include package initializers
    __init__.py,
    # No need to traverse our git directory
--- a/.github/actions/get-tag-name/action.yml
+++ b/.github/actions/get-tag-name/action.yml
@@ -1,22 +0,0 @@
-name: "Determine tag name"
-description: "Determine the tag name to use for a release"
-outputs:
-  name:
-    description: "The name of the tag"
-    value: ${{ steps.tag.outputs.name }}
-
-runs:
-  using: "composite"
-  steps:
-    - name: Determine tag name
-      id: tag
-      shell: bash
-      run: |
-        BUILD_NUMBER="$(git rev-list --count HEAD)"
-        SHORT_HASH="$(git rev-parse --short=7 HEAD)"
-        if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
-          echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
-        else
-          SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
-          echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
-        fi
--- a/.github/actions/windows-setup-cuda/action.yml
+++ b/.github/actions/windows-setup-cuda/action.yml
@@ -1,67 +0,0 @@
-name: "Windows - Setup CUDA Toolkit"
-description: "Setup CUDA Toolkit for Windows"
-inputs:
-  cuda_version:
-    description: "CUDA toolkit version"
-    required: true
-
-runs:
-  using: "composite"
-  steps:
-    - name: Install Cuda Toolkit 11.7
-      if: ${{ inputs.cuda_version == '11.7' }}
-      shell: pwsh
-      run: |
-          mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
-          choco install unzip -y
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-11.7.99-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-11.7.99-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-11.7.99-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-11.7.4.6-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-11.7.91-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-11.7.91-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-11.7.101-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-11.7.91-archive.zip"
-          unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cudart-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvcc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvrtc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libcublas-windows-x86_64-11.7.4.6-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvtx-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\visual_studio_integration-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvprof-windows-x86_64-11.7.101-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cccl-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
-          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-          echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
-          echo "CUDA_PATH_V11_7=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
-
-    - name: Install Cuda Toolkit 12.4
-      if: ${{ inputs.cuda_version == '12.4' }}
-      shell: pwsh
-      run: |
-          mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
-          choco install unzip -y
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.4.127-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-12.4.131-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-12.4.127-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.4.5.8-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-12.4.127-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-12.4.127-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-12.4.127-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-12.4.127-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-12.4.127-archive.zip"
-          unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cudart-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvcc-windows-x86_64-12.4.131-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvrtc-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libcublas-windows-x86_64-12.4.5.8-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvtx-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_profiler_api-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\visual_studio_integration-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvprof-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cccl-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
-          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-          echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
-          echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -45,9 +45,7 @@ build:
            - CMakePresets.json
 examples:
    - changed-files:
-        - any-glob-to-any-file:
-            - examples/**
-            - tools/**
+        - any-glob-to-any-file: examples/**
 devops:
    - changed-files:
        - any-glob-to-any-file:
@@ -72,7 +70,7 @@ android:
 server:
    - changed-files:
        - any-glob-to-any-file:
-            - tools/server/**
+            - examples/server/**
 ggml:
    - changed-files:
        - any-glob-to-any-file:
--- a/.github/workflows/bench.yml.disabled
+++ b/.github/workflows/bench.yml.disabled
@@ -27,10 +27,10 @@ on:
  push:
    branches:
      - master
-    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'tools/server/*.h*', 'tools/server/*.cpp']
+    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
  pull_request_target:
    types: [opened, synchronize, reopened]
-    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'tools/server/*.h*', 'tools/server/*.cpp']
+    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
  schedule:
    -  cron: '04 2 * * *'

@@ -69,7 +69,7 @@ jobs:
      - name: Install python env
        id: pipenv
        run: |
-          cd tools/server/bench
+          cd examples/server/bench
          python3 -m venv venv
          source venv/bin/activate
          pip install -r requirements.txt
@@ -79,7 +79,7 @@ jobs:
        run: |
          wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
          tar xzf prometheus*.tar.gz --strip-components=1
-          ./prometheus --config.file=tools/server/bench/prometheus.yml &
+          ./prometheus --config.file=examples/server/bench/prometheus.yml &
          while ! nc -z localhost 9090; do
            sleep 0.1
          done
@@ -92,7 +92,7 @@ jobs:
      - name: Install k6 and xk6-sse
        id: k6_installation
        run: |
-          cd tools/server/bench
+          cd examples/server/bench
          go install go.k6.io/xk6/cmd/xk6@latest
          xk6 build master \
              --with github.com/phymbert/xk6-sse
@@ -116,7 +116,7 @@ jobs:
      - name: Download the dataset
        id: download_dataset
        run: |
-          cd tools/server/bench
+          cd examples/server/bench
          wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

      - name: Server bench
@@ -126,7 +126,7 @@ jobs:
        run: |
          set -eux

-          cd tools/server/bench
+          cd examples/server/bench
          source venv/bin/activate
          python bench.py \
              --runner-label ${{ env.RUNNER_LABEL }} \
@@ -157,9 +157,9 @@ jobs:
          name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          compression-level: 9
          path: |
-            tools/server/bench/*.jpg
-            tools/server/bench/*.json
-            tools/server/bench/*.log
+            examples/server/bench/*.jpg
+            examples/server/bench/*.json
+            examples/server/bench/*.log

      - name: Commit status
        uses: Sibz/github-status-action@v1
@@ -178,17 +178,17 @@ jobs:
        with:
          client_id: ${{secrets.IMGUR_CLIENT_ID}}
          path: |
-            tools/server/bench/prompt_tokens_seconds.jpg
-            tools/server/bench/predicted_tokens_seconds.jpg
-            tools/server/bench/kv_cache_usage_ratio.jpg
-            tools/server/bench/requests_processing.jpg
+            examples/server/bench/prompt_tokens_seconds.jpg
+            examples/server/bench/predicted_tokens_seconds.jpg
+            examples/server/bench/kv_cache_usage_ratio.jpg
+            examples/server/bench/requests_processing.jpg

      - name: Extract mermaid
        id: set_mermaid
        run: |
          set -eux

-          cd tools/server/bench
+          cd examples/server/bench
          PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid)
          echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
          echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV
--- a/.github/workflows/build-linux-cross.yml
+++ b/.github/workflows/build-linux-cross.yml
@@ -4,25 +4,18 @@ on:
  workflow_call:

 jobs:
-  ubuntu-24-riscv64-cpu-cross:
-    runs-on: ubuntu-24.04
+  ubuntu-latest-riscv64-cpu-cross:
+    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4
      - name: Setup Riscv
        run: |
          sudo dpkg --add-architecture riscv64
-
-          # Add arch-specific repositories for non-amd64 architectures
-          cat << EOF | sudo tee /etc/apt/sources.list.d/riscv64-ports.list
-          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
-          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
-          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
-          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
-          EOF
-
-          sudo apt-get update || true    ;# Prevent failure due to missing URLs.
-
+          sudo sed -i 's|http://azure.archive.ubuntu.com/ubuntu|http://ports.ubuntu.com/ubuntu-ports|g' \
+                 /etc/apt/sources.list /etc/apt/apt-mirrors.txt
+          sudo apt-get clean
+          sudo apt-get update
          sudo apt-get install -y --no-install-recommends \
                  build-essential \
                  gcc-14-riscv64-linux-gnu \
@@ -34,7 +27,6 @@ jobs:
          cmake -B build -DCMAKE_BUILD_TYPE=Release \
                         -DGGML_OPENMP=OFF \
                         -DLLAMA_BUILD_EXAMPLES=ON \
-                         -DLLAMA_BUILD_TOOLS=ON \
                         -DLLAMA_BUILD_TESTS=OFF \
                         -DCMAKE_SYSTEM_NAME=Linux \
                         -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
@@ -48,25 +40,21 @@ jobs:

          cmake --build build --config Release -j $(nproc)

-  ubuntu-24-riscv64-vulkan-cross:
-    runs-on: ubuntu-24.04
+  ubuntu-latest-riscv64-vulkan-cross:
+    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
      - name: Setup Riscv
        run: |
          sudo dpkg --add-architecture riscv64
-
-          # Add arch-specific repositories for non-amd64 architectures
-          cat << EOF | sudo tee /etc/apt/sources.list.d/riscv64-ports.list
-          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
-          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
-          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
-          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
-          EOF
-
-          sudo apt-get update || true    ;# Prevent failure due to missing URLs.
-
+          sudo sed -i 's|http://azure.archive.ubuntu.com/ubuntu|http://ports.ubuntu.com/ubuntu-ports|g' \
+                 /etc/apt/sources.list /etc/apt/apt-mirrors.txt
+          sudo apt-get clean
+          sudo apt-get update
          sudo apt-get install -y --no-install-recommends \
                  build-essential \
                  glslc \
@@ -81,7 +69,6 @@ jobs:
                         -DGGML_VULKAN=ON \
                         -DGGML_OPENMP=OFF \
                         -DLLAMA_BUILD_EXAMPLES=ON \
-                         -DLLAMA_BUILD_TOOLS=ON \
                         -DLLAMA_BUILD_TESTS=OFF \
                         -DCMAKE_SYSTEM_NAME=Linux \
                         -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
@@ -95,25 +82,21 @@ jobs:

          cmake --build build --config Release -j $(nproc)

-  ubuntu-24-arm64-vulkan-cross:
-    runs-on: ubuntu-24.04
+  ubuntu-latest-arm64-vulkan-cross:
+    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
      - name: Setup Arm64
        run: |
          sudo dpkg --add-architecture arm64
-
-          # Add arch-specific repositories for non-amd64 architectures
-          cat << EOF | sudo tee /etc/apt/sources.list.d/arm64-ports.list
-          deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
-          deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
-          deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
-          deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
-          EOF
-
-          sudo apt-get update || true    ;# Prevent failure due to missing URLs.
-
+          sudo sed -i 's|http://azure.archive.ubuntu.com/ubuntu|http://ports.ubuntu.com/ubuntu-ports|g' \
+                 /etc/apt/sources.list /etc/apt/apt-mirrors.txt
+          sudo apt-get clean
+          sudo apt-get update
          sudo apt-get install -y --no-install-recommends \
                  build-essential \
                  glslc \
@@ -127,7 +110,6 @@ jobs:
                         -DGGML_VULKAN=ON \
                         -DGGML_OPENMP=OFF \
                         -DLLAMA_BUILD_EXAMPLES=ON \
-                         -DLLAMA_BUILD_TOOLS=ON \
                         -DLLAMA_BUILD_TESTS=OFF \
                         -DCMAKE_SYSTEM_NAME=Linux \
                         -DCMAKE_SYSTEM_PROCESSOR=aarch64 \
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -36,14 +36,10 @@ jobs:
      matrix:
        config:
          # Multi-stage build
-          # Note: the arm64 images are failing, which prevents the amd64 images from being built
-          # https://github.com/ggml-org/llama.cpp/issues/11888
-          #- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: false }
-          - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
+          - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: false }
          - { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
          - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
-          # Note: the intel images are failing due to an out of disk space error
-          # - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
+          - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
          - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
          # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
          #- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: true }
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -1,710 +0,0 @@
-name: Create Release
-
-on:
-  workflow_dispatch: # allows manual triggering
-    inputs:
-      create_release:
-        description: 'Create new release'
-        required: true
-        type: boolean
-  push:
-    branches:
-      - master
-    paths: ['.github/workflows/release.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-# Fine-grant permission
-# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
-permissions:
-  contents: write # for creating release
-
-env:
-  BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
-  CMAKE_ARGS: "-DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=ON -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON"
-
-jobs:
-  macOS-arm64:
-    runs-on: macos-14
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
-        with:
-          key: macOS-latest-cmake-arm64
-          evict-old-files: 1d
-
-      - name: Dependencies
-        id: depends
-        continue-on-error: true
-        run: |
-          brew update
-          brew install curl
-
-      - name: Build
-        id: cmake_build
-        run: |
-          sysctl -a
-          cmake -B build \
-            -DCMAKE_BUILD_RPATH="@loader_path" \
-            -DLLAMA_FATAL_WARNINGS=ON \
-            -DGGML_METAL_USE_BF16=ON \
-            -DGGML_METAL_EMBED_LIBRARY=ON \
-            -DGGML_RPC=ON \
-            ${{ env.CMAKE_ARGS }}
-          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        run: |
-          cp LICENSE ./build/bin/
-          zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
-          name: llama-bin-macos-arm64.zip
-
-  macOS-x64:
-    runs-on: macos-13
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
-        with:
-          key: macOS-latest-cmake-x64
-          evict-old-files: 1d
-
-      - name: Dependencies
-        id: depends
-        continue-on-error: true
-        run: |
-          brew update
-          brew install curl
-
-      - name: Build
-        id: cmake_build
-        run: |
-          sysctl -a
-          # Metal is disabled due to intermittent failures with Github runners not having a GPU:
-          # https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
-          cmake -B build \
-            -DCMAKE_BUILD_RPATH="@loader_path" \
-            -DLLAMA_FATAL_WARNINGS=ON \
-            -DGGML_METAL=OFF \
-            -DGGML_RPC=ON
-          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        run: |
-          cp LICENSE ./build/bin/
-          zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
-          name: llama-bin-macos-x64.zip
-
-  ubuntu-22-cpu:
-    strategy:
-      matrix:
-        include:
-          - build: 'x64'
-            os: ubuntu-22.04
-          - build: 'arm64'
-            os: ubuntu-22.04-arm
-
-    runs-on: ${{ matrix.os }}
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
-        with:
-          key: ubuntu-cpu-cmake
-          evict-old-files: 1d
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential libcurl4-openssl-dev
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cmake -B build \
-            -DLLAMA_FATAL_WARNINGS=ON \
-            ${{ env.CMAKE_ARGS }}
-          cmake --build build --config Release -j $(nproc)
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        run: |
-          cp LICENSE ./build/bin/
-          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/*
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip
-          name: llama-bin-ubuntu-${{ matrix.build }}.zip
-
-  ubuntu-22-vulkan:
-    runs-on: ubuntu-22.04
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
-        with:
-          key: ubuntu-22-cmake-vulkan
-          evict-old-files: 1d
-
-      - name: Dependencies
-        id: depends
-        run: |
-          wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
-          sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
-          sudo apt-get update -y
-          sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libcurl4-openssl-dev
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cmake -B build \
-            -DGGML_VULKAN=ON \
-            ${{ env.CMAKE_ARGS }}
-          cmake --build build --config Release -j $(nproc)
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        run: |
-          cp LICENSE ./build/bin/
-          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip ./build/bin/*
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip
-          name: llama-bin-ubuntu-vulkan-x64.zip
-
-  windows:
-    runs-on: windows-latest
-
-    env:
-      OPENBLAS_VERSION: 0.3.23
-      VULKAN_VERSION: 1.4.309.0
-
-    strategy:
-      matrix:
-        include:
-          - build: 'cpu-x64'
-            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF'
-          #- build: 'openblas-x64'
-          #  defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
-          - build: 'vulkan-x64'
-            defines: '-DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
-          - build: 'cpu-arm64'
-            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF'
-          - build: 'opencl-adreno-arm64'
-            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
-        with:
-          key: windows-latest-cmake-${{ matrix.build }}
-          variant: ccache
-          evict-old-files: 1d
-
-      - name: Download OpenBLAS
-        id: get_openblas
-        if: ${{ matrix.build == 'openblas-x64' }}
-        run: |
-          curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip"
-          curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE"
-          mkdir $env:RUNNER_TEMP/openblas
-          tar.exe -xvf $env:RUNNER_TEMP/openblas.zip -C $env:RUNNER_TEMP/openblas
-          $vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath)
-          $msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim()))
-          $lib =  $(join-path $msvc 'bin\Hostx64\x64\lib.exe')
-          & $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll
-
-      - name: Install Vulkan SDK
-        id: get_vulkan
-        if: ${{ matrix.build == 'vulkan-x64' }}
-        run: |
-          curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
-          & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
-          Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
-          Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
-
-      - name: Install Ninja
-        id: install_ninja
-        run: |
-          choco install ninja
-
-      - name: Install OpenCL Headers and Libs
-        id: install_opencl
-        if: ${{ matrix.build == 'opencl-adreno-arm64' }}
-        run: |
-          git clone https://github.com/KhronosGroup/OpenCL-Headers
-          cd OpenCL-Headers
-          cmake -B build `
-            -DBUILD_TESTING=OFF `
-            -DOPENCL_HEADERS_BUILD_TESTING=OFF `
-            -DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
-            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
-          cmake --build build --target install
-          git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
-          cd OpenCL-ICD-Loader
-          cmake -B build-arm64-release `
-            -A arm64 `
-            -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" `
-            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
-          cmake --build build-arm64-release --target install --config release
-
-      - name: libCURL
-        id: get_libcurl
-        uses: ./.github/actions/windows-setup-curl
-
-      - name: Build
-        id: cmake_build
-        env:
-          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
-        run: |
-          cmake -S . -B build ${{ matrix.defines }} `
-            -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" `
-            ${{ env.CMAKE_ARGS }}
-          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
-
-      - name: Add libopenblas.dll
-        id: add_libopenblas_dll
-        if: ${{ matrix.build == 'openblas-x64' }}
-        run: |
-          cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll
-          cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        env:
-          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
-        run: |
-          Copy-Item $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll
-          7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
-          name: llama-bin-win-${{ matrix.build }}.zip
-
-  windows-cuda:
-    runs-on: windows-2019
-
-    strategy:
-      matrix:
-        cuda: ['12.4', '11.7']
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-            fetch-depth: 0
-
-      - name: Install ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
-        with:
-          key: windows-cuda-${{ matrix.cuda }}
-          variant: ccache
-          evict-old-files: 1d
-
-      - name: Install Cuda Toolkit
-        uses: ./.github/actions/windows-setup-cuda
-        with:
-          cuda_version: ${{ matrix.cuda }}
-
-      - name: Install Ninja
-        id: install_ninja
-        run: |
-          choco install ninja
-
-      - name: libCURL
-        id: get_libcurl
-        uses: ./.github/actions/windows-setup-curl
-
-      - name: Build
-        id: cmake_build
-        shell: cmd
-        env:
-          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
-        run: |
-          call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
-          cmake -S . -B build -G "Ninja Multi-Config" ^
-            -DGGML_NATIVE=OFF ^
-            -DGGML_BACKEND_DL=ON ^
-            -DGGML_CPU_ALL_VARIANTS=ON ^
-            -DGGML_CUDA=ON ^
-            -DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include" ^
-            ${{ env.CMAKE_ARGS }}
-          set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
-          cmake --build build --config Release -j %NINJA_JOBS% -t ggml
-          cmake --build build --config Release
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        env:
-          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
-        run: |
-          cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll
-          7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\*
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
-          name: llama-bin-win-cu${{ matrix.cuda }}-x64.zip
-
-      - name: Copy and pack Cuda runtime
-        if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-        run: |
-          echo "Cuda install location: ${{ env.CUDA_PATH }}"
-          $dst='.\build\bin\cudart\'
-          robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
-          robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
-          7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\*
-
-      - name: Upload Cuda runtime
-        uses: actions/upload-artifact@v4
-        with:
-          path: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
-          name: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
-
-  windows-sycl:
-    runs-on: windows-latest
-
-    defaults:
-      run:
-        shell: bash
-
-    env:
-      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b380d914-366b-4b77-a74a-05e3c38b3514/intel-oneapi-base-toolkit-2025.0.0.882_offline.exe
-      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
-      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
-        with:
-          key: windows-latest-cmake-sycl
-          variant: ccache
-          evict-old-files: 1d
-
-      - name: Install
-        run:  |
-          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
-
-      # TODO: add libcurl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
-
-      - name: Build
-        id: cmake_build
-        run:  examples/sycl/win-build-sycl.bat
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-
-      - name: Build the release package
-        id: pack_artifacts
-        run: |
-          echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
-
-          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
-
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
-
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
-
-          cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
-
-          echo "cp oneAPI running time dll files to ./build/bin done"
-          7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
-
-      - name: Upload the release package
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
-          name: llama-bin-win-sycl-x64.zip
-
-  windows-hip:
-    runs-on: windows-latest
-
-    strategy:
-      matrix:
-        gpu_target: [gfx1100, gfx1101, gfx1030]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-            fetch-depth: 0
-
-      - name: Clone rocWMMA repository
-        id: clone_rocwmma
-        run: |
-          git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1
-
-      - name: ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
-        with:
-          key: windows-latest-cmake-hip-release
-          evict-old-files: 1d
-
-      - name: Install
-        id: depends
-        run: |
-          $ErrorActionPreference = "Stop"
-          write-host "Downloading AMD HIP SDK Installer"
-          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
-          write-host "Installing AMD HIP SDK"
-          Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
-          write-host "Completed AMD HIP SDK installation"
-
-      - name: Verify ROCm
-        id: verify
-        run: |
-          & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
-
-      - name: libCURL
-        id: get_libcurl
-        uses: ./.github/actions/windows-setup-curl
-
-      - name: Build
-        id: cmake_build
-        env:
-          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
-        run: |
-          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
-          $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
-          cmake -G "Unix Makefiles" -B build -S . `
-            -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
-            -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-            -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/" `
-            -DCMAKE_BUILD_TYPE=Release `
-            -DAMDGPU_TARGETS=${{ matrix.gpu_target }} `
-            -DGGML_HIP_ROCWMMA_FATTN=ON `
-            -DGGML_HIP=ON `
-            -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" `
-            ${{ env.CMAKE_ARGS }}
-          cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
-          md "build\bin\rocblas\library\"
-          cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
-          cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
-          cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        env:
-          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
-        run: |
-          cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\libcurl-x64.dll
-          7z a llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\*
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
-          name: llama-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
-
-  ios-xcode-build:
-    runs-on: macos-latest
-
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: Build
-        id: cmake_build
-        run: |
-          sysctl -a
-          cmake -B build -G Xcode \
-            -DGGML_METAL_USE_BF16=ON \
-            -DGGML_METAL_EMBED_LIBRARY=ON \
-            -DLLAMA_CURL=OFF \
-            -DLLAMA_BUILD_EXAMPLES=OFF \
-            -DLLAMA_BUILD_TOOLS=OFF \
-            -DLLAMA_BUILD_TESTS=OFF \
-            -DLLAMA_BUILD_SERVER=OFF \
-            -DCMAKE_SYSTEM_NAME=iOS \
-            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
-            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
-          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
-
-      - name: xcodebuild for swift package
-        id: xcodebuild
-        run: |
-          ./build-xcframework.sh
-
-      - name: Build Xcode project
-        run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        run: |
-          zip --symlinks -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-xcframework.zip
-          name: llama-${{ steps.tag.outputs.name }}-xcframework
-
-  release:
-    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-
-    runs-on: ubuntu-latest
-
-    needs:
-      - ubuntu-22-cpu
-      - ubuntu-22-vulkan
-      - windows
-      - windows-cuda
-      - windows-sycl
-      - windows-hip
-      - macOS-arm64
-      - macOS-x64
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-
-      - name: Download artifacts
-        id: download-artifact
-        uses: actions/download-artifact@v4
-        with:
-          path: ./artifact
-
-      - name: Move artifacts
-        id: move_artifacts
-        run: mkdir -p ./artifact/release && mv ./artifact/*/*.zip ./artifact/release
-
-      - name: Create release
-        id: create_release
-        uses: ggml-org/action-create-release@v1
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        with:
-          tag_name: ${{ steps.tag.outputs.name }}
-
-      - name: Upload release
-        id: upload_release
-        uses: actions/github-script@v3
-        with:
-          github-token: ${{secrets.GITHUB_TOKEN}}
-          script: |
-            const path = require('path');
-            const fs = require('fs');
-            const release_id = '${{ steps.create_release.outputs.id }}';
-            for (let file of await fs.readdirSync('./artifact/release')) {
-              if (path.extname(file) === '.zip') {
-                console.log('uploadReleaseAsset', file);
-                await github.repos.uploadReleaseAsset({
-                  owner: context.repo.owner,
-                  repo: context.repo.repo,
-                  release_id: release_id,
-                  name: file,
-                  data: await fs.readFileSync(`./artifact/release/${file}`)
-                });
-              }
-            }
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -15,10 +15,10 @@ on:
  push:
    branches:
      - master
-    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*']
+    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
  pull_request:
    types: [opened, synchronize, reopened]
-    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*']
+    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']

 env:
  LLAMA_LOG_COLORS: 1
@@ -74,7 +74,7 @@ jobs:
      - name: Tests dependencies
        id: test_dependencies
        run: |
-          pip install -r tools/server/tests/requirements.txt
+          pip install -r examples/server/tests/requirements.txt

      # Setup nodejs (to be used for verifying bundled index.html)
      - uses: actions/setup-node@v4
@@ -84,14 +84,14 @@ jobs:
      - name: WebUI - Install dependencies
        id: webui_lint
        run: |
-          cd tools/server/webui
+          cd examples/server/webui
          npm ci

      - name: WebUI - Check code format
        id: webui_format
        run: |
          git config --global --add safe.directory $(realpath .)
-          cd tools/server/webui
+          cd examples/server/webui
          git status

          npm run format
@@ -108,7 +108,7 @@ jobs:
        id: verify_server_index_html
        run: |
          git config --global --add safe.directory $(realpath .)
-          cd tools/server/webui
+          cd examples/server/webui
          git status

          npm run build
@@ -161,21 +161,21 @@ jobs:
        env:
          GITHUB_ACTIONS: "true"
        run: |
-          cd tools/server/tests
+          cd examples/server/tests
          ./tests.sh

      - name: Tests (sanitizers)
        id: server_integration_tests_sanitizers
        if: ${{ matrix.sanitizer != '' }}
        run: |
-          cd tools/server/tests
+          cd examples/server/tests
          LLAMA_SANITIZE=1 ./tests.sh

      - name: Slow tests
        id: server_integration_tests_slow
        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
        run: |
-          cd tools/server/tests
+          cd examples/server/tests
          SLOW_TESTS=1 ./tests.sh


@@ -211,7 +211,7 @@ jobs:
      - name: Tests dependencies
        id: test_dependencies
        run: |
-          pip install -r tools/server/tests/requirements.txt
+          pip install -r examples/server/tests/requirements.txt

      - name: Copy Libcurl
        id: prepare_libcurl
@@ -224,7 +224,7 @@ jobs:
        id: server_integration_tests
        if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
        run: |
-          cd tools/server/tests
+          cd examples/server/tests
          $env:PYTHONIOENCODING = ":replace"
          pytest -v -x -m "not slow"

@@ -232,6 +232,6 @@ jobs:
        id: server_integration_tests_slow
        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
        run: |
-          cd tools/server/tests
+          cd examples/server/tests
          $env:SLOW_TESTS = "1"
          pytest -v -x
--- a/.gitignore
+++ b/.gitignore
@@ -96,11 +96,11 @@ perf-*.txt
 # Examples

 examples/jeopardy/results.txt
-tools/server/*.css.hpp
-tools/server/*.html.hpp
-tools/server/*.js.hpp
-tools/server/*.mjs.hpp
-tools/server/*.gz.hpp
+examples/server/*.css.hpp
+examples/server/*.html.hpp
+examples/server/*.js.hpp
+examples/server/*.mjs.hpp
+examples/server/*.gz.hpp
 !build_64.sh
 !examples/*.bat
 !examples/*/*.kts
@@ -110,7 +110,7 @@ tools/server/*.gz.hpp

 # Server Web UI temporary files
 node_modules
-tools/server/webui/dist
+examples/server/webui/dist

 # Python

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -77,7 +77,6 @@ option(LLAMA_BUILD_COMMON "llama: build common utils library" ${LLAMA_STANDALONE

 # extra artifacts
 option(LLAMA_BUILD_TESTS    "llama: build tests"          ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_TOOLS    "llama: build tools"          ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples"       ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_SERVER   "llama: build server example" ${LLAMA_STANDALONE})

@@ -188,10 +187,6 @@ if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES)
    add_subdirectory(pocs)
 endif()

-if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TOOLS)
-    add_subdirectory(tools)
-endif()
-
 #
 # install
 #
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -38,6 +38,15 @@
        }
    },

+    {
+        "name": "arm64-windows-msvc", "hidden": true,
+        "architecture": { "value": "arm64",    "strategy": "external" },
+        "toolset":      { "value": "host=x64", "strategy": "external" },
+        "cacheVariables": {
+            "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake"
+        }
+    },
+
    {
        "name": "arm64-windows-llvm", "hidden": true,
        "architecture": { "value": "arm64",    "strategy": "external" },
@@ -64,6 +73,10 @@
    { "name": "arm64-apple-clang-release", "inherits": [ "base", "arm64-apple-clang", "reldbg" ] },
    { "name": "arm64-apple-clang+static-release", "inherits": [ "base", "arm64-apple-clang",  "reldbg", "static" ] },

+    { "name": "arm64-windows-msvc-debug", "inherits": [ "base", "arm64-windows-msvc",  "debug"   ] },
+    { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc",  "reldbg" ] },
+    { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc",  "reldbg", "static" ] },
+
    { "name": "x64-windows-llvm-debug", "inherits": [ "base", "x64-windows-llvm", "debug" ] },
    { "name": "x64-windows-llvm-release", "inherits": [ "base", "x64-windows-llvm", "release" ] },
    { "name": "x64-windows-llvm-reldbg", "inherits": [ "base", "x64-windows-llvm", "reldbg" ] },
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -2,7 +2,7 @@

 /ci/ @ggerganov
 /.devops/*.Dockerfile @ngxson
-/tools/server/ @ngxson
+/examples/server/ @ngxson
 /ggml/src/ggml-cuda/fattn* @JohannesGaessler
 /ggml/src/ggml-cuda/mmq.* @JohannesGaessler
 /ggml/src/ggml-cuda/mmv.* @JohannesGaessler
--- a/Makefile
+++ b/Makefile
@@ -1156,10 +1156,10 @@ $(LIB_COMMON_S): $(OBJ_COMMON)

 # Clean generated server assets
 clean-server-assets:
-	find tools/server -type f -name "*.js.hpp"   -delete
-	find tools/server -type f -name "*.mjs.hpp"  -delete
-	find tools/server -type f -name "*.css.hpp"  -delete
-	find tools/server -type f -name "*.html.hpp" -delete
+	find examples/server -type f -name "*.js.hpp"   -delete
+	find examples/server -type f -name "*.mjs.hpp"  -delete
+	find examples/server -type f -name "*.css.hpp"  -delete
+	find examples/server -type f -name "*.html.hpp" -delete

 # Clean rule
 clean: clean-server-assets
@@ -1179,7 +1179,7 @@ clean: clean-server-assets
 # Helper function that replaces .c, .cpp, and .cu file endings with .o:
 GET_OBJ_FILE = $(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(patsubst %.cu,%.o,$(1))))

-llama-cli: tools/main/main.cpp \
+llama-cli: examples/main/main.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -1187,7 +1187,12 @@ llama-cli: tools/main/main.cpp \
 	@echo '====  Run ./llama-cli -h for help.  ===='
 	@echo

-llama-run: tools/run/run.cpp \
+llama-infill: examples/infill/infill.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-run: examples/run/run.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -1202,7 +1207,7 @@ llama-simple-chat: examples/simple-chat/simple-chat.cpp \
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-llama-tokenize: tools/tokenize/tokenize.cpp \
+llama-tokenize: examples/tokenize/tokenize.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -1212,27 +1217,27 @@ llama-batched: examples/batched/batched.cpp \
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-llama-batched-bench: tools/batched-bench/batched-bench.cpp \
+llama-batched-bench: examples/batched-bench/batched-bench.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-llama-quantize: tools/quantize/quantize.cpp \
+llama-quantize: examples/quantize/quantize.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-llama-quantize-stats: tools/quantize-stats/quantize-stats.cpp \
+llama-quantize-stats: examples/quantize-stats/quantize-stats.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-llama-perplexity: tools/perplexity/perplexity.cpp \
+llama-perplexity: examples/perplexity/perplexity.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-llama-imatrix: tools/imatrix/imatrix.cpp \
+llama-imatrix: examples/imatrix/imatrix.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -1274,7 +1279,7 @@ llama-gguf-hash: examples/gguf-hash/gguf-hash.cpp examples/gguf-hash/deps/sha1/s
 	$(CXX) $(CXXFLAGS) -Iexamples/gguf-hash/deps -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-llama-gguf-split: tools/gguf-split/gguf-split.cpp \
+llama-gguf-split: examples/gguf-split/gguf-split.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -1284,7 +1289,7 @@ llama-eval-callback: examples/eval-callback/eval-callback.cpp \
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-llama-cvector-generator: tools/cvector-generator/cvector-generator.cpp \
+llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -1294,12 +1299,12 @@ llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-llama-bench: tools/llama-bench/llama-bench.cpp \
+llama-bench: examples/llama-bench/llama-bench.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-llama-export-lora: tools/export-lora/export-lora.cpp \
+llama-export-lora: examples/export-lora/export-lora.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -1355,17 +1360,17 @@ llama-gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp \
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

 ifdef GGML_RPC
-rpc-server: tools/rpc/rpc-server.cpp \
+rpc-server: examples/rpc/rpc-server.cpp \
 	$(OBJ_GGML)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 endif # GGML_RPC

 llama-server: \
-	tools/server/server.cpp \
-	tools/server/utils.hpp \
-	tools/server/httplib.h \
-	tools/server/index.html.hpp \
-	tools/server/loading.html.hpp \
+	examples/server/server.cpp \
+	examples/server/utils.hpp \
+	examples/server/httplib.h \
+	examples/server/index.html.hpp \
+	examples/server/loading.html.hpp \
 	common/chat.cpp \
 	common/chat.h \
 	common/chat-template.hpp \
@@ -1373,10 +1378,10 @@ llama-server: \
 	common/minja.hpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Itools/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)

-# Portable equivalent of `cd tools/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`:
-tools/server/%.hpp: tools/server/public/% FORCE Makefile
+# Portable equivalent of `cd examples/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`:
+examples/server/%.hpp: examples/server/public/% FORCE Makefile
 	@( export NAME=$(subst .,_,$(subst -,_,$(notdir $<))) && \
 		echo "unsigned char $${NAME}[] = {" && \
 		cat $< | od -v -t x1 -An | sed -E 's/([0-9a-fA-F]+)/0x\1, /g' && \
@@ -1389,36 +1394,36 @@ llama-gen-docs: examples/gen-docs/gen-docs.cpp \
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-libllava.a: tools/mtmd/llava.cpp \
-	tools/mtmd/llava.h \
-	tools/mtmd/clip.cpp \
-	tools/mtmd/clip.h \
+libllava.a: examples/llava/llava.cpp \
+	examples/llava/llava.h \
+	examples/llava/clip.cpp \
+	examples/llava/clip.h \
 	common/stb_image.h \
 	common/base64.hpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual

-llama-llava-cli: tools/mtmd/llava-cli.cpp \
-	tools/mtmd/llava.cpp \
-	tools/mtmd/llava.h \
-	tools/mtmd/clip.cpp \
-	tools/mtmd/clip.h \
+llama-llava-cli: examples/llava/llava-cli.cpp \
+	examples/llava/llava.cpp \
+	examples/llava/llava.h \
+	examples/llava/clip.cpp \
+	examples/llava/clip.h \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual

-llama-minicpmv-cli: tools/mtmd/minicpmv-cli.cpp \
-	tools/mtmd/llava.cpp \
-	tools/mtmd/llava.h \
-	tools/mtmd/clip.cpp \
-	tools/mtmd/clip.h \
+llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \
+	examples/llava/llava.cpp \
+	examples/llava/llava.h \
+	examples/llava/clip.cpp \
+	examples/llava/clip.h \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual

-llama-qwen2vl-cli: tools/mtmd/qwen2vl-cli.cpp \
-	tools/mtmd/llava.cpp \
-	tools/mtmd/llava.h \
-	tools/mtmd/clip.cpp \
-	tools/mtmd/clip.h \
+llama-qwen2vl-cli: examples/llava/qwen2vl-cli.cpp \
+	examples/llava/llava.cpp \
+	examples/llava/llava.h \
+	examples/llava/clip.cpp \
+	examples/llava/clip.h \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual

@@ -1475,12 +1480,12 @@ tests/test-double-float: tests/test-double-float.cpp

 tests/test-json-schema-to-grammar: tests/test-json-schema-to-grammar.cpp \
 	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -Itools/server -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

 tests/test-chat: tests/test-chat.cpp \
 	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -Itools/server -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

 tests/test-opt: tests/test-opt.cpp \
--- a/README.md
+++ b/README.md
@@ -16,9 +16,9 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

 ## Hot topics

- **GGML developer experience survey (organized and reviewed by NVIDIA):** [link](https://forms.gle/Gasw3cRgyhNEnrwK9)
- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141]((https://github.com/ggml-org/llama.cpp/pull/13141))), `libllava` will be deprecated
- VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
+- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli` and `gemma3-cli` https://github.com/ggml-org/llama.cpp/pull/13012, `libllava` will be deprecated
+- **How to use [MTLResidencySet](https://developer.apple.com/documentation/metal/mtlresidencyset?language=objc) to keep the GPU memory active?** https://github.com/ggml-org/llama.cpp/pull/11427
+- **VS Code extension for FIM completions:** https://github.com/ggml-org/llama.vscode
 - Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639
 - Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
 - Introducing GGUF-my-LoRA https://github.com/ggml-org/llama.cpp/discussions/10123
@@ -242,7 +242,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [Vulkan](docs/build.md#vulkan) | GPU |
 | [CANN](docs/build.md#cann) | Ascend NPU |
 | [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
-| [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |
+| [RPC](https://github.com/ggml-org/llama.cpp/tree/master/examples/rpc) | All |

 ## Building the project

@@ -276,9 +276,9 @@ The Hugging Face platform provides a variety of online tools for converting, qua
 - Use the [GGUF-editor space](https://huggingface.co/spaces/CISCai/gguf-editor) to edit GGUF meta data in the browser (more info: https://github.com/ggml-org/llama.cpp/discussions/9268)
 - Use the [Inference Endpoints](https://ui.endpoints.huggingface.co/) to directly host `llama.cpp` in the cloud (more info: https://github.com/ggml-org/llama.cpp/discussions/9669)

-To learn more about model quantization, [read this documentation](tools/quantize/README.md)
+To learn more about model quantization, [read this documentation](examples/quantize/README.md)

-## [`llama-cli`](tools/main)
+## [`llama-cli`](examples/main)

 #### A CLI tool for accessing and experimenting with most of `llama.cpp`'s functionality.

@@ -341,7 +341,7 @@ To learn more about model quantization, [read this documentation](tools/quantize
    </details>


-## [`llama-server`](tools/server)
+## [`llama-server`](examples/server)

 #### A lightweight, [OpenAI API](https://github.com/openai/openai-openapi) compatible, HTTP server for serving LLMs.

@@ -411,7 +411,7 @@ To learn more about model quantization, [read this documentation](tools/quantize
    </details>


-## [`llama-perplexity`](tools/perplexity)
+## [`llama-perplexity`](examples/perplexity)

 #### A tool for measuring the perplexity [^1][^2] (and other quality metrics) of a model over a given text.

@@ -436,10 +436,10 @@ To learn more about model quantization, [read this documentation](tools/quantize

    </details>

-[^1]: [tools/perplexity/README.md](./tools/perplexity/README.md)
+[^1]: [examples/perplexity/README.md](./examples/perplexity/README.md)
 [^2]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity)

-## [`llama-bench`](tools/llama-bench)
+## [`llama-bench`](examples/llama-bench)

 #### Benchmark the performance of the inference for various parameters.

@@ -460,7 +460,7 @@ To learn more about model quantization, [read this documentation](tools/quantize

    </details>

-## [`llama-run`](tools/run)
+## [`llama-run`](examples/run)

 #### A comprehensive example for running `llama.cpp` models. Useful for inferencing. Used with RamaLama [^3].

@@ -504,8 +504,8 @@ To learn more about model quantization, [read this documentation](tools/quantize

 ## Other documentation

- [main (cli)](tools/main/README.md)
- [server](tools/server/README.md)
+- [main (cli)](examples/main/README.md)
+- [server](examples/server/README.md)
 - [GBNF grammars](grammars/README.md)

 #### Development documentation
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -40,7 +40,7 @@ To protect sensitive data from potential leaks or unauthorized access, it is cru
 ### Untrusted environments or networks

 If you can't run your models in a secure and isolated environment or if it must be exposed to an untrusted network, make sure to take the following security precautions:
-* Do not use the RPC backend, [rpc-server](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) and [llama-server](https://github.com/ggml-org/llama.cpp/tree/master/tools/server) functionality (see https://github.com/ggml-org/llama.cpp/pull/13061).
+* Do not use the RPC backend, [rpc-server](https://github.com/ggml-org/llama.cpp/tree/master/examples/rpc) and [llama-server](https://github.com/ggml-org/llama.cpp/tree/master/examples/server) functionality (see https://github.com/ggml-org/llama.cpp/pull/13061).
 * Confirm the hash of any downloaded artifact (e.g. pre-trained model weights) matches a known-good value.
 * Encrypt your data if sending it over the network.

--- a/build-xcframework.sh
+++ b/build-xcframework.sh
@@ -8,7 +8,6 @@ TVOS_MIN_OS_VERSION=16.4

 BUILD_SHARED_LIBS=OFF
 LLAMA_BUILD_EXAMPLES=OFF
-LLAMA_BUILD_TOOLS=OFF
 LLAMA_BUILD_TESTS=OFF
 LLAMA_BUILD_SERVER=OFF
 GGML_METAL=ON
@@ -32,7 +31,6 @@ COMMON_CMAKE_ARGS=(
    -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
    -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS}
    -DLLAMA_BUILD_EXAMPLES=${LLAMA_BUILD_EXAMPLES}
-    -DLLAMA_BUILD_TOOLS=${LLAMA_BUILD_TOOLS}
    -DLLAMA_BUILD_TESTS=${LLAMA_BUILD_TESTS}
    -DLLAMA_BUILD_SERVER=${LLAMA_BUILD_SERVER}
    -DGGML_METAL_EMBED_LIBRARY=${GGML_METAL_EMBED_LIBRARY}
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -187,8 +187,8 @@ function gg_run_test_scripts_debug {

    set -e

-    (cd ./tools/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
-    (cd ./tools/quantize   && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+    (cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+    (cd ./examples/quantize   && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log

    set +e
 }
@@ -211,8 +211,8 @@ function gg_run_test_scripts_release {

    set -e

-    (cd ./tools/gguf-split && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
-    (cd ./tools/quantize   && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+    (cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+    (cd ./examples/quantize   && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log

    set +e
 }
--- a/cmake/arm64-windows-msvc.cmake
+++ b/cmake/arm64-windows-msvc.cmake
@@ -0,0 +1,6 @@
+set( CMAKE_SYSTEM_NAME Windows )
+set( CMAKE_SYSTEM_PROCESSOR arm64 )
+
+set( target arm64-pc-windows-msvc )
+set( CMAKE_C_COMPILER_TARGET   ${target} )
+set( CMAKE_CXX_COMPILER_TARGET ${target} )
--- a/cmake/build-info.cmake
+++ b/cmake/build-info.cmake
@@ -41,20 +41,14 @@ endif()

 if(MSVC)
    set(BUILD_COMPILER "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
-    if (CMAKE_VS_PLATFORM_NAME)
-        set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME})
-    else()
-        set(BUILD_TARGET "${CMAKE_SYSTEM_NAME} ${CMAKE_SYSTEM_PROCESSOR}")
-    endif()
+    set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME})
 else()
    execute_process(
-        COMMAND ${CMAKE_C_COMPILER} --version
+        COMMAND sh -c "\"$@\" --version | head -n 1" _ ${CMAKE_C_COMPILER}
        OUTPUT_VARIABLE OUT
        OUTPUT_STRIP_TRAILING_WHITESPACE
    )
-    string(REGEX REPLACE " *\n.*" "" OUT "${OUT}")
    set(BUILD_COMPILER ${OUT})
-
    execute_process(
        COMMAND ${CMAKE_C_COMPILER} -dumpmachine
        OUTPUT_VARIABLE OUT
--- a/cmake/x64-windows-llvm.cmake
+++ b/cmake/x64-windows-llvm.cmake
@@ -3,3 +3,9 @@ set( CMAKE_SYSTEM_PROCESSOR x86_64 )

 set( CMAKE_C_COMPILER    clang )
 set( CMAKE_CXX_COMPILER  clang++ )
+
+set( arch_c_flags "-march=native" )
+
+set( CMAKE_C_FLAGS_INIT   "${arch_c_flags}" )
+set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags}" )
+
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -39,9 +39,7 @@ add_custom_command(
    COMMENT "Generating build details from Git"
    COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
            -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
-            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-            -DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME} -DCMAKE_SYSTEM_PROCESSOR=${CMAKE_SYSTEM_PROCESSOR}
-            -P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info-gen-cpp.cmake"
+            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info-gen-cpp.cmake"
    WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
    DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
    VERBATIM
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -43,25 +43,6 @@ std::initializer_list<enum llama_example> mmproj_examples = {
    // TODO: add LLAMA_EXAMPLE_SERVER when it's ready
 };

-static std::string read_file(const std::string & fname) {
-    std::ifstream file(fname);
-    if (!file) {
-        throw std::runtime_error(string_format("error: failed to open file '%s'\n", fname.c_str()));
-    }
-    std::string content((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
-    file.close();
-    return content;
-}
-
-static void write_file(const std::string & fname, const std::string & content) {
-    std::ofstream file(fname);
-    if (!file) {
-        throw std::runtime_error(string_format("error: failed to open file '%s'\n", fname.c_str()));
-    }
-    file << content;
-    file.close();
-}
-
 common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) {
    this->examples = std::move(examples);
    return *this;
@@ -217,11 +198,11 @@ struct curl_slist_ptr {
 #define CURL_MAX_RETRY 3
 #define CURL_RETRY_DELAY_SECONDS 2

-static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds, const char * method_name) {
+static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
    int remaining_attempts = max_attempts;

    while (remaining_attempts > 0) {
-        LOG_INF("%s: %s %s (attempt %d of %d)...\n", __func__ , method_name, url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
+        LOG_INF("%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);

        CURLcode res = curl_easy_perform(curl);
        if (res == CURLE_OK) {
@@ -232,7 +213,6 @@ static bool curl_perform_with_retry(const std::string & url, CURL * curl, int ma
        LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);

        remaining_attempts--;
-        if (remaining_attempts == 0) break;
        std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
    }

@@ -251,6 +231,8 @@ static bool common_download_file_single(const std::string & url, const std::stri
        return false;
    }

+    bool force_download = false;
+
    // Set the URL, allow to follow http redirection
    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
    curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
@@ -274,7 +256,7 @@ static bool common_download_file_single(const std::string & url, const std::stri

    // If the file exists, check its JSON metadata companion file.
    std::string metadata_path = path + ".json";
-    nlohmann::json metadata; // TODO @ngxson : get rid of this json, use regex instead
+    nlohmann::json metadata;
    std::string etag;
    std::string last_modified;

@@ -284,7 +266,14 @@ static bool common_download_file_single(const std::string & url, const std::stri
        if (metadata_in.good()) {
            try {
                metadata_in >> metadata;
-                LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
+                LOG_INF("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
+                if (metadata.contains("url") && metadata.at("url").is_string()) {
+                    auto previous_url = metadata.at("url").get<std::string>();
+                    if (previous_url != url) {
+                        LOG_ERR("%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
+                        return false;
+                    }
+                }
                if (metadata.contains("etag") && metadata.at("etag").is_string()) {
                    etag = metadata.at("etag");
                }
@@ -292,10 +281,10 @@ static bool common_download_file_single(const std::string & url, const std::stri
                    last_modified = metadata.at("lastModified");
                }
            } catch (const nlohmann::json::exception & e) {
-                LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
+                LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
+                return false;
            }
        }
-        // if we cannot open the metadata file, we assume that the downloaded file is not valid (etag and last-modified are left empty, so we will download it again)
    } else {
        LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
    }
@@ -307,10 +296,7 @@ static bool common_download_file_single(const std::string & url, const std::stri
    };

    common_load_model_from_url_headers headers;
-    bool head_request_ok = false;
-    bool should_download = !file_exists; // by default, we should download if the file does not exist

-    // get ETag to see if the remote file has changed
    {
        typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
        auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
@@ -339,28 +325,23 @@ static bool common_download_file_single(const std::string & url, const std::stri
        curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
        curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);

-        // we only allow retrying once for HEAD requests
-        // this is for the use case of using running offline (no internet), retrying can be annoying
-        bool was_perform_successful = curl_perform_with_retry(url, curl.get(), 1, 0, "HEAD");
+        bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
        if (!was_perform_successful) {
-            head_request_ok = false;
+            return false;
        }

        long http_code = 0;
        curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
-        if (http_code == 200) {
-            head_request_ok = true;
-        } else {
-            LOG_WRN("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
-            head_request_ok = false;
+        if (http_code != 200) {
+            // HEAD not supported, we don't know if the file has changed
+            // force trigger downloading
+            force_download = true;
+            LOG_ERR("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
        }
    }

-    // if head_request_ok is false, we don't have the etag or last-modified headers
-    // we leave should_download as-is, which is true if the file does not exist
-    if (head_request_ok) {
-        // check if ETag or Last-Modified headers are different
-        // if it is, we need to download the file again
+    bool should_download = !file_exists || force_download;
+    if (!should_download) {
        if (!etag.empty() && etag != headers.etag) {
            LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
            should_download = true;
@@ -369,7 +350,6 @@ static bool common_download_file_single(const std::string & url, const std::stri
            should_download = true;
        }
    }
-
    if (should_download) {
        std::string path_temporary = path + ".downloadInProgress";
        if (file_exists) {
@@ -423,7 +403,7 @@ static bool common_download_file_single(const std::string & url, const std::stri
        // start the download
        LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
            llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
-        bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS, "GET");
+        bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
        if (!was_perform_successful) {
            return false;
        }
@@ -444,15 +424,13 @@ static bool common_download_file_single(const std::string & url, const std::stri
            {"etag", headers.etag},
            {"lastModified", headers.last_modified}
        });
-        write_file(metadata_path, metadata.dump(4));
-        LOG_DBG("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
+        std::ofstream(metadata_path) << metadata.dump(4);
+        LOG_INF("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());

        if (rename(path_temporary.c_str(), path.c_str()) != 0) {
            LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
            return false;
        }
-    } else {
-        LOG_INF("%s: using cached file: %s\n", __func__, path.c_str());
    }

    return true;
@@ -627,37 +605,16 @@ static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_
    // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
    // User-Agent header is already set in common_remote_get_content, no need to set it here

-    // we use "=" to avoid clashing with other component, while still being allowed on windows
-    std::string cached_response_fname = "manifest=" + hf_repo + "=" + tag + ".json";
-    string_replace_all(cached_response_fname, "/", "_");
-    std::string cached_response_path = fs_get_cache_file(cached_response_fname);
-
    // make the request
    common_remote_params params;
    params.headers = headers;
-    long res_code = 0;
-    std::string res_str;
-    bool use_cache = false;
-    try {
-        auto res = common_remote_get_content(url, params);
-        res_code = res.first;
-        res_str = std::string(res.second.data(), res.second.size());
-    } catch (const std::exception & e) {
-        LOG_WRN("error: failed to get manifest: %s\n", e.what());
-        LOG_WRN("try reading from cache\n");
-        // try to read from cache
-        try {
-            res_str = read_file(cached_response_path);
-            res_code = 200;
-            use_cache = true;
-        } catch (const std::exception & e) {
-            throw std::runtime_error("error: failed to get manifest (check your internet connection)");
-        }
-    }
+    auto res = common_remote_get_content(url, params);
+    long res_code = res.first;
+    std::string res_str(res.second.data(), res.second.size());
    std::string ggufFile;
    std::string mmprojFile;

-    if (res_code == 200 || res_code == 304) {
+    if (res_code == 200) {
        // extract ggufFile.rfilename in json, using regex
        {
            std::regex pattern("\"ggufFile\"[\\s\\S]*?\"rfilename\"\\s*:\\s*\"([^\"]+)\"");
@@ -674,10 +631,6 @@ static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_
                mmprojFile = match[1].str();
            }
        }
-        if (!use_cache) {
-            // if not using cached response, update the cache file
-            write_file(cached_response_path, res_str);
-        }
    } else if (res_code == 401) {
        throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
    } else {
@@ -720,12 +673,8 @@ static struct common_hf_file_res common_get_hf_file(const std::string &, const s
    return {};
 }

-std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params &) {
-    if (!url.empty()) {
-        throw std::runtime_error("error: built without CURL, cannot download model from the internet");
-    }
-
-    return {};
+std::pair<long, std::vector<char>> common_remote_get_content(const std::string &, const common_remote_params &) {
+    throw std::runtime_error("error: built without CURL, cannot download model from the internet"); // non-CURL stub: every call fails
 }

 #endif // LLAMA_USE_CURL
@@ -1189,9 +1138,6 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
        fprintf(stderr, "%s\n", ex.what());
        ctx_arg.params = params_org;
        return false;
-    } catch (std::exception & ex) {
-        fprintf(stderr, "%s\n", ex.what());
-        exit(1); // for other exceptions, we exit with status code 1
    }

    return true;
@@ -1283,7 +1229,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params) {
            params.use_color = true;
        }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
    add_opt(common_arg(
        {"-t", "--threads"}, "N",
        string_format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads),
@@ -1416,7 +1362,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    add_opt(common_arg(
        {"-n", "--predict", "--n-predict"}, "N",
        string_format(
-            ex == LLAMA_EXAMPLE_MAIN
+            ex == LLAMA_EXAMPLE_MAIN || ex == LLAMA_EXAMPLE_INFILL
                ? "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)"
                : "number of tokens to predict (default: %d, -1 = infinity)",
            params.n_predict),
@@ -1492,9 +1438,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        {"-f", "--file"}, "FNAME",
        "a file containing the prompt (default: none)",
        [](common_params & params, const std::string & value) {
-            params.prompt = read_file(value);
+            std::ifstream file(value);
+            if (!file) {
+                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
+            }
            // store the external file name in params
            params.prompt_file = value;
+            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
            if (!params.prompt.empty() && params.prompt.back() == '\n') {
                params.prompt.pop_back();
            }
@@ -1504,7 +1454,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        {"-sysf", "--system-prompt-file"}, "FNAME",
        "a file containing the system prompt (default: none)",
        [](common_params & params, const std::string & value) {
-            params.system_prompt = read_file(value);
+            std::ifstream file(value);
+            if (!file) {
+                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
+            }
+            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.system_prompt));
            if (!params.system_prompt.empty() && params.system_prompt.back() == '\n') {
                params.system_prompt.pop_back();
            }
@@ -1655,7 +1609,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.input_prefix = value;
            params.enable_chat_template = false;
        }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
    add_opt(common_arg(
        {"--in-suffix"}, "STRING",
        "string to suffix after user inputs with (default: empty)",
@@ -1663,7 +1617,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.input_suffix = value;
            params.enable_chat_template = false;
        }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
    add_opt(common_arg(
        {"--no-warmup"},
        "skip warming up the model with an empty run",
@@ -1680,7 +1634,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params) {
            params.spm_infill = true;
        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL}));
    add_opt(common_arg(
        {"--samplers"}, "SAMPLERS",
        string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()),
@@ -1929,7 +1883,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        {"--grammar-file"}, "FNAME",
        "file to read grammar from",
        [](common_params & params, const std::string & value) {
-            params.sampling.grammar = read_file(value);
+            std::ifstream file(value);
+            if (!file) {
+                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
+            }
+            std::copy(
+                std::istreambuf_iterator<char>(file),
+                std::istreambuf_iterator<char>(),
+                std::back_inserter(params.sampling.grammar)
+            );
        }
    ).set_sparam());
    add_opt(common_arg(
@@ -1939,23 +1901,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.sampling.grammar = json_schema_to_grammar(json::parse(value));
        }
    ).set_sparam());
-    add_opt(common_arg(
-        {"-jf", "--json-schema-file"}, "FILE",
-        "File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead",
-        [](common_params & params, const std::string & value) {
-            std::ifstream file(value);
-            if (!file) {
-                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
-            }
-            std::string schema;
-            std::copy(
-                std::istreambuf_iterator<char>(file),
-                std::istreambuf_iterator<char>(),
-                std::back_inserter(schema)
-            );
-            params.sampling.grammar = json_schema_to_grammar(json::parse(schema));
-        }
-    ).set_sparam());
    add_opt(common_arg(
        {"--pooling"}, "{none,mean,cls,last,rank}",
        "pooling type for embeddings, use model default if unspecified",
@@ -2097,6 +2042,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.cache_type_v = kv_cache_type_from_str(value);
        }
    ).set_env("LLAMA_ARG_CACHE_TYPE_V"));
+    add_opt(common_arg(
+        {"--perplexity", "--all-logits"},
+        string_format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"),
+        [](common_params & params) {
+            params.logits_all = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
    add_opt(common_arg(
        {"--hellaswag"},
        "compute HellaSwag score over random tasks from datafile supplied with -f",
@@ -2204,14 +2156,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
    add_opt(common_arg(
        {"--mmproj"}, "FILE",
-        "path to a multimodal projector file. see tools/mtmd/README.md",
+        "path to a multimodal projector file. see examples/llava/README.md",
        [](common_params & params, const std::string & value) {
            params.mmproj.path = value;
        }
    ).set_examples(mmproj_examples));
    add_opt(common_arg(
        {"--mmproj-url"}, "URL",
-        "URL to a multimodal projector file. see tools/mtmd/README.md",
+        "URL to a multimodal projector file. see examples/llava/README.md",
        [](common_params & params, const std::string & value) {
            params.mmproj.url = value;
        }
@@ -2776,10 +2728,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP"));
    add_opt(common_arg(
        {"--cache-reuse"}, "N",
-        string_format(
-            "min chunk size to attempt reusing from the cache via KV shifting (default: %d)\n"
-            "[(card)](https://ggml.ai/f0.png)", params.n_cache_reuse
-        ),
+        string_format("min chunk size to attempt reusing from the cache via KV shifting (default: %d)", params.n_cache_reuse),
        [](common_params & params, int value) {
            params.n_cache_reuse = value;
        }
@@ -2862,7 +2811,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            "list of built-in templates:\n%s", list_builtin_chat_templates().c_str()
        ),
        [](common_params & params, const std::string & value) {
-            params.chat_template = read_file(value);
+            std::ifstream file(value);
+            if (!file) {
+                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
+            }
+            std::copy(
+                std::istreambuf_iterator<char>(file),
+                std::istreambuf_iterator<char>(),
+                std::back_inserter(params.chat_template));
        }
    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
    add_opt(common_arg(
@@ -2885,7 +2841,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params) {
            params.simple_io = true;
        }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
    add_opt(common_arg(
        {"--positive-file"}, "FNAME",
        string_format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()),
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1096,6 +1096,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
    cparams.n_threads         = params.cpuparams.n_threads;
    cparams.n_threads_batch   = params.cpuparams_batch.n_threads == -1 ?
                                params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
+    cparams.logits_all        = params.logits_all;
    cparams.embeddings        = params.embedding;
    cparams.rope_scaling_type = params.rope_scaling_type;
    cparams.rope_freq_base    = params.rope_freq_base;
--- a/common/common.h
+++ b/common/common.h
@@ -66,6 +66,7 @@ enum llama_example {
    LLAMA_EXAMPLE_COMMON,
    LLAMA_EXAMPLE_SPECULATIVE,
    LLAMA_EXAMPLE_MAIN,
+    LLAMA_EXAMPLE_INFILL,
    LLAMA_EXAMPLE_EMBEDDING,
    LLAMA_EXAMPLE_PERPLEXITY,
    LLAMA_EXAMPLE_RETRIEVAL,
@@ -95,7 +96,6 @@ enum common_sampler_type {
    COMMON_SAMPLER_TYPE_XTC         = 8,
    COMMON_SAMPLER_TYPE_INFILL      = 9,
    COMMON_SAMPLER_TYPE_PENALTIES   = 10,
-    COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
 };

 // dimensionality reduction methods, used by cvector-generator
@@ -161,7 +161,6 @@ struct common_params_sampling {
    std::vector<enum common_sampler_type> samplers = {
        COMMON_SAMPLER_TYPE_PENALTIES,
        COMMON_SAMPLER_TYPE_DRY,
-        COMMON_SAMPLER_TYPE_TOP_N_SIGMA,
        COMMON_SAMPLER_TYPE_TOP_K,
        COMMON_SAMPLER_TYPE_TYPICAL_P,
        COMMON_SAMPLER_TYPE_TOP_P,
@@ -324,6 +323,7 @@ struct common_params {
    bool ctx_shift         = true;  // context shift on inifinite text generation

    bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
+    bool logits_all        = false; // return logits for all tokens in the batch
    bool use_mmap          = true;  // use mmap for faster loads
    bool use_mlock         = false; // use mlock to keep model in memory
    bool verbose_prompt    = false; // print prompt tokens before generation
@@ -340,7 +340,7 @@ struct common_params {

    common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;

-    // multimodal models (see tools/mtmd)
+    // multimodal models (see examples/llava)
    struct common_params_model mmproj;
    bool mmproj_use_gpu = true;     // use GPU for multimodal model
    bool no_mmproj = false;         // explicitly disable multimodal model
@@ -414,8 +414,8 @@ struct common_params {
    int n_pca_batch = 100;
    int n_pca_iterations = 1000;
    dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
-    std::string cvector_positive_file = "tools/cvector-generator/positive.txt";
-    std::string cvector_negative_file = "tools/cvector-generator/negative.txt";
+    std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
+    std::string cvector_negative_file = "examples/cvector-generator/negative.txt";

    bool spm_infill = false; // suffix/prefix/middle pattern for infill

--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -1,7 +1,6 @@
 #include "sampling.h"

 #include "common.h"
-#include "log.h"

 #include <cmath>
 #include <unordered_map>
@@ -230,48 +229,51 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                params.logit_bias.data()));

    if (params.mirostat == 0) {
-        for (const auto & cnstr : params.samplers) {
-            switch (cnstr) {
-                case COMMON_SAMPLER_TYPE_DRY:
-                    {
-                        std::vector<const char *> c_breakers;
-                        c_breakers.reserve(params.dry_sequence_breakers.size());
-                        for (const auto & str : params.dry_sequence_breakers) {
-                            c_breakers.push_back(str.c_str());
-                        }
+        if (params.top_n_sigma >= 0) {
+            llama_sampler_chain_add(result->chain, llama_sampler_init_top_k        (params.top_k));
+            llama_sampler_chain_add(result->chain, llama_sampler_init_temp         (params.temp));
+            llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma  (params.top_n_sigma));
+        } else {
+            for (const auto & cnstr : params.samplers) {
+                switch (cnstr) {
+                    case COMMON_SAMPLER_TYPE_DRY:
+                        {
+                            std::vector<const char *> c_breakers;
+                            c_breakers.reserve(params.dry_sequence_breakers.size());
+                            for (const auto & str : params.dry_sequence_breakers) {
+                                c_breakers.push_back(str.c_str());
+                            }

-                        llama_sampler_chain_add(result->chain, llama_sampler_init_dry      (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
-                    }
-                    break;
-                case COMMON_SAMPLER_TYPE_TOP_K:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_k       (params.top_k));
-                    break;
-                case COMMON_SAMPLER_TYPE_TOP_P:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_p       (params.top_p, params.min_keep));
-                    break;
-                case COMMON_SAMPLER_TYPE_TOP_N_SIGMA:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
-                    break;
-                case COMMON_SAMPLER_TYPE_MIN_P:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_min_p       (params.min_p, params.min_keep));
-                    break;
-                case COMMON_SAMPLER_TYPE_XTC:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_xtc         (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
-                    break;
-                case COMMON_SAMPLER_TYPE_TYPICAL_P:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_typical     (params.typ_p, params.min_keep));
-                    break;
-                case COMMON_SAMPLER_TYPE_TEMPERATURE:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext    (params.temp, params.dynatemp_range, params.dynatemp_exponent));
-                    break;
-                case COMMON_SAMPLER_TYPE_INFILL:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_infill      (vocab));
-                    break;
-                case COMMON_SAMPLER_TYPE_PENALTIES:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_penalties   (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
-                    break;
-                default:
-                    GGML_ASSERT(false && "unknown sampler type");
+                            llama_sampler_chain_add(result->chain, llama_sampler_init_dry      (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
+                        }
+                        break;
+                    case COMMON_SAMPLER_TYPE_TOP_K:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_k    (params.top_k));
+                        break;
+                    case COMMON_SAMPLER_TYPE_TOP_P:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_p    (params.top_p, params.min_keep));
+                        break;
+                    case COMMON_SAMPLER_TYPE_MIN_P:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_min_p    (params.min_p, params.min_keep));
+                        break;
+                    case COMMON_SAMPLER_TYPE_XTC:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_xtc      (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
+                        break;
+                    case COMMON_SAMPLER_TYPE_TYPICAL_P:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_typical  (params.typ_p, params.min_keep));
+                        break;
+                    case COMMON_SAMPLER_TYPE_TEMPERATURE:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
+                        break;
+                    case COMMON_SAMPLER_TYPE_INFILL:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_infill   (vocab));
+                        break;
+                    case COMMON_SAMPLER_TYPE_PENALTIES:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
+                        break;
+                    default:
+                        GGML_ASSERT(false && "unknown sampler type");
+                }
            }
        }
        llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
@@ -473,7 +475,6 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
        case COMMON_SAMPLER_TYPE_TOP_K:       return 'k';
        case COMMON_SAMPLER_TYPE_TYPICAL_P:   return 'y';
        case COMMON_SAMPLER_TYPE_TOP_P:       return 'p';
-        case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return 's';
        case COMMON_SAMPLER_TYPE_MIN_P:       return 'm';
        case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
        case COMMON_SAMPLER_TYPE_XTC:         return 'x';
@@ -489,7 +490,6 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
        case COMMON_SAMPLER_TYPE_TOP_K:       return "top_k";
        case COMMON_SAMPLER_TYPE_TYPICAL_P:   return "typ_p";
        case COMMON_SAMPLER_TYPE_TOP_P:       return "top_p";
-        case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return "top_n_sigma";
        case COMMON_SAMPLER_TYPE_MIN_P:       return "min_p";
        case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
        case COMMON_SAMPLER_TYPE_XTC:         return "xtc";
@@ -504,7 +504,6 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
        { "dry",         COMMON_SAMPLER_TYPE_DRY },
        { "top_k",       COMMON_SAMPLER_TYPE_TOP_K },
        { "top_p",       COMMON_SAMPLER_TYPE_TOP_P },
-        { "top_n_sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
        { "typ_p",       COMMON_SAMPLER_TYPE_TYPICAL_P },
        { "min_p",       COMMON_SAMPLER_TYPE_MIN_P },
        { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
@@ -518,7 +517,6 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
    std::unordered_map<std::string, common_sampler_type> sampler_alt_name_map {
        { "top-k",       COMMON_SAMPLER_TYPE_TOP_K },
        { "top-p",       COMMON_SAMPLER_TYPE_TOP_P },
-        { "top-n-sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
        { "nucleus",     COMMON_SAMPLER_TYPE_TOP_P },
        { "typical-p",   COMMON_SAMPLER_TYPE_TYPICAL_P },
        { "typical",     COMMON_SAMPLER_TYPE_TYPICAL_P },
@@ -535,16 +533,14 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
        auto sampler = sampler_canonical_name_map.find(name);
        if (sampler != sampler_canonical_name_map.end()) {
            samplers.push_back(sampler->second);
-            continue;
-        }
-        if (allow_alt_names) {
-            sampler = sampler_alt_name_map.find(name);
-            if (sampler != sampler_alt_name_map.end()) {
-                samplers.push_back(sampler->second);
-                continue;
+        } else {
+            if (allow_alt_names) {
+                sampler = sampler_alt_name_map.find(name);
+                if (sampler != sampler_alt_name_map.end()) {
+                    samplers.push_back(sampler->second);
+                }
            }
        }
-        LOG_WRN("%s: unable to match sampler by name '%s'\n", __func__, name.c_str());
    }

    return samplers;
@@ -556,7 +552,6 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_K),       COMMON_SAMPLER_TYPE_TOP_K },
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P),   COMMON_SAMPLER_TYPE_TYPICAL_P },
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P),       COMMON_SAMPLER_TYPE_TOP_P },
-        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_N_SIGMA), COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P),       COMMON_SAMPLER_TYPE_MIN_P },
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC),         COMMON_SAMPLER_TYPE_XTC },
@@ -571,8 +566,6 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
        const auto sampler = sampler_name_map.find(c);
        if (sampler != sampler_name_map.end()) {
            samplers.push_back(sampler->second);
-        } else {
-            LOG_WRN("%s: unable to match sampler by char '%c'\n", __func__, c);
        }
    }

--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -16,7 +16,6 @@ from pathlib import Path
 from hashlib import sha256
 from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast
 from itertools import chain
-from transformers import AutoConfig

 import math
 import numpy as np
@@ -67,6 +66,8 @@ class ModelBase:
    part_names: list[str]
    is_safetensors: bool
    hparams: dict[str, Any]
+    block_count: int
+    tensor_map: gguf.TensorNameMap
    tensor_names: set[str] | None
    gguf_writer: gguf.GGUFWriter
    model_name: str | None
@@ -77,11 +78,7 @@ class ModelBase:
    # subclasses should define this!
    model_arch: gguf.MODEL_ARCH

-    # subclasses should initialize this!
-    block_count: int
-    tensor_map: gguf.TensorNameMap
-
-    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, *, is_big_endian: bool = False,
+    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
                 use_temp_file: bool = False, eager: bool = False,
                 metadata_override: Path | None = None, model_name: str | None = None,
                 split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False,
@@ -116,6 +113,8 @@ class ModelBase:
            if not self.is_safetensors:
                self.part_names = ModelBase.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
        self.hparams = ModelBase.load_hparams(self.dir_model) if hparams is None else hparams
+        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
+        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
        self.tensor_names = None
        self.metadata_override = metadata_override
        self.model_name = model_name
@@ -418,15 +417,15 @@ class ModelBase:

    @staticmethod
    def load_hparams(dir_model: Path):
-        try:
-            # for security reason, we don't allow loading remote code by default
-            # if a model need remote code, we will fallback to config.json
-            return AutoConfig.from_pretrained(dir_model, trust_remote_code=False).to_dict()
-        except Exception as e:
-            logger.warning(f"Failed to load model config from {dir_model}: {e}")
-            logger.warning("Trying to load config.json instead")
-            with open(dir_model / "config.json", "r", encoding="utf-8") as f:
-                return json.load(f)
+        with open(dir_model / "config.json", "r", encoding="utf-8") as f:
+            hparams = json.load(f)
+            architectures = hparams.get("architectures")
+            if "text_config" in hparams:
+                hparams = {**hparams, **hparams["text_config"]}
+            if architectures is not None:
+                # preserve "architectures" from root level config
+                hparams["architectures"] = architectures
+            return hparams

    @classmethod
    def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
@@ -455,20 +454,6 @@ class ModelBase:


 class TextModel(ModelBase):
-    model_type = ModelType.TEXT
-    hf_arch: str
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.hf_arch = get_model_architecture(self.hparams, self.model_type)
-
-        if "text_config" in self.hparams:
-            # move the text_config to the root level
-            self.hparams = {**self.hparams, **self.hparams["text_config"]}
-
-        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
-        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
-
    @classmethod
    def __init_subclass__(cls):
        # can't use an abstract property, because overriding it without type errors
@@ -510,7 +495,7 @@ class TextModel(ModelBase):
    def set_gguf_parameters(self):
        self.gguf_writer.add_block_count(self.block_count)

-        if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions"], optional=True)) is not None:
+        if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None:
            self.gguf_writer.add_context_length(n_ctx)
            logger.info(f"gguf: context length = {n_ctx}")

@@ -1079,36 +1064,10 @@ class TextModel(ModelBase):
        if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.ADD_EOS)) is not None:
            self.gguf_writer.add_add_eos_token(field.parts[-1].tolist()[0])

-    def _try_set_pooling_type(self) -> None:
-        # get pooling path
-        pooling_path = None
-        module_path = self.dir_model / "modules.json"
-        if module_path.is_file():
-            with open(module_path, encoding="utf-8") as f:
-                modules = json.load(f)
-            for mod in modules:
-                if mod["type"] == "sentence_transformers.models.Pooling":
-                    pooling_path = mod["path"]
-                    break
-
-        # get pooling type
-        if pooling_path is not None:
-            with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f:
-                pooling = json.load(f)
-            if pooling["pooling_mode_mean_tokens"]:
-                pooling_type = gguf.PoolingType.MEAN
-            elif pooling["pooling_mode_cls_token"]:
-                pooling_type = gguf.PoolingType.CLS
-            elif pooling["pooling_mode_lasttoken"]:
-                pooling_type = gguf.PoolingType.LAST
-            else:
-                raise NotImplementedError("Only MEAN, CLS, and LAST pooling types supported")
-            self.gguf_writer.add_pooling_type(pooling_type)
-

 class VisionModel(ModelBase):
-    model_type = ModelType.VISION
    model_arch = gguf.MODEL_ARCH.CLIP_VISION
+    n_text_embd = 0
    preprocessor_config: dict[str, Any]
    global_config: dict[str, Any]

@@ -1118,11 +1077,9 @@ class VisionModel(ModelBase):
        if self.model_arch != gguf.MODEL_ARCH.CLIP_VISION:
            raise TypeError("VisionModel must be subclassed with model_arch = gguf.MODEL_ARCH.CLIP_VISION")

-        # get n_embd of the text model
-        if "text_config" not in self.hparams:
-            self.hparams["text_config"] = {}
-        text_config = {**self.hparams, **self.hparams["text_config"]}
-        self.n_embd_text = text_config.get("hidden_size", text_config.get("n_embd", 0))
+        # small hack to correct the number of layers
+        self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.CLIP_VISION, 128)
+        self.n_embd_text = self.find_hparam(["hidden_size", "n_embd"])
        assert self.n_embd_text > 0, "n_embd not found in hparams"

        if "vision_config" not in self.hparams:
@@ -1131,9 +1088,6 @@ class VisionModel(ModelBase):
        self.global_config = self.hparams
        self.hparams = self.hparams["vision_config"]

-        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"])
-        self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.CLIP_VISION, self.block_count)
-
        # load preprocessor config
        with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
            self.preprocessor_config = json.load(f)
@@ -1151,12 +1105,12 @@ class VisionModel(ModelBase):
        self.gguf_writer.add_vision_patch_size(self.find_hparam(["patch_size"]))
        self.gguf_writer.add_vision_embedding_length(self.find_hparam(["hidden_size"]))
        self.gguf_writer.add_vision_feed_forward_length(self.find_hparam(["intermediate_size"]))
-        self.gguf_writer.add_vision_block_count(self.block_count)
+        self.gguf_writer.add_vision_block_count(self.find_hparam(["num_hidden_layers"]))
        self.gguf_writer.add_vision_head_count(self.find_hparam(["num_attention_heads"]))

        # preprocessor config
        self.gguf_writer.add_vision_image_mean(self.preprocessor_config["image_mean"])
        self.gguf_writer.add_vision_image_std(self.preprocessor_config["image_std"])

    def write_vocab(self):
        raise ValueError("VisionModel does not support vocab writing")
@@ -1772,7 +1726,8 @@ class StableLMModel(TextModel):
    "LlamaForCausalLM",
    "MistralForCausalLM",
    "MixtralForCausalLM",
-    "VLlama3ForCausalLM",
+    "Idefics3ForConditionalGeneration",
+    "SmolVLMForConditionalGeneration",
    "LlavaForConditionalGeneration")
 class LlamaModel(TextModel):
    model_arch = gguf.MODEL_ARCH.LLAMA
@@ -1781,7 +1736,11 @@ class LlamaModel(TextModel):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # fix for SmolVLM2, missing `num_attention_heads` in config.json
-        if self.hf_arch == "VLlama3ForCausalLM":
+        if self.hparams["architectures"][0] == "SmolVLMForConditionalGeneration":
+            self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)
+        # fix for Pixtral, missing `num_attention_heads` in config.json
+        if self.hparams["architectures"][0] == "LlavaForConditionalGeneration" \
+                and self.hparams.get("model_type") == "mistral":
            self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)

    def set_vocab(self):
@@ -1939,50 +1898,31 @@ class LlamaModel(TextModel):
                raise ValueError(f"Unprocessed experts: {experts}")


-@ModelBase.register(
-    "LlavaForConditionalGeneration", # pixtral
-    "Mistral3ForConditionalGeneration", # mistral small 3.1
-)
+@ModelBase.register("LlavaForConditionalGeneration")
 class LlavaVisionModel(VisionModel):
    img_break_tok_id = -1

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        if self.hparams["model_type"] == "pixtral":
-            # layer_norm_eps is not in config.json, it is hard-coded in modeling_pixtral.py
+            # fix missing config.json values
+            self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 16)
+            self.hparams["num_hidden_layers"] = self.hparams.get("num_hidden_layers", 24)
+            self.hparams["intermediate_size"] = self.hparams.get("intermediate_size", 4096)
+            self.hparams["hidden_size"] = self.hparams.get("hidden_size", 1024)
            self.hparams["layer_norm_eps"] = self.hparams.get("layer_norm_eps", 1e-5)
-            self.img_break_tok_id = self.get_token_id("[IMG_BREAK]")
-            logger.info(f"Image break token id: {self.img_break_tok_id}")
+            self.img_break_tok_id = 12 # see tokenizer_config.json
        else:
            raise ValueError(f"Unsupported model type: {self.hparams['model_type']}")

-    def get_token_id(self, token: str) -> int:
-        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
-        with open(tokenizer_config_file, "r", encoding="utf-8") as f:
-            added_tokens_decoder = json.load(f)['added_tokens_decoder']
-            for id_, token_data in added_tokens_decoder.items():
-                if token_data["content"] == token:
-                    return int(id_)
-        raise ValueError(f"Token '{token}' not found in tokenizer config.")
-
    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        hparams = self.hparams
        if hparams["model_type"] == "pixtral":
            self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.PIXTRAL)
+            # default values below are taken from HF transformers code
            self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"])
-
-            # hidden_act
-            if hparams["hidden_act"] == "silu":
-                self.gguf_writer.add_vision_use_silu(True)
-            elif hparams["hidden_act"] == "gelu":
-                self.gguf_writer.add_vision_use_gelu(True)
-            else:
-                raise ValueError(f"Unsupported hidden_act: {hparams['hidden_act']}")
-
-            # spatial_merge_size
-            if "spatial_merge_size" in self.global_config:
-                self.gguf_writer.add_vision_spatial_merge_size(self.global_config["spatial_merge_size"])
+            self.gguf_writer.add_vision_use_silu(True)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        del bid  # unused
@@ -2011,12 +1951,13 @@ class LlavaVisionModel(VisionModel):
 class SmolVLMModel(VisionModel):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
+        # fix for SmolVLM2, missing some keys in config.json
+        # default values are taken from transformers code
        if self.hparams["model_type"] == "smolvlm_vision":
-            # fix for SmolVLM2, missing some keys in config.json
-            # default values are taken from transformers code
            self.hparams["hidden_size"] = self.hparams.get("hidden_size", 1152)
            self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 16)
            self.hparams["intermediate_size"] = self.hparams.get("intermediate_size", 3072)
+            self.hparams["num_hidden_layers"] = self.hparams.get("num_hidden_layers", 12)

    def set_gguf_parameters(self):
        super().set_gguf_parameters()
@@ -2129,9 +2070,6 @@ class DeciModel(TextModel):
            # if n_heads_in_group is not None, then
            # _num_kv_heads[il] is num_attention_head // n_heads_in_group and
            # _num_heads[il] is num_attention_head
-            # ***dummy layer*** for nemotron 253B
-            # if n_heads_in_group is None and ffn_mult is None
-            # then _num_kv_heads[il] is 0 and _num_heads[il] is 0 and _ffn_dims is 0
            for il in range(len(_block_configs)):
                if _block_configs[il]["attention"]["n_heads_in_group"] is None:
                    if _block_configs[il]["attention"]["replace_with_linear"] is True:
@@ -2143,10 +2081,7 @@ class DeciModel(TextModel):
                else:
                    self._num_kv_heads.append(self.hparams["num_attention_heads"] // _block_configs[il]["attention"]["n_heads_in_group"])
                    self._num_heads.append(self.hparams["num_attention_heads"])
-                if _block_configs[il]["ffn"]["ffn_mult"] is None: # dummy layer
-                    _ffn_multipliers.append(0.0)
-                else:
-                    _ffn_multipliers.append(_block_configs[il]["ffn"]["ffn_mult"])
+                _ffn_multipliers.append(_block_configs[il]["ffn"]["ffn_mult"])
            assert self.block_count == len(self._num_kv_heads)
            assert self.block_count == len(self._num_heads)
            assert self.block_count == len(_ffn_multipliers)
@@ -2584,7 +2519,7 @@ class QwenModel(TextModel):
        self.gguf_writer.add_file_type(self.ftype)


-@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM")
+@ModelBase.register("Qwen2ForCausalLM")
 class Qwen2Model(TextModel):
    model_arch = gguf.MODEL_ARCH.QWEN2

@@ -2596,18 +2531,12 @@ class Qwen2Model(TextModel):

    def set_gguf_parameters(self):
        super().set_gguf_parameters()
-        self._try_set_pooling_type()
        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
            if self.hparams["rope_scaling"].get("type") == "yarn":
                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
                self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])

-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        if self.hf_arch == "Qwen2Model":
-            name = f"model.{name}"  # map to Qwen2ForCausalLM tensors
-        yield from super().modify_tensors(data_torch, name, bid)
-

@ModelBase.register("Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration")
 class Qwen2VLModel(TextModel):
@@ -2633,82 +2562,6 @@ class Qwen2VLModel(TextModel):
        return [(self.map_tensor_name(name), data_torch)]


-@ModelBase.register("Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration")
-class Qwen2VLVisionModel(VisionModel):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.hparams["image_size"] = self.hparams.get("image_size", 560)
-        # rename config.json values
-        self.hparams["num_attention_heads"] = self.hparams.get("num_heads")
-        self.hparams["num_hidden_layers"] = self.hparams.get("depth")
-        if "embed_dim" in self.hparams: # qwen2vl
-            self.hparams["intermediate_size"] = self.hparams.get("hidden_size")
-            self.hparams["hidden_size"] = self.hparams.get("embed_dim")
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        hparams = self.hparams
-        if self.global_config['model_type'] == 'qwen2_vl':
-            self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.QWEN2VL)
-        elif self.global_config['model_type'] == 'qwen2_5_vl':
-            self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.QWEN25VL)
-            self.gguf_writer.add_vision_use_silu(True)
-            # find n_wa_pattern (window attention pattern)
-            fullatt_block_indexes = hparams.get("fullatt_block_indexes")
-            assert fullatt_block_indexes is not None, "fullatt_block_indexes is required for qwen2_5_vl"
-            n_wa_pattern = fullatt_block_indexes[0] + 1
-            # validate n_wa_pattern
-            for i in range(1, len(fullatt_block_indexes)):
-                if fullatt_block_indexes[i] - fullatt_block_indexes[i - 1] != n_wa_pattern:
-                    raise ValueError(f"Invalid fullatt_block_indexes: {fullatt_block_indexes}")
-            self.gguf_writer.add_vision_n_wa_pattern(n_wa_pattern)
-        else:
-            raise ValueError(f"Unknown QwenVL model type: {self.global_config['model_type']}")
-        # default values below are taken from HF tranformers code
-        self.gguf_writer.add_vision_attention_layernorm_eps(self.global_config.get("rms_norm_eps", 1e-6))
-
-    def tensor_force_quant(self, name, new_name, bid, n_dims):
-        del bid, name, n_dims  # unused
-        if ".patch_embd." in new_name:
-            return gguf.GGMLQuantizationType.F16
-        if ".position_embd." in new_name:
-            return gguf.GGMLQuantizationType.F32
-        return False
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        del bid  # unused
-        if name.startswith("visual."):
-            # process visual tensors
-            # split QKV tensors if needed
-            if ".qkv." in name:
-                if data_torch.ndim == 2: # weight
-                    c3, _ = data_torch.shape
-                else: # bias
-                    c3 = data_torch.shape[0]
-                assert c3 % 3 == 0
-                c = c3 // 3
-                wq = data_torch[:c]
-                wk = data_torch[c: c * 2]
-                wv = data_torch[c * 2:]
-                return [
-                    (self.map_tensor_name(name.replace("qkv", "q")), wq),
-                    (self.map_tensor_name(name.replace("qkv", "k")), wk),
-                    (self.map_tensor_name(name.replace("qkv", "v")), wv),
-                ]
-            elif 'patch_embed.proj.weight' in name:
-                # split Conv3D into Conv2Ds
-                c1, c2, kt, kh, kw = data_torch.shape
-                del c1, c2, kh, kw  # unused
-                assert kt == 2, "Current implmentation only support temporal_patch_size of 2"
-                return [
-                    (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight"  , data_torch[:, :, 0, ...]),
-                    (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight.1", data_torch[:, :, 1, ...]),
-                ]
-            else:
-                return [(self.map_tensor_name(name), data_torch)]
-        return [] # skip other tensors
-
-
@ModelBase.register("WavTokenizerDec")
 class WavTokenizerDecModel(TextModel):
    model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC
@@ -2761,13 +2614,6 @@ class Qwen2MoeModel(TextModel):
        if (shared_expert_intermediate_size := self.hparams.get('shared_expert_intermediate_size')) is not None:
            self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size)
            logger.info(f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}")
-        # YaRN is not enabled by default
-        # To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts
-        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
-            if self.hparams["rope_scaling"].get("type") == "yarn":
-                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
-                self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])

    _experts: list[dict[str, Tensor]] | None = None

@@ -3451,7 +3297,29 @@ class BertModel(TextModel):
    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        self.gguf_writer.add_causal_attention(False)
-        self._try_set_pooling_type()
+
+        # get pooling path
+        pooling_path = None
+        module_path = self.dir_model / "modules.json"
+        if module_path.is_file():
+            with open(module_path, encoding="utf-8") as f:
+                modules = json.load(f)
+            for mod in modules:
+                if mod["type"] == "sentence_transformers.models.Pooling":
+                    pooling_path = mod["path"]
+                    break
+
+        # get pooling type
+        if pooling_path is not None:
+            with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f:
+                pooling = json.load(f)
+            if pooling["pooling_mode_mean_tokens"]:
+                pooling_type = gguf.PoolingType.MEAN
+            elif pooling["pooling_mode_cls_token"]:
+                pooling_type = gguf.PoolingType.CLS
+            else:
+                raise NotImplementedError("Only MEAN and CLS pooling types supported")
+            self.gguf_writer.add_pooling_type(pooling_type)

    def set_vocab(self):
        tokens, toktypes, tokpre = self.get_vocab_base()
@@ -3505,7 +3373,14 @@ class BertModel(TextModel):

        return [(self.map_tensor_name(name), data_torch)]

-    def _xlmroberta_tokenizer_init(self) -> None:
+
+@ModelBase.register("RobertaModel")
+class RobertaModel(BertModel):
+    model_arch = gguf.MODEL_ARCH.BERT
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
        # we need the pad_token_id to know how to chop down position_embd matrix
        if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
            self._position_offset = 1 + pad_token_id
@@ -3514,7 +3389,82 @@ class BertModel(TextModel):
        else:
            self._position_offset = None

-    def _xlmroberta_set_vocab(self) -> None:
+    def set_vocab(self):
+        """Support BPE tokenizers for roberta models"""
+        bpe_tok_path = self.dir_model / "tokenizer.json"
+        if bpe_tok_path.exists():
+            self._set_vocab_gpt2()
+            self.gguf_writer.add_add_bos_token(True)
+            self.gguf_writer.add_add_eos_token(True)
+
+            # we need this to validate the size of the token_type embeddings
+            # though currently we are passing all zeros to the token_type embeddings
+            # "Sequence A" or "Sequence B"
+            self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
+
+        else:
+            return super().set_vocab()
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # if name starts with "roberta.", remove the prefix
+        # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
+        if name.startswith("roberta."):
+            name = name[8:]
+
+        # position embeddings start at pad_token_id + 1, so just chop down the weight tensor
+        if name == "embeddings.position_embeddings.weight":
+            if self._position_offset is not None:
+                data_torch = data_torch[self._position_offset:,:]
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
+@ModelBase.register("NomicBertModel")
+class NomicBertModel(BertModel):
+    model_arch = gguf.MODEL_ARCH.NOMIC_BERT
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # the HF config claims n_ctx=8192, but it uses RoPE scaling
+        self.hparams["n_ctx"] = 2048
+
+        # SwigLU activation
+        assert self.hparams["activation_function"] == "swiglu"
+        # this doesn't do anything in the HF version
+        assert self.hparams["causal"] is False
+        # no bias tensors
+        assert self.hparams["qkv_proj_bias"] is False
+        assert self.hparams["mlp_fc1_bias"] is False
+        assert self.hparams["mlp_fc2_bias"] is False
+        # norm at end of layer
+        assert self.hparams["prenorm"] is False
+        # standard RoPE
+        assert self.hparams["rotary_emb_fraction"] == 1.0
+        assert self.hparams["rotary_emb_interleaved"] is False
+        assert self.hparams["rotary_emb_scale_base"] is None
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
+
+
+@ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification")
+class XLMRobertaModel(BertModel):
+    model_arch = gguf.MODEL_ARCH.BERT
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # we need the pad_token_id to know how to chop down position_embd matrix
+        if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
+            self._position_offset = 1 + pad_token_id
+            if "max_position_embeddings" in self.hparams:
+                self.hparams["max_position_embeddings"] -= self._position_offset
+        else:
+            self._position_offset = None
+
+    def set_vocab(self):
        # to avoid TypeError: Descriptors cannot be created directly
        # exception when importing sentencepiece_model_pb2
        os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
@@ -3596,145 +3546,6 @@ class BertModel(TextModel):
        self.gguf_writer.add_add_bos_token(True)
        self.gguf_writer.add_add_eos_token(True)

-
-@ModelBase.register("RobertaModel")
-class RobertaModel(BertModel):
-    model_arch = gguf.MODEL_ARCH.BERT
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
-        # we need the pad_token_id to know how to chop down position_embd matrix
-        if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
-            self._position_offset = 1 + pad_token_id
-            if "max_position_embeddings" in self.hparams:
-                self.hparams["max_position_embeddings"] -= self._position_offset
-        else:
-            self._position_offset = None
-
-    def set_vocab(self):
-        """Support BPE tokenizers for roberta models"""
-        bpe_tok_path = self.dir_model / "tokenizer.json"
-        if bpe_tok_path.exists():
-            self._set_vocab_gpt2()
-            self.gguf_writer.add_add_bos_token(True)
-            self.gguf_writer.add_add_eos_token(True)
-
-            # we need this to validate the size of the token_type embeddings
-            # though currently we are passing all zeros to the token_type embeddings
-            # "Sequence A" or "Sequence B"
-            self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
-
-        else:
-            return super().set_vocab()
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        # if name starts with "roberta.", remove the prefix
-        # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
-        if name.startswith("roberta."):
-            name = name[8:]
-
-        # position embeddings start at pad_token_id + 1, so just chop down the weight tensor
-        if name == "embeddings.position_embeddings.weight":
-            if self._position_offset is not None:
-                data_torch = data_torch[self._position_offset:,:]
-
-        return super().modify_tensors(data_torch, name, bid)
-
-
-@ModelBase.register("NomicBertModel")
-class NomicBertModel(BertModel):
-    model_arch = gguf.MODEL_ARCH.BERT
-
-    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, **kwargs: Any):
-        hparams = kwargs.pop("hparams", None)
-        if hparams is None:
-            hparams = ModelBase.load_hparams(dir_model)
-
-        self.is_moe = bool(hparams.get("moe_every_n_layers"))
-        self.model_arch = gguf.MODEL_ARCH.NOMIC_BERT_MOE if self.is_moe else gguf.MODEL_ARCH.NOMIC_BERT
-
-        super().__init__(dir_model, ftype, fname_out, hparams=hparams, **kwargs)
-
-        self._tokenizer_is_xlmroberta = self._is_tokenizer_xlmroberta()
-        if self._tokenizer_is_xlmroberta:
-            self._xlmroberta_tokenizer_init()
-
-        npos, mtp = self.hparams["n_positions"], self.hparams.get("max_trained_positions", 2048)
-        if npos == 8192 and mtp == 2048:
-            self.hparams["n_positions"] = 2048  # nomic-embed-text v1 and v1.5 are trained for 2048 tokens.
-        elif npos == 2048 and mtp == 2048:
-            self.hparams["n_positions"] = 512   # nomic-embed-text-v2-moe is trained for 512 tokens.
-        else:
-            raise ValueError(f"unrecognized parameters: n_positions={npos}, max_trained_positions={mtp}")
-
-        assert self.hparams["activation_function"] == "gelu" if self.is_moe else "swiglu"
-
-        # this doesn't do anything in the HF version
-        assert self.hparams["causal"] is False
-        # no bias tensors unless MoE
-        assert self.hparams["qkv_proj_bias"] == self.is_moe
-        assert self.hparams["mlp_fc1_bias"]  == self.is_moe
-        assert self.hparams["mlp_fc2_bias"]  == self.is_moe
-
-        # norm at end of layer
-        assert self.hparams["prenorm"] is False
-        # standard RoPE
-        assert self.hparams["rotary_emb_fraction"] == 1.0
-        assert self.hparams["rotary_emb_interleaved"] is False
-        assert self.hparams["rotary_emb_scale_base"] is None
-
-    def set_vocab(self) -> None:
-        if self._tokenizer_is_xlmroberta:
-            return self._xlmroberta_set_vocab()
-        return super().set_vocab()
-
-    def modify_tensors(self, data_torch: torch.Tensor, name: str, bid: int | None) -> Iterable[tuple[str, torch.Tensor]]:
-        # If the tensor is an experts bias tensor, skip it by returning an empty list.
-        if "mlp.experts.bias" in name:
-            return []  # Explicitly return an empty list.
-
-        if "mlp.experts.mlp.w1" in name:
-            data_torch = data_torch.view(self.hparams["num_experts"], self.hparams["n_inner"], self.hparams["n_embd"])
-            name += ".weight"
-
-        if "mlp.experts.mlp.w2" in name:
-            data_torch = data_torch.view(self.hparams["num_experts"], self.hparams["n_inner"], self.hparams["n_embd"])
-            data_torch = data_torch.transpose(1, 2)
-            name += ".weight"
-
-        return [(self.map_tensor_name(name), data_torch)]
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
-        if self.is_moe:
-            self.gguf_writer.add_moe_every_n_layers(self.hparams["moe_every_n_layers"])
-            self.gguf_writer.add_expert_count(self.hparams["num_experts"])
-            self.gguf_writer.add_expert_used_count(self.hparams["moe_top_k"])
-
-    def _is_tokenizer_xlmroberta(self) -> bool:
-        with open(self.dir_model / "tokenizer.json") as f:
-            tokenizer_json = json.load(f)
-        toktyp = tokenizer_json["model"]["type"]
-        if toktyp == "Unigram":
-            return True
-        if toktyp == "WordPiece":
-            return False
-        raise ValueError(f"unknown tokenizer: {toktyp}")
-
-
-@ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification")
-class XLMRobertaModel(BertModel):
-    model_arch = gguf.MODEL_ARCH.BERT
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self._xlmroberta_tokenizer_init()
-
-    def set_vocab(self):
-        self._xlmroberta_set_vocab()
-
    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # if name starts with "roberta.", remove the prefix
        # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
@@ -3915,16 +3726,6 @@ class Gemma3VisionModel(VisionModel):
        # default values below are taken from HF tranformers code
        self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6))
        self.gguf_writer.add_vision_use_gelu(True)
-        # calculate proj_scale_factor (used by tinygemma3 test model)
-        image_seq_length = self.preprocessor_config.get("image_seq_length", 256)
-        n_per_side = int(image_seq_length ** 0.5)
-        image_size = self.hparams["image_size"]
-        patch_size = self.hparams["patch_size"]
-        proj_scale_factor = (image_size // patch_size) // n_per_side
-        if proj_scale_factor > 0 and proj_scale_factor != 4:
-            # we only need to write this if it's not the default value
-            # in this case, we are converting a test model
-            self.gguf_writer.add_vision_projector_scale_factor(proj_scale_factor)

    def tensor_force_quant(self, name, new_name, bid, n_dims):
        del bid, new_name, n_dims  # unused
@@ -3938,9 +3739,6 @@ class Gemma3VisionModel(VisionModel):
    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        del bid  # unused

-        if "vision_model.head." in name:
-            return [] # skip redundant tensors for tinygemma3
-
        if name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \
                or name.startswith("multimodal_projector.") or name.startswith("vision_model."):
            # process vision tensors
@@ -5356,7 +5154,7 @@ class Glm4Model(TextModel):
        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
-        special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["[gMASK]"])
        special_vocab.add_to_gguf(self.gguf_writer)

    def set_gguf_parameters(self):
@@ -5706,12 +5504,7 @@ class BailingMoeModel(TextModel):
        rope_dim = hparams.get("head_dim") or hparams["hidden_size"] // hparams["num_attention_heads"]

        self.gguf_writer.add_rope_dimension_count(rope_dim)
-        if (self.hparams.get("rope_scaling") or {}).get("type") == "yarn" and "factor" in self.hparams["rope_scaling"]:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-            self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
-            self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
-        else:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
        self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
@@ -6013,18 +5806,6 @@ def split_str_to_n_bytes(split_str: str) -> int:
    return n


-def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> str:
-    text_config = hparams.get("text_config", {})
-    vision_config = hparams.get("vision_config", {})
-    arch = hparams["architectures"][0]
-    # if "architectures" is found in the sub-config, use that instead
-    if model_type == ModelType.TEXT and text_config.get("architectures") is not None:
-        arch = text_config["architectures"][0]
-    elif model_type == ModelType.VISION and vision_config.get("architectures") is not None:
-        arch = vision_config["architectures"][0]
-    return arch
-
-
 def main() -> None:
    args = parse_args()

@@ -6077,16 +5858,16 @@ def main() -> None:

    logger.info(f"Loading model: {dir_model.name}")

+    hparams = ModelBase.load_hparams(dir_model)
+
    if args.mmproj:
        if "mmproj" not in fname_out.name:
            fname_out = ModelBase.add_prefix_to_filename(fname_out, "mmproj-")

    with torch.inference_mode():
        output_type = ftype_map[args.outtype]
+        model_architecture = hparams["architectures"][0]
        model_type = ModelType.VISION if args.mmproj else ModelType.TEXT
-        hparams = ModelBase.load_hparams(dir_model)
-        model_architecture = get_model_architecture(hparams, model_type)
-        logger.info(f"Model architecture: {model_architecture}")
        try:
            model_class = ModelBase.from_model_architecture(model_architecture, model_type=model_type)
        except NotImplementedError:
--- a/docs/development/HOWTO-add-model.md
+++ b/docs/development/HOWTO-add-model.md
@@ -9,10 +9,10 @@ Adding a model requires few steps:
 After following these steps, you can open PR.

 Also, it is important to check that the examples and main ggml backends (CUDA, METAL, CPU) are working with the new architecture, especially:
- [main](/tools/main/)
- [imatrix](/tools/imatrix/)
- [quantize](/tools/quantize/)
- [server](/tools/server/)
+- [main](/examples/main/)
+- [imatrix](/examples/imatrix/)
+- [quantize](/examples/quantize/)
+- [server](/examples/server/)

 ### 1. Convert the model to GGUF

--- a/docs/multimodal/MobileVLM.md
+++ b/docs/multimodal/MobileVLM.md
@@ -33,13 +33,13 @@ git clone https://huggingface.co/openai/clip-vit-large-patch14-336
 2. Use `llava_surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:

 ```sh
-python ./tools/mtmd/llava_surgery.py -m path/to/MobileVLM-1.7B
+python ./examples/llava/llava_surgery.py -m path/to/MobileVLM-1.7B
 ```

 3. Use `convert_image_encoder_to_gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:

 ```sh
-python ./tools/mtmd/convert_image_encoder_to_gguf.py \
+python ./examples/llava/convert_image_encoder_to_gguf.py \
    -m path/to/clip-vit-large-patch14-336 \
    --llava-projector path/to/MobileVLM-1.7B/llava.projector \
    --output-dir path/to/MobileVLM-1.7B \
@@ -47,7 +47,7 @@ python ./tools/mtmd/convert_image_encoder_to_gguf.py \
 ```

 ```sh
-python ./tools/mtmd/convert_image_encoder_to_gguf.py \
+python ./examples/llava/convert_image_encoder_to_gguf.py \
    -m path/to/clip-vit-large-patch14-336 \
    --llava-projector path/to/MobileVLM-1.7B_V2/llava.projector \
    --output-dir path/to/MobileVLM-1.7B_V2 \
@@ -69,10 +69,10 @@ Now both the LLaMA part and the image encoder is in the `MobileVLM-1.7B` directo

 ## Android compile and run
 ### compile
-refer to `tools/mtmd/android/build_64.sh`
+refer to `examples/llava/android/build_64.sh`
 ```sh
-mkdir tools/mtmd/android/build_64
-cd tools/mtmd/android/build_64
+mkdir examples/llava/android/build_64
+cd examples/llava/android/build_64
 ../build_64.sh
 ```
 ### run on Android
--- a/docs/multimodal/glmedge.md
+++ b/docs/multimodal/glmedge.md
@@ -25,13 +25,13 @@ git clone https://huggingface.co/THUDM/glm-edge-v-5b or https://huggingface.co/T
 2. Use `glmedge-surgery.py` to split the GLMV-EDGE model to LLM and multimodel projector constituents:

 ```sh
-python ./tools/mtmd/glmedge-surgery.py -m ../model_path
+python ./examples/llava/glmedge-surgery.py -m ../model_path
 ```

 4. Use `glmedge-convert-image-encoder-to-gguf.py` to convert the GLMV-EDGE image encoder to GGUF:

 ```sh
-python ./tools/mtmd/glmedge-convert-image-encoder-to-gguf.py -m ../model_path --llava-projector ../model_path/glm.projector --output-dir ../model_path
+python ./examples/llava/glmedge-convert-image-encoder-to-gguf.py -m ../model_path --llava-projector ../model_path/glm.projector --output-dir ../model_path
 ```

 5. Use `examples/convert_hf_to_gguf.py` to convert the LLM part of GLMV-EDGE to GGUF:
--- a/docs/multimodal/llava.md
+++ b/docs/multimodal/llava.md
@@ -37,19 +37,19 @@ git clone https://huggingface.co/openai/clip-vit-large-patch14-336
 2. Install the required Python packages:

 ```sh
-pip install -r tools/mtmd/requirements.txt
+pip install -r examples/llava/requirements.txt
 ```

 3. Use `llava_surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:

 ```sh
-python ./tools/mtmd/llava_surgery.py -m ../llava-v1.5-7b
+python ./examples/llava/llava_surgery.py -m ../llava-v1.5-7b
 ```

 4. Use `convert_image_encoder_to_gguf.py` to convert the LLaVA image encoder to GGUF:

 ```sh
-python ./tools/mtmd/convert_image_encoder_to_gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
+python ./examples/llava/convert_image_encoder_to_gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
 ```

 5. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF:
@@ -69,12 +69,12 @@ git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b
 2) Install the required Python packages:

 ```sh
-pip install -r tools/mtmd/requirements.txt
+pip install -r examples/llava/requirements.txt
 ```

 3) Use `llava_surgery_v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models:
 ```console
-python tools/mtmd/llava_surgery_v2.py -C -m ../llava-v1.6-vicuna-7b/
+python examples/llava/llava_surgery_v2.py -C -m ../llava-v1.6-vicuna-7b/
 ```
 - you will find a llava.projector and a llava.clip file in your model directory

@@ -88,7 +88,7 @@ curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.jso

 5) Create the visual gguf model:
 ```console
-python ./tools/mtmd/convert_image_encoder_to_gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
+python ./examples/llava/convert_image_encoder_to_gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
 ```
 - This is similar to llava-1.5, the difference is that we tell the encoder that we are working with the pure vision model part of CLIP

--- a/docs/multimodal/minicpmo2.6.md
+++ b/docs/multimodal/minicpmo2.6.md
@@ -29,8 +29,8 @@ cmake --build build --config Release
 Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf) by us)

 ```bash
-python ./tools/mtmd/minicpmv-surgery.py -m ../MiniCPM-o-2_6
-python ./tools/mtmd/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-o-2_6 --minicpmv-projector ../MiniCPM-o-2_6/minicpmv.projector --output-dir ../MiniCPM-o-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 4
+python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-o-2_6
+python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-o-2_6 --minicpmv-projector ../MiniCPM-o-2_6/minicpmv.projector --output-dir ../MiniCPM-o-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 4
 python ./convert_hf_to_gguf.py ../MiniCPM-o-2_6/model

 # quantize int4 version
--- a/docs/multimodal/minicpmv2.5.md
+++ b/docs/multimodal/minicpmv2.5.md
@@ -28,8 +28,8 @@ cmake --build build --config Release
 Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) by us)

 ```bash
-python ./tools/mtmd/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
-python ./tools/mtmd/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2
+python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
+python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2
 python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model

 # quantize int4 version
--- a/docs/multimodal/minicpmv2.6.md
+++ b/docs/multimodal/minicpmv2.6.md
@@ -28,8 +28,8 @@ cmake --build build --config Release
 Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) by us)

 ```bash
-python ./tools/mtmd/minicpmv-surgery.py -m ../MiniCPM-V-2_6
-python ./tools/mtmd/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-2_6 --minicpmv-projector ../MiniCPM-V-2_6/minicpmv.projector --output-dir ../MiniCPM-V-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 3
+python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-V-2_6
+python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-2_6 --minicpmv-projector ../MiniCPM-V-2_6/minicpmv.projector --output-dir ../MiniCPM-V-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 3
 python ./convert_hf_to_gguf.py ../MiniCPM-V-2_6/model

 # quantize int4 version
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -12,29 +12,51 @@ llama_add_compile_flags()

 # examples

+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+
 if (EMSCRIPTEN)
 else()
+    add_subdirectory(batched-bench)
    add_subdirectory(batched)
    add_subdirectory(embedding)
    add_subdirectory(eval-callback)

    add_subdirectory(gguf-hash)
+    add_subdirectory(gguf-split)
    add_subdirectory(gguf)
    add_subdirectory(gritlm)
+    add_subdirectory(imatrix)
+    add_subdirectory(infill)
+    add_subdirectory(llama-bench)
    add_subdirectory(lookahead)
    add_subdirectory(lookup)
+    add_subdirectory(main)
    add_subdirectory(parallel)
    add_subdirectory(passkey)
+    add_subdirectory(perplexity)
+    add_subdirectory(quantize)
    add_subdirectory(retrieval)
+    if (LLAMA_BUILD_SERVER)
+        add_subdirectory(server)
+    endif()
    add_subdirectory(save-load-state)
+    add_subdirectory(run)
    add_subdirectory(simple)
    add_subdirectory(simple-chat)
    add_subdirectory(speculative)
    add_subdirectory(speculative-simple)
+    add_subdirectory(tokenize)
+    add_subdirectory(tts)
    add_subdirectory(gen-docs)
    if (NOT GGML_BACKEND_DL)
-        add_subdirectory(convert-llama2c-to-ggml)
        # these examples use the backends directly and cannot be built with dynamic loading
+        add_subdirectory(convert-llama2c-to-ggml)
+        add_subdirectory(cvector-generator)
+        add_subdirectory(export-lora)
+        add_subdirectory(llava)
+        if (GGML_RPC)
+            add_subdirectory(rpc)
+        endif()
        if (GGML_SYCL)
            add_subdirectory(sycl)
        endif()
--- a/examples/batched-bench/CMakeLists.txt
+++ b/examples/batched-bench/CMakeLists.txt
--- a/examples/batched-bench/README.md
+++ b/examples/batched-bench/README.md
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
--- a/examples/cvector-generator/CMakeLists.txt
+++ b/examples/cvector-generator/CMakeLists.txt
--- a/examples/cvector-generator/README.md
+++ b/examples/cvector-generator/README.md
--- a/examples/cvector-generator/completions.txt
+++ b/examples/cvector-generator/completions.txt
--- a/examples/cvector-generator/cvector-generator.cpp
+++ b/examples/cvector-generator/cvector-generator.cpp
--- a/examples/cvector-generator/mean.hpp
+++ b/examples/cvector-generator/mean.hpp
--- a/examples/cvector-generator/negative.txt
+++ b/examples/cvector-generator/negative.txt
--- a/examples/cvector-generator/pca.hpp
+++ b/examples/cvector-generator/pca.hpp
--- a/examples/cvector-generator/positive.txt
+++ b/examples/cvector-generator/positive.txt
--- a/examples/export-lora/CMakeLists.txt
+++ b/examples/export-lora/CMakeLists.txt
--- a/examples/export-lora/README.md
+++ b/examples/export-lora/README.md
--- a/examples/export-lora/export-lora.cpp
+++ b/examples/export-lora/export-lora.cpp
--- a/examples/gguf-split/CMakeLists.txt
+++ b/examples/gguf-split/CMakeLists.txt
--- a/examples/gguf-split/README.md
+++ b/examples/gguf-split/README.md
--- a/examples/gguf-split/gguf-split.cpp
+++ b/examples/gguf-split/gguf-split.cpp
--- a/examples/gguf-split/tests.sh
+++ b/examples/gguf-split/tests.sh
--- a/examples/imatrix/CMakeLists.txt
+++ b/examples/imatrix/CMakeLists.txt
--- a/examples/imatrix/README.md
+++ b/examples/imatrix/README.md
@@ -1,4 +1,4 @@
-# llama.cpp/tools/imatrix
+# llama.cpp/examples/imatrix

 Compute an importance matrix for a model and given text dataset. Can be used during quantization to enhance the quality of the quantized models.
 More information is available here: https://github.com/ggml-org/llama.cpp/pull/4861
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -46,7 +46,7 @@ private:
    common_params                          m_params;
    std::mutex                             m_mutex;
    int                                    m_last_call = 0;
-    std::vector<char>                      m_src1_data;
+    std::vector<float>                     m_src1_data;
    std::vector<char>                      m_ids; // the expert ids from ggml_mul_mat_id
 };

@@ -93,13 +93,11 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
    const bool is_host = ggml_backend_buffer_is_host(src1->buffer);

    if (!is_host) {
-        const size_t src1_nbytes = ggml_nbytes(src1);
-        m_src1_data.resize(src1_nbytes);
-        ggml_backend_tensor_get(src1, m_src1_data.data(), 0, src1_nbytes);
+        m_src1_data.resize(ggml_nelements(src1));
+        ggml_backend_tensor_get(src1, m_src1_data.data(), 0, ggml_nbytes(src1));
    }

-    const char * data = is_host ? (const char *) src1->data : m_src1_data.data();
-    GGML_ASSERT(src1->nb[0] == ggml_element_size(src1));
+    const float * data = is_host ? (const float *) src1->data : m_src1_data.data();

    // this has been adapted to the new format of storing merged experts in a single 3d tensor
    // ref: https://github.com/ggml-org/llama.cpp/pull/6387
@@ -146,7 +144,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *

                    const int64_t i11 = idx % src1->ne[1];
                    const int64_t i12 = row;
-                    const float * x = (const float *)(data + i11*src1->nb[1] + i12*src1->nb[2]);
+                    const float * x = (const float *)((const char *)data + i11*src1->nb[1] + i12*src1->nb[2]);

                    for (int j = 0; j < (int)src1->ne[0]; ++j) {
                        e.values[e_start + j] += x[j]*x[j];
@@ -182,7 +180,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
        ++e.ncall;
        LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
        for (int row = 0; row < (int)src1->ne[1]; ++row) {
-            const float * x = (const float *) (data + row * src1->nb[1]);
+            const float * x = data + row * src1->ne[0];
            for (int j = 0; j < (int)src1->ne[0]; ++j) {
                e.values[j] += x[j]*x[j];
                e.counts[j]++;
@@ -585,6 +583,7 @@ int main(int argc, char ** argv) {
    params.out_file = "imatrix.dat" ;

    params.n_ctx = 512;
+    params.logits_all = true;
    params.escape = false;

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) {
--- a/examples/infill/CMakeLists.txt
+++ b/examples/infill/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(TARGET llama-infill)  # executable produced by this directory
+add_executable(${TARGET} infill.cpp)  # single translation unit
+target_compile_features(${TARGET} PRIVATE cxx_std_17)  # build as C++17
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})  # project libs + threads
+install(TARGETS ${TARGET} RUNTIME)  # install the binary once the target is fully configured
--- a/examples/infill/README.md
+++ b/examples/infill/README.md
@@ -0,0 +1,47 @@
+# llama.cpp/examples/infill
+
+This example shows how to use the infill mode with Code Llama models supporting infill mode.
+Currently the 7B and 13B models support infill mode.
+
+Infill supports most of the options available in the main example.
+
+For further information, have a look at the main example's README in llama.cpp/examples/main/README.md
+
+## Common Options
+
+In this section, we cover the most commonly used options for running the `infill` program with the LLaMA models:
+
+-   `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
+-   `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
+-   `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
+-   `-c N, --ctx-size N`: Set the size of the prompt context. The default is 4096, but if a LLaMA model was built with a longer context, increasing this value will provide better results for longer input/inference.
+-   `--spm-infill`: Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this.
+
+## Input Prompts
+
+The `infill` program provides several ways to interact with the LLaMA models using input prompts:
+
+-   `--in-prefix PROMPT_BEFORE_CURSOR`: Provide the prefix directly as a command-line option.
+-   `--in-suffix PROMPT_AFTER_CURSOR`: Provide the suffix directly as a command-line option.
+-   `--interactive-first`: Run the program in interactive mode and wait for input right away. (More on this below.)
+
+## Interaction
+
+The `infill` program offers a seamless way to interact with LLaMA models, allowing users to receive real-time infill suggestions. The interactive mode can be triggered using `--interactive` or `--interactive-first`.
+
+### Interaction Options
+
+-   `-i, --interactive`: Run the program in interactive mode, allowing users to get real-time code suggestions from the model.
+-   `--interactive-first`: Run the program in interactive mode and immediately wait for user input before starting the text generation.
+-   `--color`: Enable colorized output to visually distinguish between prompts, user input, and generated text.
+
+### Example
+
+Download a model that supports infill, for example CodeLlama:
+```console
+scripts/hf.sh --repo TheBloke/CodeLlama-13B-GGUF --file codellama-13b.Q5_K_S.gguf --outdir models
+```
+
+```bash
+./llama-infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n    print(\"hell" --in-suffix "\n   print(\"goodbye world\")\n    "
+```
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -0,0 +1,590 @@
+#include "arg.h"
+#include "common.h"
+#include "console.h"
+#include "sampling.h"
+#include "log.h"
+#include "llama.h"
+
+#include <cassert>
+#include <cinttypes>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <ctime>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
+#include <signal.h>
+#include <unistd.h>
+#elif defined (_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include <windows.h>
+#include <signal.h>
+#endif
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+static llama_context           ** g_ctx;           // set in main() to &ctx; dereferenced by sigint_handler
+static llama_model             ** g_model;         // set in main() to &model
+static common_sampler          ** g_smpl;          // set in main() to &smpl; dereferenced by sigint_handler
+static common_params            * g_params;        // set in main() to &params
+static std::vector<llama_token> * g_input_tokens;  // set in main() to &input_tokens
+static std::ostringstream       * g_output_ss;     // set in main() to &output_ss
+static std::vector<llama_token> * g_output_tokens; // set in main() to &output_tokens
+
+static bool is_interacting = false; // toggled by sigint_handler and by the main loop
+
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
+static void sigint_handler(int signo) {
+    if (signo != SIGINT) {
+        return;
+    }
+    // SIGINT while generating: only raise the interaction flag
+    if (!is_interacting) {
+        is_interacting = true;
+        return;
+    }
+    // SIGINT while already interacting: clean up the console, print perf stats, exit with 130
+    console::cleanup();
+    LOG("\n");
+    common_perf_print(*g_ctx, *g_smpl);
+    LOG("Interrupted by user\n");
+    common_log_pause(common_log_main()); // make sure all logs are flushed
+    _exit(130);
+}
+#endif
+
+int main(int argc, char ** argv) { // llama-infill entry point; returns 1 on parse/model/prompt-size/decode failure, 0 otherwise
+    common_params params;
+    g_params = &params;
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) {
+        return 1;
+    }
+
+    common_init();
+
+    auto & sparams = params.sampling;
+
+    console::init(params.simple_io, params.use_color);
+    atexit([]() { console::cleanup(); });
+
+    if (params.logits_all) {
+        LOG_ERR("\n************\n");
+        LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
+        LOG_ERR("************\n\n");
+
+        return 0;
+    }
+
+    if (params.embedding) {
+        LOG_ERR("\n************\n");
+        LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
+        LOG_ERR("************\n\n");
+
+        return 0;
+    }
+
+    if (params.n_ctx != 0 && params.n_ctx < 8) {
+        LOG_WRN("%s: minimum context size is 8, using minimum size.\n", __func__);
+        params.n_ctx = 8;
+    }
+
+    if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) {
+        LOG_ERR("\n************\n");
+        LOG_ERR("%s: please use '--interactive-first' or specify '--in-prefix' and/or '--in-suffix'\n", __func__);
+        LOG_ERR("************\n\n");
+
+        return 0;
+    }
+
+    if (params.rope_freq_base != 0.0) {
+        LOG_WRN("%s: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
+    }
+
+    if (params.rope_freq_scale != 0.0) {
+        LOG_WRN("%s: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
+    }
+
+    LOG_INF("%s: llama backend init\n", __func__);
+    llama_backend_init();
+    llama_numa_init(params.numa);
+
+    llama_model * model = nullptr;
+    llama_context * ctx = nullptr;
+    common_sampler * smpl = nullptr;
+
+    g_model = &model;
+    g_ctx = &ctx;
+    g_smpl = &smpl;
+
+    // load the model and apply lora adapter, if any
+    LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
+    common_init_result llama_init = common_init_from_params(params);
+
+    model = llama_init.model.get();
+    ctx = llama_init.context.get();
+
+    if (model == NULL) {
+        LOG_ERR("%s: unable to load model\n", __func__);
+        return 1;
+    }
+
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
+    const int n_ctx_train = llama_model_n_ctx_train(model);
+    const int n_ctx = llama_n_ctx(ctx);
+    LOG_DBG("n_ctx: %d\n", n_ctx);
+
+    if (n_ctx > n_ctx_train) {
+        LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
+    }
+
+    // print system information
+    {
+        LOG_INF("\n");
+        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
+    }
+    const bool add_bos = llama_vocab_get_add_bos(vocab);
+    GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
+
+    std::vector<llama_token> embd_inp;
+    std::vector<llama_token> embd_end;
+    std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
+    std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);
+
+    GGML_ASSERT(llama_vocab_fim_pre(vocab) >= 0);
+    GGML_ASSERT(llama_vocab_fim_suf(vocab) >= 0);
+
+    inp_pfx.insert(inp_pfx.begin(), llama_vocab_fim_pre(vocab));
+    inp_sfx.insert(inp_sfx.begin(), llama_vocab_fim_suf(vocab));
+
+    embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
+    embd_end = params.spm_infill ? inp_pfx : inp_sfx;
+    if (add_bos) {
+        embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab));
+    }
+    embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
+
+    const llama_token middle_token = llama_vocab_fim_mid(vocab);
+    if (middle_token >= 0) {
+        embd_inp.push_back(middle_token);
+    }
+
+    LOG_DBG("add_bos: %d\n", add_bos);
+    LOG_DBG("prefix: \"%s\"\n", params.input_prefix.c_str());
+    LOG_DBG("suffix: \"%s\"\n", params.input_suffix.c_str());
+    LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str());
+
+    // Should not run without any tokens
+    if (embd_inp.empty()) {
+        embd_inp.push_back(llama_vocab_bos(vocab));
+        LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
+    }
+
+    if ((int) embd_inp.size() > n_ctx - 4) {
+        LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
+        return 1;
+    }
+
+    // number of tokens to keep when resetting context
+    if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size()) {
+        params.n_keep = (int)embd_inp.size();
+    }
+
+    LOG_INF("inp_pfx: %s\n", string_from(ctx, inp_pfx).c_str());
+    LOG_INF("inp_sfx: %s\n", string_from(ctx, inp_sfx).c_str());
+
+    // enable interactive mode if interactive start is specified
+    if (params.interactive_first) {
+        params.interactive = true;
+    }
+
+    if (params.verbose_prompt) {
+        LOG_INF("\n");
+        LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
+        LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
+        for (int i = 0; i < (int) embd_inp.size(); i++) {
+            LOG_INF("%6d -> '%s'\n", embd_inp[i], common_token_to_piece(ctx, embd_inp[i]).c_str());
+        }
+
+        if (params.n_keep > 0) {
+            LOG_INF("%s: static prompt based on n_keep: '", __func__);
+            for (int i = 0; i < params.n_keep; i++) {
+                LOG_CNT("%s", common_token_to_piece(ctx, embd_inp[i]).c_str());
+            }
+            LOG_CNT("'\n");
+        }
+        LOG_INF("\n");
+    }
+
+    if (params.interactive) {
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
+        struct sigaction sigint_action;
+        sigint_action.sa_handler = sigint_handler;
+        sigemptyset (&sigint_action.sa_mask);
+        sigint_action.sa_flags = 0;
+        sigaction(SIGINT, &sigint_action, NULL);
+#elif defined (_WIN32)
+        auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
+            return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
+        };
+        SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
+#endif
+
+        LOG_INF("%s: interactive mode on.\n", __func__);
+
+        if (params.input_prefix_bos) {
+            LOG_INF("Input prefix with BOS\n");
+        }
+
+        if (!params.input_prefix.empty()) {
+            LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str());
+        }
+
+        if (!params.input_suffix.empty()) {
+            LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str());
+        }
+    }
+    smpl = common_sampler_init(model, sparams);
+
+    LOG_INF("sampler seed: %u\n",     common_sampler_get_seed(smpl));
+    LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
+    LOG_INF("sampler chain: %s\n",    common_sampler_print(smpl).c_str());
+
+    LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
+
+    LOG_INF("\n");
+    LOG_INF("\n#####  Infill mode  #####\n\n");
+    if (params.interactive) {
+        const char *control_message;
+        if (params.multiline_input) {
+            control_message = " - To return control to LLaMA, end your input with '\\'.\n"
+                              " - To return control without starting a new line, end your input with '/'.\n";
+        } else {
+            control_message = " - Press Return to return control to LLaMA.\n"
+                              " - To return control without starting a new line, end your input with '/'.\n"
+                              " - If you want to submit another line, end your input with '\\'.\n";
+        }
+        LOG_INF("== Running in interactive mode. ==\n");
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
+        LOG_INF(       " - Press Ctrl+C to interject at any time.\n");
+#endif
+        LOG_INF(       "%s\n", control_message);
+
+        is_interacting = params.interactive_first;
+    }
+
+    bool input_echo = true;
+
+    int n_past     = 0;
+    int n_remain   = params.n_predict;
+    int n_consumed = 0;
+
+    std::vector<llama_token> input_tokens;  g_input_tokens  = &input_tokens;
+    std::vector<llama_token> output_tokens; g_output_tokens = &output_tokens;
+    std::ostringstream       output_ss;     g_output_ss     = &output_ss;
+
+    // the first thing we will do is to output the prompt, so set color accordingly
+    console::set_display(console::prompt);
+
+    std::vector<llama_token> embd;
+
+    while (n_remain != 0 || params.interactive) {
+        // predict
+        if (!embd.empty()) {
+            // Note: n_ctx - 4 here is to match the logic for commandline prompt handling via
+            // --prompt or --file which uses the same value.
+            int max_embd_size = n_ctx - 4;
+
+            // Ensure the input doesn't exceed the context size by truncating embd if necessary.
+            if ((int) embd.size() > max_embd_size) {
+                const int skipped_tokens = (int) embd.size() - max_embd_size;
+                embd.resize(max_embd_size);
+
+                console::set_display(console::error);
+                LOG_WRN("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
+                console::set_display(console::reset);
+            }
+
+            // infinite text generation via context swapping
+            // if we run out of context:
+            // - take the n_keep first tokens from the original prompt (via n_past)
+            // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
+            if (n_past + (int) embd.size() > n_ctx) {
+                if (params.n_predict == -2) {
+                    LOG_DBG("\n\n%s: context full and n_predict == %d => stopping\n", __func__, params.n_predict); // value is already negative: no literal '-' in the format
+                    break;
+                }
+
+                const int n_left    = n_past - params.n_keep - 1;
+                const int n_discard = n_left/2;
+
+                LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
+                    n_past, n_left, n_ctx, params.n_keep, n_discard);
+
+                llama_kv_self_seq_rm (ctx, 0, params.n_keep + 1            , params.n_keep + n_discard + 1);
+                llama_kv_self_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
+
+                n_past -= n_discard;
+
+                LOG_DBG("after swap: n_past = %d\n", n_past);
+
+                LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
+
+            }
+
+            // evaluate tokens in batches
+            // embd is typically prepared beforehand to fit within a batch, but not always
+            for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
+                int n_eval = (int) embd.size() - i;
+                if (n_eval > params.n_batch) {
+                    n_eval = params.n_batch;
+                }
+
+                LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
+
+                if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval))) {
+                    LOG_ERR("%s : failed to eval\n", __func__);
+                    return 1;
+                }
+
+                n_past += n_eval;
+
+                LOG_DBG("n_past = %d\n", n_past);
+            }
+
+        }
+
+        embd.clear();
+
+        if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
+            const llama_token id = common_sampler_sample(smpl, ctx, -1);
+
+            common_sampler_accept(smpl, id, true);
+
+            // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());
+
+            embd.push_back(id);
+
+            // echo this to console
+            input_echo = true;
+
+            // decrement remaining sampling budget
+            --n_remain;
+
+            LOG_DBG("n_remain: %d\n", n_remain);
+        } else {
+            // some user input remains from prompt or interaction, forward it to processing
+            LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
+            while ((int) embd_inp.size() > n_consumed) {
+                embd.push_back(embd_inp[n_consumed]);
+
+                // push the prompt in the sampling context in order to apply repetition penalties later
+                // for the prompt, we don't apply grammar rules
+                common_sampler_accept(smpl, embd_inp[n_consumed], false);
+
+                ++n_consumed;
+                if ((int) embd.size() >= params.n_batch) {
+                    break;
+                }
+            }
+        }
+
+        // display text
+        if (input_echo) {
+            for (auto id : embd) {
+                const std::string token_str = common_token_to_piece(ctx, id);
+                LOG("%s", token_str.c_str());
+
+                if (embd.size() > 1) {
+                    input_tokens.push_back(id);
+                } else {
+                    output_tokens.push_back(id);
+                    output_ss << token_str;
+                }
+            }
+        }
+        // reset color to default if there is no pending user input
+        if (input_echo && (int) embd_inp.size() == n_consumed) {
+            console::set_display(console::reset);
+        }
+
+        // if not currently processing queued inputs;
+        if ((int) embd_inp.size() <= n_consumed) {
+            // deal with eot token in infill mode
+            if ((common_sampler_last(smpl) == llama_vocab_eot(vocab) || is_interacting) && params.interactive){
+                if (is_interacting && !params.interactive_first) {
+                    // print an eot token
+                    LOG("%s", common_token_to_piece(ctx, llama_vocab_eot(vocab)).c_str());
+                }
+                LOG("\n");
+                console::set_display(console::user_input);
+                std::string buffer;
+                std::string line;
+                bool another_line=true;
+                // set a new prefix via stdin
+                do {
+                    another_line = console::readline(line, params.multiline_input);
+                    buffer += line;
+                } while (another_line);
+                // check if we got an empty line, if so we use the old input
+                if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
+                    params.input_prefix = buffer;
+                }
+                buffer.clear();
+                // set a new suffix via stdin
+                do {
+                    another_line = console::readline(line, params.multiline_input);
+                    buffer += line;
+                } while (another_line);
+                // check if we got an empty line
+                if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
+                    params.input_suffix = buffer;
+                }
+                buffer.clear();
+                // done taking input, reset color
+                console::set_display(console::reset);
+
+                if (params.escape) {
+                    //process escape sequences, for the initial prompt this is done in common.cpp when we load the params, but for the interactive mode we need to do it here
+                    string_process_escapes(params.input_prefix);
+                    string_process_escapes(params.input_suffix);
+                }
+
+                // tokenize new prefix and suffix (reuse the outer vectors instead of shadowing them)
+                inp_pfx = common_tokenize(ctx, params.input_prefix, false);
+                inp_sfx = common_tokenize(ctx, params.input_suffix, false);
+
+                inp_pfx.insert(inp_pfx.begin(), llama_vocab_fim_pre(vocab));
+                inp_sfx.insert(inp_sfx.begin(), llama_vocab_fim_suf(vocab));
+
+                embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
+                embd_end = params.spm_infill ? inp_pfx : inp_sfx;
+                if (add_bos) {
+                    embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab));
+                }
+                embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
+
+                if (middle_token >= 0) {
+                    embd_inp.push_back(middle_token);
+                }
+
+                embd.clear();
+                n_remain = params.n_predict;
+                n_past = 0;
+                n_consumed = 0;
+                is_interacting = false;
+            }
+            // deal with end of generation tokens in interactive mode
+            else if (llama_vocab_is_eog(vocab, common_sampler_last(smpl))) {
+                LOG_DBG("found EOS token\n");
+
+                if (params.interactive) {
+
+                    is_interacting = true;
+                    LOG("\n");
+                    console::set_display(console::user_input);
+                }
+            }
+
+            if (n_past > 0 && is_interacting && !params.interactive) { // NOTE(review): is_interacting seems to be set only in interactive mode, so this branch looks unreachable - confirm
+                LOG_DBG("waiting for user input\n");
+
+                if (params.input_prefix_bos) {
+                    LOG_DBG("adding input prefix BOS token\n");
+                    embd_inp.push_back(llama_vocab_bos(vocab));
+                }
+
+                std::string buffer;
+                if (!params.input_prefix.empty()) {
+                    LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
+                    buffer += params.input_prefix;
+                    LOG("%s", buffer.c_str());
+                }
+
+                std::string line;
+                bool another_line = true;
+                do {
+                    another_line = console::readline(line, params.multiline_input);
+                    buffer += line;
+                } while (another_line);
+
+                // done taking input, reset color
+                console::set_display(console::reset);
+
+                // Add tokens to embd only if the input buffer is non-empty
+                // Entering an empty line lets the user pass control back
+                if (buffer.length() > 1) {
+                    // append input suffix if any
+                    if (!params.input_suffix.empty()) {
+                        LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
+                        buffer += params.input_suffix;
+                        LOG("%s", params.input_suffix.c_str());
+                    }
+
+                    LOG_DBG("buffer: '%s'\n", buffer.c_str());
+
+                    const size_t original_size = embd_inp.size();
+
+                    const auto line_inp = common_tokenize(ctx, buffer, false);
+                    LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());
+
+                    embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
+
+                    for (size_t i = original_size; i < embd_inp.size(); ++i) {
+                        const llama_token token = embd_inp[i];
+                        output_tokens.push_back(token);
+                        output_ss << common_token_to_piece(ctx, token);
+                    }
+
+                    n_remain -= line_inp.size();
+                    LOG_DBG("n_remain: %d\n", n_remain);
+                } else {
+                    LOG_DBG("empty line, passing control back\n");
+                }
+
+                input_echo = false; // do not echo this again
+            }
+
+            if (n_past > 0) {
+                if (is_interacting) {
+                    common_sampler_reset(smpl);
+                }
+                is_interacting = false;
+            }
+        }
+
+        // end of generation
+        if (!embd.empty() && llama_vocab_is_eog(vocab, embd.back()) && !params.interactive) {
+            break;
+        }
+
+        // In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
+        // We skip this logic when n_predict == -1 (infinite) or -2 (stop at context size).
+        if (params.interactive && n_remain <= 0 && params.n_predict >= 0) {
+            n_remain = params.n_predict;
+            is_interacting = true;
+        }
+    }
+    if (!params.interactive && n_remain <= 0) {
+        LOG("%s", common_token_to_piece(ctx, llama_vocab_eot(vocab)).c_str());
+    }
+
+    LOG("\n");
+    common_perf_print(ctx, smpl);
+
+    common_sampler_free(smpl);
+    llama_backend_free();
+
+    return 0;
+}
--- a/examples/llama-bench/CMakeLists.txt
+++ b/examples/llama-bench/CMakeLists.txt
--- a/examples/llama-bench/README.md
+++ b/examples/llama-bench/README.md
@@ -1,4 +1,4 @@
-# llama.cpp/tools/llama-bench
+# llama.cpp/examples/llama-bench

 Performance testing tool for llama.cpp.

@@ -28,7 +28,6 @@ options:
  -p, --n-prompt <n>                        (default: 512)
  -n, --n-gen <n>                           (default: 128)
  -pg <pp,tg>                               (default: )
-  -d, --n-depth <n>                         (default: 0)
  -b, --batch-size <n>                      (default: 2048)
  -ub, --ubatch-size <n>                    (default: 512)
  -ctk, --cache-type-k <t>                  (default: f16)
@@ -67,8 +66,6 @@ With the exception of `-r`, `-o` and `-v`, all options can be specified multiple

 Each test is repeated the number of times given by `-r`, and the results are averaged. The results are given in average tokens per second (t/s) and standard deviation. Some output formats (e.g. json) also include the individual results of each repetition.

-Using the `-d <n>` option, each test can be run at a specified context depth, prefilling the KV cache with `<n>` tokens.
-
 For a description of the other options, see the [main example](../main/README.md).

 Note:
@@ -151,19 +148,6 @@ $ ./llama-bench -ngl 10,20,30,31,32,33,34,35
 | llama 7B mostly Q4_0           |   3.56 GiB |     6.74 B | CUDA       |  35 | pp 512     |   2400.01 ± 7.72 |
 | llama 7B mostly Q4_0           |   3.56 GiB |     6.74 B | CUDA       |  35 | tg 128     |    131.66 ± 0.49 |

-### Different prefilled context
-
-```
-$ ./llama-bench -d 0,512
-```
-
-| model                          |       size |     params | backend    | ngl |            test |                  t/s |
-| ------------------------------ | ---------: | ---------: | ---------- | --: | --------------: | -------------------: |
-| qwen2 7B Q4_K - Medium         |   4.36 GiB |     7.62 B | CUDA       |  99 |           pp512 |      7340.20 ± 23.45 |
-| qwen2 7B Q4_K - Medium         |   4.36 GiB |     7.62 B | CUDA       |  99 |           tg128 |        120.60 ± 0.59 |
-| qwen2 7B Q4_K - Medium         |   4.36 GiB |     7.62 B | CUDA       |  99 |    pp512 @ d512 |      6425.91 ± 18.88 |
-| qwen2 7B Q4_K - Medium         |   4.36 GiB |     7.62 B | CUDA       |  99 |    tg128 @ d512 |        116.71 ± 0.60 |
-
 ## Output formats

 By default, llama-bench outputs the results in markdown format. The results can be output in other formats by using the `-o` option.
@@ -186,9 +170,9 @@ $ ./llama-bench -o csv
 ```

 ```csv
-build_commit,build_number,cpu_info,gpu_info,backends,model_filename,model_type,model_size,model_n_params,n_batch,n_ubatch,n_threads,cpu_mask,cpu_strict,poll,type_k,type_v,n_gpu_layers,split_mode,main_gpu,no_kv_offload,flash_attn,tensor_split,use_mmap,embeddings,n_prompt,n_gen,n_depth,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
-"8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","512","0","0","2025-04-24T11:57:09Z","70285660","982040","7285.676949","100.064434"
-"8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","0","128","0","2025-04-24T11:57:10Z","1067431600","3834831","119.915244","0.430617"
+build_commit,build_number,cuda,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
+"3469684","1275","1","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","512","0","2023-09-23T12:09:01Z","212155977","732372","2413.341687","8.305961"
+"3469684","1275","1","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","0","128","2023-09-23T12:09:02Z","969320879","2728399","132.052051","0.371342"
 ```

 ### JSON
@@ -200,78 +184,64 @@ $ ./llama-bench -o json
 ```json
 [
  {
-    "build_commit": "8cf427ff",
-    "build_number": 5163,
-    "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor",
-    "gpu_info": "NVIDIA GeForce RTX 4080",
-    "backends": "CUDA",
-    "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf",
-    "model_type": "qwen2 7B Q4_K - Medium",
-    "model_size": 4677120000,
-    "model_n_params": 7615616512,
-    "n_batch": 2048,
-    "n_ubatch": 512,
-    "n_threads": 8,
-    "cpu_mask": "0x0",
-    "cpu_strict": false,
-    "poll": 50,
-    "type_k": "f16",
-    "type_v": "f16",
+    "build_commit": "3469684",
+    "build_number": 1275,
+    "cuda": true,
+    "metal": false,
+    "gpu_blas": true,
+    "blas": true,
+    "cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K",
+    "gpu_info": "NVIDIA GeForce RTX 3090 Ti",
+    "model_filename": "models/7B/ggml-model-q4_0.gguf",
+    "model_type": "llama 7B mostly Q4_0",
+    "model_size": 3825065984,
+    "model_n_params": 6738415616,
+    "n_batch": 512,
+    "n_threads": 16,
+    "f16_kv": true,
    "n_gpu_layers": 99,
-    "split_mode": "layer",
    "main_gpu": 0,
-    "no_kv_offload": false,
-    "flash_attn": false,
+    "mul_mat_q": true,
    "tensor_split": "0.00",
-    "use_mmap": true,
-    "embeddings": false,
    "n_prompt": 512,
    "n_gen": 0,
-    "n_depth": 0,
-    "test_time": "2025-04-24T11:58:50Z",
-    "avg_ns": 72135640,
-    "stddev_ns": 1453752,
-    "avg_ts": 7100.002165,
-    "stddev_ts": 140.341520,
-    "samples_ns": [ 74601900, 71632900, 71745200, 71952700, 70745500 ],
-    "samples_ts": [ 6863.1, 7147.55, 7136.37, 7115.79, 7237.21 ]
+    "test_time": "2023-09-23T12:09:57Z",
+    "avg_ns": 212365953,
+    "stddev_ns": 985423,
+    "avg_ts": 2410.974041,
+    "stddev_ts": 11.163766,
+    "samples_ns": [ 213837238, 211635853, 212328053, 211329715, 212698907 ],
+    "samples_ts": [ 2394.34, 2419.25, 2411.36, 2422.75, 2407.16 ]
  },
  {
-    "build_commit": "8cf427ff",
-    "build_number": 5163,
-    "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor",
-    "gpu_info": "NVIDIA GeForce RTX 4080",
-    "backends": "CUDA",
-    "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf",
-    "model_type": "qwen2 7B Q4_K - Medium",
-    "model_size": 4677120000,
-    "model_n_params": 7615616512,
-    "n_batch": 2048,
-    "n_ubatch": 512,
-    "n_threads": 8,
-    "cpu_mask": "0x0",
-    "cpu_strict": false,
-    "poll": 50,
-    "type_k": "f16",
-    "type_v": "f16",
+    "build_commit": "3469684",
+    "build_number": 1275,
+    "cuda": true,
+    "metal": false,
+    "gpu_blas": true,
+    "blas": true,
+    "cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K",
+    "gpu_info": "NVIDIA GeForce RTX 3090 Ti",
+    "model_filename": "models/7B/ggml-model-q4_0.gguf",
+    "model_type": "llama 7B mostly Q4_0",
+    "model_size": 3825065984,
+    "model_n_params": 6738415616,
+    "n_batch": 512,
+    "n_threads": 16,
+    "f16_kv": true,
    "n_gpu_layers": 99,
-    "split_mode": "layer",
    "main_gpu": 0,
-    "no_kv_offload": false,
-    "flash_attn": false,
+    "mul_mat_q": true,
    "tensor_split": "0.00",
-    "use_mmap": true,
-    "embeddings": false,
    "n_prompt": 0,
    "n_gen": 128,
-    "n_depth": 0,
-    "test_time": "2025-04-24T11:58:51Z",
-    "avg_ns": 1076767880,
-    "stddev_ns": 9449585,
-    "avg_ts": 118.881588,
-    "stddev_ts": 1.041811,
-    "samples_ns": [ 1075361300, 1065089400, 1071761200, 1081934900, 1089692600 ],
-    "samples_ts": [ 119.03, 120.178, 119.43, 118.307, 117.464 ]
+    "test_time": "2023-09-23T12:09:59Z",
+    "avg_ns": 977425219,
+    "stddev_ns": 9268593,
+    "avg_ts": 130.965708,
+    "stddev_ts": 1.238924,
+    "samples_ns": [ 984472709, 974901233, 989474741, 970729355, 967548060 ],
+    "samples_ts": [ 130.019, 131.295, 129.362, 131.86, 132.293 ]
  }
 ]
 ```
@@ -284,8 +254,8 @@ $ ./llama-bench -o jsonl
 ```

 ```json lines
-{"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 512, "n_gen": 0, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 70497220, "stddev_ns": 883196, "avg_ts": 7263.609157, "stddev_ts": 90.940578, "samples_ns": [ 71551000, 71222800, 70364100, 69439100, 69909100 ],"samples_ts": [ 7155.74, 7188.71, 7276.44, 7373.37, 7323.8 ]}
-{"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 0, "n_gen": 128, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 1068078400, "stddev_ns": 6279455, "avg_ts": 119.844681, "stddev_ts": 0.699739, "samples_ns": [ 1066331700, 1064864900, 1079042600, 1063328400, 1066824400 ],"samples_ts": [ 120.038, 120.203, 118.624, 120.377, 119.982 ]}
+{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":512,"n_gen":0,"test_time":"2023-09-23T12:09:57Z","avg_ns":212365953,"stddev_ns":985423,"avg_ts":2410.974041,"stddev_ts":11.163766,"samples_ns":[213837238,211635853,212328053,211329715,212698907],"samples_ts":[2394.34,2419.25,2411.36,2422.75,2407.16]}
+{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":0,"n_gen":128,"test_time":"2023-09-23T12:09:59Z","avg_ns":977425219,"stddev_ns":9268593,"avg_ts":130.965708,"stddev_ts":1.238924,"samples_ns":[984472709,974901233,989474741,970729355,967548060],"samples_ts":[130.019,131.295,129.362,131.86,132.293]}
 ```


@@ -301,32 +271,25 @@ $ ./llama-bench -o sql
 CREATE TABLE IF NOT EXISTS test (
  build_commit TEXT,
  build_number INTEGER,
+  cuda INTEGER,
+  metal INTEGER,
+  gpu_blas INTEGER,
+  blas INTEGER,
  cpu_info TEXT,
  gpu_info TEXT,
-  backends TEXT,
  model_filename TEXT,
  model_type TEXT,
  model_size INTEGER,
  model_n_params INTEGER,
  n_batch INTEGER,
-  n_ubatch INTEGER,
  n_threads INTEGER,
-  cpu_mask TEXT,
-  cpu_strict INTEGER,
-  poll INTEGER,
-  type_k TEXT,
-  type_v TEXT,
+  f16_kv INTEGER,
  n_gpu_layers INTEGER,
-  split_mode TEXT,
  main_gpu INTEGER,
-  no_kv_offload INTEGER,
-  flash_attn INTEGER,
+  mul_mat_q INTEGER,
  tensor_split TEXT,
-  use_mmap INTEGER,
-  embeddings INTEGER,
  n_prompt INTEGER,
  n_gen INTEGER,
-  n_depth INTEGER,
  test_time TEXT,
  avg_ns INTEGER,
  stddev_ns INTEGER,
@@ -334,6 +297,6 @@ CREATE TABLE IF NOT EXISTS test (
  stddev_ts REAL
 );

-INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '512', '0', '0', '2025-04-24T12:00:08Z', '69905000', '519516', '7324.546977', '54.032613');
-INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '0', '128', '0', '2025-04-24T12:00:09Z', '1063608780', '4464130', '120.346696', '0.504647');
+INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634');
+INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692');
 ```
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -36,46 +36,6 @@ static uint64_t get_time_ns() {
    return std::chrono::nanoseconds(clock::now().time_since_epoch()).count();
 }

-static bool tensor_buft_override_equal(const llama_model_tensor_buft_override& a, const llama_model_tensor_buft_override& b) {
-    if (a.pattern != b.pattern) {
-        // cString comparison that may be null
-        if (a.pattern == nullptr || b.pattern == nullptr) {
-            return false;
-        }
-        if (strcmp(a.pattern, b.pattern) != 0) {
-            return false;
-        }
-    }
-    if (a.buft != b.buft) {
-        return false;
-    }
-    return true;
-}
-
-static bool vec_tensor_buft_override_equal(const std::vector<llama_model_tensor_buft_override>& a, const std::vector<llama_model_tensor_buft_override>& b) {
-    if (a.size() != b.size()) {
-        return false;
-    }
-    for (size_t i = 0; i < a.size(); i++) {
-        if (!tensor_buft_override_equal(a[i], b[i])) {
-            return false;
-        }
-    }
-    return true;
-}
-
-static bool vec_vec_tensor_buft_override_equal(const std::vector<std::vector<llama_model_tensor_buft_override>>& a, const std::vector<std::vector<llama_model_tensor_buft_override>>& b) {
-    if (a.size() != b.size()) {
-        return false;
-    }
-    for (size_t i = 0; i < a.size(); i++) {
-        if (!vec_tensor_buft_override_equal(a[i], b[i])) {
-            return false;
-        }
-    }
-    return true;
-}
-
 template <class T> static std::string join(const std::vector<T> & values, const std::string & delim) {
    std::ostringstream str;
    for (size_t i = 0; i < values.size(); i++) {
@@ -200,7 +160,6 @@ struct cmd_params {
    std::vector<int>                 n_prompt;
    std::vector<int>                 n_gen;
    std::vector<std::pair<int, int>> n_pg;
-    std::vector<int>                 n_depth;
    std::vector<int>                 n_batch;
    std::vector<int>                 n_ubatch;
    std::vector<ggml_type>           type_k;
@@ -216,7 +175,6 @@ struct cmd_params {
    std::vector<bool>                no_kv_offload;
    std::vector<bool>                flash_attn;
    std::vector<std::vector<float>>  tensor_split;
-    std::vector<std::vector<llama_model_tensor_buft_override>> tensor_buft_overrides;
    std::vector<bool>                use_mmap;
    std::vector<bool>                embeddings;
    ggml_numa_strategy               numa;
@@ -234,7 +192,6 @@ static const cmd_params cmd_params_defaults = {
    /* n_prompt             */ { 512 },
    /* n_gen                */ { 128 },
    /* n_pg                 */ {},
-    /* n_depth              */ { 0 },
    /* n_batch              */ { 2048 },
    /* n_ubatch             */ { 512 },
    /* type_k               */ { GGML_TYPE_F16 },
@@ -250,7 +207,6 @@ static const cmd_params cmd_params_defaults = {
    /* no_kv_offload        */ { false },
    /* flash_attn           */ { false },
    /* tensor_split         */ { std::vector<float>(llama_max_devices(), 0.0f) },
-    /* tensor_buft_overrides*/ { std::vector<llama_model_tensor_buft_override>{{nullptr,nullptr}} },
    /* use_mmap             */ { true },
    /* embeddings           */ { false },
    /* numa                 */ GGML_NUMA_STRATEGY_DISABLED,
@@ -274,7 +230,6 @@ static void print_usage(int /* argc */, char ** argv) {
    printf("  -n, --n-gen <n>                           (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
    printf("  -pg <pp,tg>                               (default: %s)\n",
           join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
-    printf("  -d, --n-depth <n>                         (default: %s)\n", join(cmd_params_defaults.n_depth, ",").c_str());
    printf("  -b, --batch-size <n>                      (default: %s)\n",
           join(cmd_params_defaults.n_batch, ",").c_str());
    printf("  -ub, --ubatch-size <n>                    (default: %s)\n",
@@ -310,7 +265,6 @@ static void print_usage(int /* argc */, char ** argv) {
    printf("  -embd, --embeddings <0|1>                 (default: %s)\n",
           join(cmd_params_defaults.embeddings, ",").c_str());
    printf("  -ts, --tensor-split <ts0/ts1/..>          (default: 0)\n");
-    printf("  -ot --override-tensors <tensor name pattern>=<buffer type>;... (default: disabled)\n");
    printf("  -r, --repetitions <n>                     (default: %d)\n", cmd_params_defaults.reps);
    printf("  --prio <0|1|2|3>                          (default: %d)\n", cmd_params_defaults.prio);
    printf("  --delay <0...N> (seconds)                 (default: %d)\n", cmd_params_defaults.delay);
@@ -412,13 +366,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                break;
            }
            params.n_pg.push_back({ std::stoi(p[0]), std::stoi(p[1]) });
-        } else if (arg == "-d" || arg == "--n-depth") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            auto p = string_split<int>(argv[i], split_delim);
-            params.n_depth.insert(params.n_depth.end(), p.begin(), p.end());
        } else if (arg == "-b" || arg == "--batch-size") {
            if (++i >= argc) {
                invalid_param = true;
@@ -610,87 +557,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                }
                params.tensor_split.push_back(tensor_split);
            }
-        } else if (arg == "-ot" || arg == "--override-tensor") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            auto value = argv[i];
-            /* static */ std::map<std::string, ggml_backend_buffer_type_t> buft_list;
-            if (buft_list.empty()) {
-                // enumerate all the devices and add their buffer types to the list
-                for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
-                    auto * dev = ggml_backend_dev_get(i);
-                    auto * buft = ggml_backend_dev_buffer_type(dev);
-                    if (buft) {
-                        buft_list[ggml_backend_buft_name(buft)] = buft;
-                    }
-                }
-            }
-            auto override_group_span_len = std::strcspn(value, ",");
-            bool last_group = false;
-            do {
-                if (override_group_span_len == 0) {
-                    // Adds an empty override-tensors for an empty span
-                    params.tensor_buft_overrides.push_back({{}});
-                    if (value[override_group_span_len] == '\0') {
-                        value = &value[override_group_span_len];
-                        last_group = true;
-                    } else {
-                        value = &value[override_group_span_len + 1];
-                        override_group_span_len = std::strcspn(value, ",");
-                    }
-                    continue;
-                }
-                // Stamps null terminators into the argv
-                // value for this option to avoid the
-                // memory leak present in the implementation
-                // over in arg.cpp. Acceptable because we
-                // only parse these args once in this program.
-                auto override_group = value;
-                if (value[override_group_span_len] == '\0') {
-                    value = &value[override_group_span_len];
-                    last_group = true;
-                } else {
-                    value[override_group_span_len] = '\0';
-                    value = &value[override_group_span_len + 1];
-                }
-                std::vector<llama_model_tensor_buft_override> group_tensor_buft_overrides{};
-                auto override_span_len = std::strcspn(override_group, ";");
-                while (override_span_len > 0) {
-                    auto override = override_group;
-                    if (override_group[override_span_len] != '\0') {
-                        override_group[override_span_len] = '\0';
-                        override_group = &override_group[override_span_len + 1];
-                    } else {
-                        override_group = &override_group[override_span_len];
-                    }
-                    auto tensor_name_span_len = std::strcspn(override, "=");
-                    if (tensor_name_span_len >= override_span_len) {
-                        invalid_param = true;
-                        break;
-                    }
-                    override[tensor_name_span_len] = '\0';
-                    auto tensor_name = override;
-                    auto buffer_type = &override[tensor_name_span_len + 1];
-                    if (buft_list.find(buffer_type) == buft_list.end()) {
-                        printf("Available buffer types:\n");
-                        for (const auto & it : buft_list) {
-                            printf("  %s\n", ggml_backend_buft_name(it.second));
-                        }
-                        invalid_param = true;
-                        break;
-                    }
-                    group_tensor_buft_overrides.push_back({tensor_name, buft_list.at(buffer_type)});
-                    override_span_len = std::strcspn(override_group, ";");
-                }
-                if (invalid_param) {
-                    break;
-                }
-                group_tensor_buft_overrides.push_back({nullptr,nullptr});
-                params.tensor_buft_overrides.push_back(group_tensor_buft_overrides);
-                override_group_span_len = std::strcspn(value, ",");
-            } while (!last_group);
        } else if (arg == "-r" || arg == "--repetitions") {
            if (++i >= argc) {
                invalid_param = true;
@@ -749,9 +615,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
    if (params.n_pg.empty()) {
        params.n_pg = cmd_params_defaults.n_pg;
    }
-    if (params.n_depth.empty()) {
-        params.n_depth = cmd_params_defaults.n_depth;
-    }
    if (params.n_batch.empty()) {
        params.n_batch = cmd_params_defaults.n_batch;
    }
@@ -785,9 +648,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
    if (params.tensor_split.empty()) {
        params.tensor_split = cmd_params_defaults.tensor_split;
    }
-    if (params.tensor_buft_overrides.empty()) {
-        params.tensor_buft_overrides = cmd_params_defaults.tensor_buft_overrides;
-    }
    if (params.use_mmap.empty()) {
        params.use_mmap = cmd_params_defaults.use_mmap;
    }
@@ -814,7 +674,6 @@ struct cmd_params_instance {
    std::string        model;
    int                n_prompt;
    int                n_gen;
-    int                n_depth;
    int                n_batch;
    int                n_ubatch;
    ggml_type          type_k;
@@ -830,7 +689,6 @@ struct cmd_params_instance {
    bool               no_kv_offload;
    bool               flash_attn;
    std::vector<float> tensor_split;
-    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
    bool               use_mmap;
    bool               embeddings;

@@ -875,26 +733,19 @@ struct cmd_params_instance {
        mparams.tensor_split = tensor_split.data();
        mparams.use_mmap     = use_mmap;

-        if (tensor_buft_overrides.empty()) {
-            mparams.tensor_buft_overrides = nullptr;
-        } else {
-            GGML_ASSERT(tensor_buft_overrides.back().pattern == nullptr && "Tensor buffer overrides not terminated with empty pattern");
-            mparams.tensor_buft_overrides = tensor_buft_overrides.data();
-        }
-
        return mparams;
    }

    bool equal_mparams(const cmd_params_instance & other) const {
        return model == other.model && n_gpu_layers == other.n_gpu_layers && rpc_servers_str == other.rpc_servers_str &&
               split_mode == other.split_mode && main_gpu == other.main_gpu && use_mmap == other.use_mmap &&
-               tensor_split == other.tensor_split && vec_tensor_buft_override_equal(tensor_buft_overrides, other.tensor_buft_overrides);
+               tensor_split == other.tensor_split;
    }

    llama_context_params to_llama_cparams() const {
        llama_context_params cparams = llama_context_default_params();

-        cparams.n_ctx       = n_prompt + n_gen + n_depth;
+        cparams.n_ctx       = n_prompt + n_gen;
        cparams.n_batch     = n_batch;
        cparams.n_ubatch    = n_ubatch;
        cparams.type_k      = type_k;
@@ -918,7 +769,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
    for (const auto & sm : params.split_mode)
    for (const auto & mg : params.main_gpu)
    for (const auto & ts : params.tensor_split)
-    for (const auto & ot : params.tensor_buft_overrides)
    for (const auto & mmp : params.use_mmap)
    for (const auto & embd : params.embeddings)
    for (const auto & nb : params.n_batch)
@@ -930,7 +780,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
    for (const auto & nt : params.n_threads)
    for (const auto & cm : params.cpu_mask)
    for (const auto & cs : params.cpu_strict)
-    for (const auto & nd : params.n_depth)
    for (const auto & pl : params.poll) {
        for (const auto & n_prompt : params.n_prompt) {
            if (n_prompt == 0) {
@@ -940,7 +789,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .model        = */ m,
                /* .n_prompt     = */ n_prompt,
                /* .n_gen        = */ 0,
-                /* .n_depth      = */ nd,
                /* .n_batch      = */ nb,
                /* .n_ubatch     = */ nub,
                /* .type_k       = */ tk,
@@ -956,7 +804,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .no_kv_offload= */ nkvo,
                /* .flash_attn   = */ fa,
                /* .tensor_split = */ ts,
-                /* .tensor_buft_overrides = */ ot,
                /* .use_mmap     = */ mmp,
                /* .embeddings   = */ embd,
            };
@@ -971,7 +818,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .model        = */ m,
                /* .n_prompt     = */ 0,
                /* .n_gen        = */ n_gen,
-                /* .n_depth      = */ nd,
                /* .n_batch      = */ nb,
                /* .n_ubatch     = */ nub,
                /* .type_k       = */ tk,
@@ -987,7 +833,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .no_kv_offload= */ nkvo,
                /* .flash_attn   = */ fa,
                /* .tensor_split = */ ts,
-                /* .tensor_buft_overrides = */ ot,
                /* .use_mmap     = */ mmp,
                /* .embeddings   = */ embd,
            };
@@ -1002,7 +847,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .model        = */ m,
                /* .n_prompt     = */ n_pg.first,
                /* .n_gen        = */ n_pg.second,
-                /* .n_depth      = */ nd,
                /* .n_batch      = */ nb,
                /* .n_ubatch     = */ nub,
                /* .type_k       = */ tk,
@@ -1018,7 +862,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .no_kv_offload= */ nkvo,
                /* .flash_attn   = */ fa,
                /* .tensor_split = */ ts,
-                /* .tensor_buft_overrides = */ ot,
                /* .use_mmap     = */ mmp,
                /* .embeddings   = */ embd,
            };
@@ -1053,12 +896,10 @@ struct test {
    bool                     no_kv_offload;
    bool                     flash_attn;
    std::vector<float>       tensor_split;
-    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
    bool                     use_mmap;
    bool                     embeddings;
    int                      n_prompt;
    int                      n_gen;
-    int                      n_depth;
    std::string              test_time;
    std::vector<uint64_t>    samples_ns;

@@ -1086,12 +927,10 @@ struct test {
        no_kv_offload  = inst.no_kv_offload;
        flash_attn     = inst.flash_attn;
        tensor_split   = inst.tensor_split;
-        tensor_buft_overrides = inst.tensor_buft_overrides;
        use_mmap       = inst.use_mmap;
        embeddings     = inst.embeddings;
        n_prompt       = inst.n_prompt;
        n_gen          = inst.n_gen;
-        n_depth        = inst.n_depth;
        // RFC 3339 date-time format
        time_t t       = time(NULL);
        std::strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&t));
@@ -1133,9 +972,9 @@ struct test {
            "build_commit", "build_number", "cpu_info",       "gpu_info",   "backends",     "model_filename",
            "model_type",   "model_size",   "model_n_params", "n_batch",    "n_ubatch",     "n_threads",
            "cpu_mask",     "cpu_strict",   "poll",           "type_k",     "type_v",       "n_gpu_layers",
-            "split_mode",   "main_gpu",     "no_kv_offload",  "flash_attn", "tensor_split", "tensor_buft_overrides",
-            "use_mmap",     "embeddings",   "n_prompt",       "n_gen",      "n_depth",      "test_time",
-            "avg_ns",       "stddev_ns",    "avg_ts",         "stddev_ts",
+            "split_mode",   "main_gpu",     "no_kv_offload",  "flash_attn", "tensor_split", "use_mmap",
+            "embeddings",   "n_prompt",     "n_gen",          "test_time",  "avg_ns",       "stddev_ns",
+            "avg_ts",       "stddev_ts",
        };
        return fields;
    }
@@ -1145,8 +984,8 @@ struct test {
    static field_type get_field_type(const std::string & field) {
        if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" ||
            field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" ||
-            field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" ||
-            field == "avg_ns" || field == "stddev_ns") {
+            field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "avg_ns" ||
+            field == "stddev_ns") {
            return INT;
        }
        if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" ||
@@ -1161,7 +1000,6 @@ struct test {

    std::vector<std::string> get_values() const {
        std::string tensor_split_str;
-        std::string tensor_buft_overrides_str;
        int         max_nonzero = 0;
        for (size_t i = 0; i < llama_max_devices(); i++) {
            if (tensor_split[i] > 0) {
@@ -1176,26 +1014,6 @@ struct test {
                tensor_split_str += "/";
            }
        }
-        if (tensor_buft_overrides.size() == 1) {
-            // Last element of tensor_buft_overrides is always a null pattern
-            // so if it is only one element long, it must be a null pattern.
-            GGML_ASSERT(tensor_buft_overrides[0].pattern == nullptr);
-            tensor_buft_overrides_str += "none";
-        } else {
-            for (size_t i = 0; i < tensor_buft_overrides.size()-1; i++) {
-                // Last element of tensor_buft_overrides is always a null pattern
-                if (tensor_buft_overrides[i].pattern == nullptr) {
-                    tensor_buft_overrides_str += "none";
-                } else {
-                    tensor_buft_overrides_str += tensor_buft_overrides[i].pattern;
-                    tensor_buft_overrides_str += "=";
-                    tensor_buft_overrides_str += ggml_backend_buft_name(tensor_buft_overrides[i].buft);
-                }
-                if (i + 2 < tensor_buft_overrides.size()) {
-                    tensor_buft_overrides_str += ";";
-                }
-            }
-        }
        std::vector<std::string> values = { build_commit,
                                            std::to_string(build_number),
                                            cpu_info,
@@ -1219,12 +1037,10 @@ struct test {
                                            std::to_string(no_kv_offload),
                                            std::to_string(flash_attn),
                                            tensor_split_str,
-                                            tensor_buft_overrides_str,
                                            std::to_string(use_mmap),
                                            std::to_string(embeddings),
                                            std::to_string(n_prompt),
                                            std::to_string(n_gen),
-                                            std::to_string(n_depth),
                                            test_time,
                                            std::to_string(avg_ns()),
                                            std::to_string(stdev_ns()),
@@ -1402,7 +1218,7 @@ struct markdown_printer : public printer {
            return 4;
        }
        if (field == "test") {
-            return 15;
+            return 13;
        }

        int width = std::max((int) field.length(), 10);
@@ -1438,9 +1254,6 @@ struct markdown_printer : public printer {
        if (field == "tensor_split") {
            return "ts";
        }
-        if (field == "tensor_buft_overrides") {
-            return "ot";
-        }
        return field;
    }

@@ -1494,9 +1307,6 @@ struct markdown_printer : public printer {
        if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
            fields.emplace_back("tensor_split");
        }
-        if (params.tensor_buft_overrides.size() > 1 || !vec_vec_tensor_buft_override_equal(params.tensor_buft_overrides, cmd_params_defaults.tensor_buft_overrides)) {
-            fields.emplace_back("tensor_buft_overrides");
-        }
        if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) {
            fields.emplace_back("use_mmap");
        }
@@ -1552,10 +1362,6 @@ struct markdown_printer : public printer {
                } else {
                    snprintf(buf, sizeof(buf), "pp%d+tg%d", t.n_prompt, t.n_gen);
                }
-                if (t.n_depth > 0) {
-                    int len = strlen(buf);
-                    snprintf(buf + len, sizeof(buf) - len, " @ d%d", t.n_depth);
-                }
                value = buf;
            } else if (field == "t/s") {
                snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts());
@@ -1814,14 +1620,6 @@ int main(int argc, char ** argv) {
        for (int i = 0; i < params.reps; i++) {
            llama_kv_self_clear(ctx);

-            if (t.n_depth > 0) {
-                if (params.progress) {
-                    fprintf(stderr, "llama-bench: benchmark %d/%zu: depth run %d/%d\n", params_idx, params_count,
-                            i + 1, params.reps);
-                }
-                test_prompt(ctx, t.n_depth, t.n_batch, t.n_threads);
-            }
-
            uint64_t t_start = get_time_ns();

            if (t.n_prompt > 0) {
--- a/examples/llava/CMakeLists.txt
+++ b/examples/llava/CMakeLists.txt
@@ -64,7 +64,13 @@ endif()
 add_executable(llama-llava-cli    deprecation-warning.cpp)
 add_executable(llama-gemma3-cli   deprecation-warning.cpp)
 add_executable(llama-minicpmv-cli deprecation-warning.cpp)
-add_executable(llama-qwen2vl-cli  deprecation-warning.cpp)
+
+set(TARGET llama-qwen2vl-cli) # built from its own source file below rather than from deprecation-warning.cpp
+add_executable(${TARGET} qwen2vl-cli.cpp)
+set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-qwen2vl-cli) # same string as the target name set above
+install(TARGETS ${TARGET} RUNTIME) # install the executable
+target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17) # require at least C++17 for this target

 set(TARGET llama-mtmd-cli)
 add_executable(${TARGET} mtmd-cli.cpp)
--- a/examples/llava/README-quantize.md
+++ b/examples/llava/README-quantize.md
--- a/examples/llava/README.md
+++ b/examples/llava/README.md
@@ -34,19 +34,6 @@ llama-mtmd-cli -hf ggml-org/SmolVLM2-500M-Video-Instruct-GGUF

 # Pixtral 12B
 llama-mtmd-cli -hf ggml-org/pixtral-12b-GGUF
-
-# Qwen 2 VL
-llama-mtmd-cli -hf ggml-org/Qwen2-VL-2B-Instruct-GGUF
-llama-mtmd-cli -hf ggml-org/Qwen2-VL-7B-Instruct-GGUF
-
-# Qwen 2.5 VL
-llama-mtmd-cli -hf ggml-org/Qwen2.5-VL-3B-Instruct-GGUF
-llama-mtmd-cli -hf ggml-org/Qwen2.5-VL-7B-Instruct-GGUF
-llama-mtmd-cli -hf ggml-org/Qwen2.5-VL-32B-Instruct-GGUF
-llama-mtmd-cli -hf ggml-org/Qwen2.5-VL-72B-Instruct-GGUF
-
-# Mistral Small 3.1 24B (IQ2_M quantization)
-llama-mtmd-cli -hf ggml-org/Mistral-Small-3.1-24B-Instruct-2503-GGUF --chat-template mistral-v7
 ```

 ## How it works and what is `mmproj`?
@@ -70,17 +57,7 @@ Built upon `clip.cpp` (similar to `llava.cpp`), `libmtmd` offers several advanta

 ## How to obtain `mmproj`

-Multimodal projector (`mmproj`) files are specific to each model architecture.
-
-For the following models, you can use `convert_hf_to_gguf.py`with `--mmproj` flag to get the `mmproj` file:
- [Gemma 3](https://huggingface.co/collections/google/gemma-3-release-67c6c6f89c4f76621268bb6d) - Note: 1B variant does not have vision support
- SmolVLM (from [HuggingFaceTB](https://huggingface.co/HuggingFaceTB))
- SmolVLM2 (from [HuggingFaceTB](https://huggingface.co/HuggingFaceTB))
- [Pixtral 12B](https://huggingface.co/mistral-community/pixtral-12b) - only works with `transformers`-compatible checkpoint
- Qwen 2 VL and Qwen 2.5 VL (from [Qwen](https://huggingface.co/Qwen))
- [Mistral Small 3.1 24B](https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503)
-
-For older models, please refer to the relevant guide for instructions on how to obtain or create them:
+Multimodal projector (`mmproj`) files are specific to each model architecture. Please refer to the relevant guide for instructions on how to obtain or create them:

 - [LLaVA](../../docs/multimodal/llava.md)
 - [MobileVLM](../../docs/multimodal/MobileVLM.md)
@@ -90,3 +67,9 @@ For older models, please refer to the relevant guide for instructions on how to
 - [MiniCPM-o 2.6](../../docs/multimodal/minicpmo2.6.md)
 - [IBM Granite Vision](../../docs/multimodal/granitevision.md)
 - [Google Gemma 3](../../docs/multimodal/gemma3.md)
+
+For the following models, you can use `convert_hf_to_gguf.py` with the `--mmproj` flag to get the `mmproj` file:
+- [Gemma 3](https://huggingface.co/collections/google/gemma-3-release-67c6c6f89c4f76621268bb6d) - Note: 1B variant does not have vision support
+- SmolVLM (from [HuggingFaceTB](https://huggingface.co/HuggingFaceTB))
+- SmolVLM2 (from [HuggingFaceTB](https://huggingface.co/HuggingFaceTB))
+- [Pixtral 12B](https://huggingface.co/mistral-community/pixtral-12b) - only works with `transformers`-compatible checkpoint
--- a/examples/llava/android/adb_run.sh
+++ b/examples/llava/android/adb_run.sh
--- a/examples/llava/android/build_64.sh
+++ b/examples/llava/android/build_64.sh
--- a/examples/llava/clip-impl.h
+++ b/examples/llava/clip-impl.h
@@ -2,6 +2,8 @@
 #include "gguf.h"
 #include "clip.h"

+#include "clip.h"
+
 #include <climits>
 #include <cstdarg>
 #include <string>
@@ -31,7 +33,6 @@
 #define KEY_FEATURE_LAYER       "clip.vision.feature_layer"
 #define KEY_PROJ_SCALE_FACTOR   "clip.vision.projector.scale_factor"
 #define KEY_PROJ_TYPE           "clip.projector_type"
-#define KEY_SPATIAL_MERGE_SIZE  "clip.vision.spatial_merge_size"

 #define KEY_USE_GLU_MLP         "clip.use_glu_mlp"  // for qwen2.5vl
 #define KEY_USE_RMS_NORM        "clip.use_rms_norm" // for qwen2.5vl
@@ -69,14 +70,10 @@
 #define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
 #define TN_MVLM_PROJ_PEG   "mm.model.peg.%d.%s"
 #define TN_IMAGE_NEWLINE   "model.image_newline"
-#define TN_MM_INP_NORM     "mm.input_norm.weight"
 #define TN_MM_INP_PROJ     "mm.input_projection.weight" // gemma3
 #define TN_MM_SOFT_EMB_N   "mm.soft_emb_norm.weight"    // gemma3
 #define TN_MM_PROJECTOR    "mm.model.fc.weight"         // idefics3
-#define TN_MM_PATCH_MERGER "mm.patch_merger.weight"     // mistral small 3.1
 #define TN_TOK_IMG_BREAK   "v.token_embd.img_break"     // pixtral
-#define TN_TOK_GLM_BOI     "adapter.boi"                // glm-edge (these embeddings are not in text model)
-#define TN_TOK_GLM_EOI     "adapter.eoi"                // glm-edge (these embeddings are not in text model)

 // mimicpmv
 #define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
@@ -233,15 +230,6 @@ struct clip_image_u8_batch {

 struct clip_image_f32_batch {
    std::vector<clip_image_f32_ptr> entries;
-
-    clip_image_f32_batch clone() const {
-        clip_image_f32_batch new_batch;
-        new_batch.entries.reserve(entries.size());
-        for (const auto & entry : entries) {
-            new_batch.entries.emplace_back(new clip_image_f32(*entry));
-        }
-        return new_batch;
-    }
 };

 //
--- a/examples/llava/clip-quantize-cli.cpp
+++ b/examples/llava/clip-quantize-cli.cpp
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
--- a/examples/llava/clip.h
+++ b/examples/llava/clip.h
@@ -47,7 +47,7 @@ CLIP_API struct clip_ctx * clip_init(const char * fname, struct clip_context_par
 CLIP_API void clip_free(struct clip_ctx * ctx);

 CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
-CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h);
+CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w);

 CLIP_API int32_t clip_get_image_size (const struct clip_ctx * ctx);
 CLIP_API int32_t clip_get_patch_size (const struct clip_ctx * ctx);
@@ -59,29 +59,18 @@ CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
 CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
 CLIP_API size_t get_clip_image_grid_size(const struct clip_ctx * ctx);

-GGML_DEPRECATED(CLIP_API int clip_n_patches(const struct clip_ctx * ctx),
-    "use clip_n_output_tokens instead");
-GGML_DEPRECATED(CLIP_API int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img),
-    "use clip_n_output_tokens instead");
-
-CLIP_API int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img);
-
-// for M-RoPE, this will be the number of token positions in X and Y directions
-// for other models, X will be the total number of tokens and Y will be 1
-CLIP_API int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img);
-CLIP_API int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img);
-
-// this should be equal to the embedding dimension of the text model
-CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
+CLIP_API int clip_n_patches        (const struct clip_ctx * ctx);
+CLIP_API int clip_n_patches_by_img (const struct clip_ctx * ctx, struct clip_image_f32 * img);
+CLIP_API int clip_n_mmproj_embd    (const struct clip_ctx * ctx);

 CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
 CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
 CLIP_API struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip);

-CLIP_API struct clip_image_size      * clip_image_size_init(void);
-CLIP_API struct clip_image_u8        * clip_image_u8_init (void);
-CLIP_API struct clip_image_f32       * clip_image_f32_init(void);
-CLIP_API struct clip_image_f32_batch * clip_image_f32_batch_init(void); // only used by libllava
+CLIP_API struct clip_image_size      * clip_image_size_init();
+CLIP_API struct clip_image_u8        * clip_image_u8_init ();
+CLIP_API struct clip_image_f32       * clip_image_f32_init();
+CLIP_API struct clip_image_f32_batch * clip_image_f32_batch_init(); // only used by libllava

 // nx, ny are the output image dimensions
 CLIP_API unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny);
--- a/examples/llava/convert_image_encoder_to_gguf.py
+++ b/examples/llava/convert_image_encoder_to_gguf.py
--- a/examples/llava/deprecation-warning.cpp
+++ b/examples/llava/deprecation-warning.cpp
--- a/examples/llava/glmedge-convert-image-encoder-to-gguf.py
+++ b/examples/llava/glmedge-convert-image-encoder-to-gguf.py
--- a/examples/llava/glmedge-surgery.py
+++ b/examples/llava/glmedge-surgery.py
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -2,7 +2,6 @@
 #include "llava.h"

 #include "llama.h"
-#include "ggml-cpp.h"

 #include <algorithm>
 #include <cerrno>
@@ -113,7 +112,7 @@ static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<
 }

 // Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out)
-static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out, clip_image_f32 * img_input) {
+static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) {
    struct {
        struct ggml_context * ctx;
    } model;
@@ -176,7 +175,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>

    model.ctx = ggml_init(params);

-    struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_output_tokens(ctx_clip, img_input), num_images - 1); // example: 4096 x 576 x 4
+    struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4
    // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
    // fill it with the image embeddings, ignoring the base
    for (size_t i = 1; i < num_images; i++) {
@@ -210,16 +209,13 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
    struct ggml_tensor *flatten = ggml_view_2d(model.ctx, permuted_cont, clip_n_mmproj_embd(ctx_clip), num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side,  size_ele * clip_n_mmproj_embd(ctx_clip), 0);
    // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false);
    ggml_build_forward_expand(gf, flatten);
-
-    ggml_backend_ptr backend { ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr) };
-    ggml_backend_graph_compute(backend.get(), gf);
-
+    ggml_graph_compute_with_ctx(model.ctx, gf, 1);
    struct ggml_tensor* result = ggml_graph_node(gf, -1);

    memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
    // append without newline tokens (default behavior in llava_arch when not using unpad ):
-    memcpy(image_embd_out + clip_n_output_tokens(ctx_clip, img_input) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
-    *n_img_pos_out = static_cast<int>(result->ne[1]+clip_n_output_tokens(ctx_clip, img_input));
+    memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
+    *n_img_pos_out = static_cast<int>(result->ne[1]+clip_n_patches(ctx_clip));

    // Debug: Test single segments
    // Current findings: sending base image, sending a segment embedding all works similar to python
@@ -317,7 +313,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
                image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip),
                image_embd_v[i],
                clip_embd_nbytes_by_img(ctx_clip, nx, ny));
-            n_img_pos_out += clip_n_output_tokens(ctx_clip, img_res);
+            n_img_pos_out += clip_n_patches_by_img(ctx_clip, img_res);
        }
        *n_img_pos = n_img_pos_out;
        for (size_t i = 0; i < image_embd_v.size(); i++) {
@@ -346,8 +342,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
    }
    else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
        // flat / default llava-1.5 type embedding
+        *n_img_pos = clip_n_patches(ctx_clip);
        clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0);
-        *n_img_pos = clip_n_output_tokens(ctx_clip, img_res);
        bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096
        if (!encoded) {
            LOG_ERR("Unable to encode image\n");
@@ -385,8 +381,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
        struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size);

        int n_img_pos_out;
-        clip_image_f32 * img_input = clip_image_f32_get_img(img_res_v.get(), 0);
-        clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out, img_input);
+        clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out);
        *n_img_pos = n_img_pos_out;

        for (size_t i = 0; i < image_embd_v.size(); i++) {
--- a/examples/llava/llava.h
+++ b/examples/llava/llava.h
--- a/examples/llava/llava_surgery.py
+++ b/examples/llava/llava_surgery.py
--- a/examples/llava/llava_surgery_v2.py
+++ b/examples/llava/llava_surgery_v2.py
--- a/examples/llava/minicpmv-convert-image-encoder-to-gguf.py
+++ b/examples/llava/minicpmv-convert-image-encoder-to-gguf.py
--- a/examples/llava/minicpmv-surgery.py
+++ b/examples/llava/minicpmv-surgery.py
--- a/examples/llava/mtmd-cli.cpp
+++ b/examples/llava/mtmd-cli.cpp
@@ -63,7 +63,7 @@ static void sigint_handler(int signo) {
 #endif

 struct mtmd_cli_context {
-    mtmd::context_ptr ctx_vision;
+    mtmd_context_ptr ctx_vision;
    common_init_result llama_init;

    llama_model       * model;
@@ -72,8 +72,6 @@ struct mtmd_cli_context {
    llama_batch         batch;
    int                 n_batch;

-    mtmd::bitmaps bitmaps;
-
    // note: we know that gemma3 template is "linear", meaning each turn is completely separated to another
    // so here we don't need to keep track of chat history
    common_chat_templates_ptr tmpls;
@@ -92,15 +90,10 @@ struct mtmd_cli_context {
        batch = llama_batch_init(params.n_batch, 0, 1);
        n_batch = params.n_batch;

-        if (!model || !lctx) {
-            exit(1);
-        }
-
        if (!llama_model_chat_template(model, nullptr) && params.chat_template.empty()) {
            LOG_ERR("Model does not have chat template.\n");
            LOG_ERR("  For old llava models, you may need to use '--chat-template vicuna'\n");
            LOG_ERR("  For MobileVLM models, use '--chat-template deepseek'\n");
-            LOG_ERR("  For Mistral Small 3.1, use '--chat-template mistral-v7'\n");
            exit(1);
        }

@@ -119,12 +112,12 @@ struct mtmd_cli_context {

    void init_vision_context(common_params & params) {
        const char * clip_path = params.mmproj.path.c_str();
-        mtmd_context_params mparams = mtmd_context_params_default();
-        mparams.use_gpu = params.mmproj_use_gpu;
-        mparams.print_timings = true;
-        mparams.n_threads = params.cpuparams.n_threads;
-        mparams.verbosity = params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO;
-        ctx_vision.reset(mtmd_init_from_file(clip_path, model, mparams));
+        ctx_vision.reset(mtmd_init_from_file(clip_path, model, mtmd_context_params{
+            /* use_gpu */   params.mmproj_use_gpu,
+            /* timings */   true,
+            /* n_threads */ params.cpuparams.n_threads,
+            /* verbosity */ params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO,
+        }));
        if (!ctx_vision.get()) {
            LOG_ERR("Failed to load vision model from %s\n", clip_path);
            exit(1);
@@ -141,14 +134,38 @@ struct mtmd_cli_context {
            antiprompt_tokens.begin()
        );
    }
+};

-    bool load_image(const std::string & fname) {
-        mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(fname.c_str()));
-        if (!bmp.ptr) {
-            return false;
+struct decode_embd_batch { // bundles a llama_batch of embeddings with the arrays its pointers refer to
+    std::vector<llama_pos>      pos;      // one position per embedding
+    std::vector<int32_t>        n_seq_id; // per-entry sequence-id count (set to 1 in the constructor)
+    std::vector<llama_seq_id>   seq_id_0; // one-element array holding the single sequence id
+    std::vector<llama_seq_id *> seq_ids;  // per-entry pointer to seq_id_0, plus one extra trailing slot
+    std::vector<int8_t>         logits;   // per-entry logits flag
+    llama_batch batch; // holds raw pointers into the vectors above (see the .data() calls below)
+    decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+        pos     .resize(n_tokens);
+        n_seq_id.resize(n_tokens);
+        seq_ids .resize(n_tokens + 1);
+        logits  .resize(n_tokens);
+        seq_id_0.resize(1);
+        seq_id_0[0] = seq_id;
+        seq_ids [n_tokens] = nullptr; // the extra (n_tokens + 1)-th slot is set to nullptr
+        batch = {
+            /*n_tokens       =*/ n_tokens,
+            /*tokens         =*/ nullptr,
+            /*embd           =*/ embd,
+            /*pos            =*/ pos.data(),
+            /*n_seq_id       =*/ n_seq_id.data(),
+            /*seq_id         =*/ seq_ids.data(),
+            /*logits         =*/ logits.data(),
+        };
+        for (int i = 0; i < n_tokens; i++) { // writes go through the batch pointers, i.e. into the member vectors
+            batch.pos     [i] = pos_0 + i; // consecutive positions starting at pos_0
+            batch.n_seq_id[i] = 1;
+            batch.seq_id  [i] = seq_id_0.data(); // every entry shares the same one-element id array
+            batch.logits  [i] = false; // logits flag cleared for every entry
+        }
-        bitmaps.entries.push_back(std::move(bmp));
-        return true;
    }
 };

@@ -156,7 +173,7 @@ static int generate_response(mtmd_cli_context & ctx, common_sampler * smpl, int
    llama_tokens generated_tokens;
    for (int i = 0; i < n_predict; i++) {
        if (i > n_predict || !g_is_generating || g_is_interrupted) {
-            LOG("\n");
+            printf("\n");
            break;
        }

@@ -165,15 +182,15 @@ static int generate_response(mtmd_cli_context & ctx, common_sampler * smpl, int
        common_sampler_accept(smpl, token_id, true);

        if (llama_vocab_is_eog(ctx.vocab, token_id) || ctx.check_antiprompt(generated_tokens)) {
-            LOG("\n");
+            printf("\n");
            break; // end of generation
        }

-        LOG("%s", common_token_to_piece(ctx.lctx, token_id).c_str());
+        printf("%s", common_token_to_piece(ctx.lctx, token_id).c_str());
        fflush(stdout);

        if (g_is_interrupted) {
-            LOG("\n");
+            printf("\n");
            break;
        }

@@ -188,7 +205,9 @@ static int generate_response(mtmd_cli_context & ctx, common_sampler * smpl, int
    return 0;
 }

-static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, bool add_bos = false) {
+static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, std::vector<std::string> & images_fname, bool add_bos = false) {
+    std::vector<mtmd_bitmap> bitmaps;
+
    common_chat_templates_inputs tmpl_inputs;
    tmpl_inputs.messages = {msg};
    tmpl_inputs.add_generation_prompt = true;
@@ -196,43 +215,35 @@ static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, bool add_
    auto formatted_chat = common_chat_templates_apply(ctx.tmpls.get(), tmpl_inputs);
    LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.prompt.c_str());

+    for (auto & fname : images_fname) {
+        mtmd_bitmap bitmap;
+        if (mtmd_helper_bitmap_init_from_file(fname.c_str(), bitmap)) {
+            LOG_ERR("Unable to load image %s\n", fname.c_str());
+            return 2; // image not found
+        }
+        bitmaps.push_back(std::move(bitmap));
+    }
+
    mtmd_input_text text;
-    text.text          = formatted_chat.prompt.c_str();
+    text.text          = formatted_chat.prompt;
    text.add_special   = add_bos;
    text.parse_special = true;
+    mtmd_input_chunks chunks;

    if (g_is_interrupted) return 0;

-    mtmd::input_chunks chunks(mtmd_input_chunks_init());
-    auto bitmaps_c_ptr = ctx.bitmaps.c_ptr();
-    int32_t res = mtmd_tokenize(ctx.ctx_vision.get(),
-                        chunks.ptr.get(), // output
-                        &text, // text
-                        bitmaps_c_ptr.data(),
-                        bitmaps_c_ptr.size());
+    int32_t res = mtmd_tokenize(ctx.ctx_vision.get(), chunks, text, bitmaps);
    if (res != 0) {
        LOG_ERR("Unable to tokenize prompt, res = %d\n", res);
        return 1;
    }

-    ctx.bitmaps.entries.clear();
-
-    llama_pos new_n_past;
-    if (mtmd_helper_eval_chunks(ctx.ctx_vision.get(),
-                ctx.lctx, // lctx
-                chunks.ptr.get(), // chunks
-                ctx.n_past, // n_past
-                0, // seq_id
-                ctx.n_batch, // n_batch
-                true, // logits_last
-                &new_n_past)) {
+    if (mtmd_helper_eval(ctx.ctx_vision.get(), ctx.lctx, chunks, ctx.n_past, 0, ctx.n_batch)) {
        LOG_ERR("Unable to eval prompt\n");
        return 1;
    }

-    ctx.n_past = new_n_past;
-
-    LOG("\n");
+    ctx.n_past += mtmd_helper_get_n_tokens(chunks);

    return 0;
 }
@@ -256,14 +267,14 @@ int main(int argc, char ** argv) {
    }

    mtmd_cli_context ctx(params);
-    LOG("%s: loading model: %s\n", __func__, params.model.path.c_str());
+    printf("%s: %s\n", __func__, params.model.path.c_str());

    bool is_single_turn = !params.prompt.empty() && !params.image.empty();

    struct common_sampler * smpl = common_sampler_init(ctx.model, params.sampling);
    int n_predict = params.n_predict < 0 ? INT_MAX : params.n_predict;

-    // Ctrl+C handling
+    // ctrl+C handling
    {
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
        struct sigaction sigint_action;
@@ -289,12 +300,7 @@ int main(int argc, char ** argv) {
        common_chat_msg msg;
        msg.role = "user";
        msg.content = params.prompt;
-        for (const auto & image : params.image) {
-            if (!ctx.load_image(image)) {
-                return 1; // error is already printed by libmtmd
-            }
-        }
-        if (eval_message(ctx, msg, true)) {
+        if (eval_message(ctx, msg, params.image, true)) {
            return 1;
        }
        if (!g_is_interrupted && generate_response(ctx, smpl, n_predict)) {
@@ -309,6 +315,7 @@ int main(int argc, char ** argv) {
        LOG("\n");

        bool is_first_msg = true;
+        std::vector<std::string> images_fname;
        std::string content;

        while (!g_is_interrupted) {
@@ -333,17 +340,10 @@ int main(int argc, char ** argv) {
                continue;
            }
            g_is_generating = true;
-            if (line == "/image" || line.find("/image ") == 0) {
-                if (line.size() < 8) {
-                    LOG_ERR("ERR: Missing image filename\n");
-                    continue;
-                }
+            if (line.find("/image") == 0) {
                std::string image = line.substr(7);
-                if (ctx.load_image(image)) {
-                    LOG("Image %s loaded\n", image.c_str());
-                    content += "<__image__>";
-                }
-                // else, error is already printed by libmtmd
+                images_fname.push_back(string_strip(image));
+                content += "<__image__>";
                continue;
            } else {
                content += line;
@@ -351,20 +351,26 @@ int main(int argc, char ** argv) {
            common_chat_msg msg;
            msg.role = "user";
            msg.content = content;
-            int ret = eval_message(ctx, msg, is_first_msg);
+            int ret = eval_message(ctx, msg, images_fname, is_first_msg);
+            if (g_is_interrupted) break;
+            if (ret == 2) {
+                // non-fatal error
+                images_fname.clear();
+                content.clear();
+                continue;
+            }
            if (ret) {
                return 1;
            }
-            if (g_is_interrupted) break;
            if (generate_response(ctx, smpl, n_predict)) {
                return 1;
            }
+            images_fname.clear();
            content.clear();
            is_first_msg = false;
        }
    }
    if (g_is_interrupted) LOG("\nInterrupted by user\n");
-    LOG("\n\n");
    llama_perf_context_print(ctx.lctx);
    return g_is_interrupted ? 130 : 0;
 }
--- a/examples/llava/mtmd.cpp
+++ b/examples/llava/mtmd.cpp
@@ -12,30 +12,6 @@
 #include <limits>
 #include <vector>

-// represents raw image data, layout is RGBRGBRGB...
-// length of data must be nx * ny * 3
-struct mtmd_bitmap {
-    uint32_t nx;
-    uint32_t ny;
-    std::vector<unsigned char> data;
-    std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking
-};
-
-struct mtmd_image_tokens_deleter {
-    void operator()(mtmd_image_tokens * val); // forward declaration
-};
-using mtmd_image_tokens_ptr = std::unique_ptr<mtmd_image_tokens, mtmd_image_tokens_deleter>;
-
-struct mtmd_input_chunk {
-    mtmd_input_chunk_type type;
-    std::vector<llama_token> tokens_text;
-    mtmd_image_tokens_ptr tokens_image;
-};
-
-struct mtmd_input_chunks {
-    std::vector<mtmd_input_chunk> entries;
-};
-
 // slice template, used by some llava-uhd models to correctly place the special tokens around image embeddings
 // models not having it (llava-1.6) will process embeddings without any special tokens in-between
 enum mtmd_slice_tmpl {
@@ -45,16 +21,6 @@ enum mtmd_slice_tmpl {
    // TODO @ngxson : add support for idefics (SmolVLM)
 };

-mtmd_context_params mtmd_context_params_default() {
-    mtmd_context_params params;
-    params.use_gpu = true;
-    params.print_timings = true;
-    params.n_threads = 4;
-    params.verbosity = GGML_LOG_LEVEL_INFO;
-    params.image_marker = MTMD_DEFAULT_IMAGE_MARKER;
-    return params;
-}
-
 struct mtmd_context {
    struct clip_ctx * ctx_clip;
    const struct llama_model * text_model;
@@ -74,14 +40,11 @@ struct mtmd_context {
    llama_token tok_sli_img_end   = LLAMA_TOKEN_NULL; // single slice
    llama_token tok_row_end       = LLAMA_TOKEN_NULL; // end of row

-    bool use_mrope = false; // for Qwen2VL, we need to use M-RoPE
-
    // TODO @ngxson : add timings

    mtmd_context(const char * mmproj_fname,
                   const llama_model * text_model,
                   const mtmd_context_params & ctx_params) :
-        text_model   (text_model),
        print_timings(ctx_params.print_timings),
        n_threads    (ctx_params.n_threads),
        image_marker (ctx_params.image_marker)
@@ -93,8 +56,9 @@ struct mtmd_context {
        if (!ctx_clip) {
            throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname));
        }
+        this->text_model = text_model;

-        use_mrope = clip_is_qwen2vl(ctx_clip);
+        GGML_ASSERT(!clip_is_qwen2vl(ctx_clip) && "Qwen2VL model is not supported yet, use llama-qwen2vl-cli instead");

        int minicpmv_version = clip_is_minicpmv(ctx_clip);
        if (minicpmv_version == 2) {
@@ -162,20 +126,9 @@ struct mtmd_image_tokens_data {
 struct mtmd_image_tokens {
    uint32_t nx; // number of tokens in x direction
    uint32_t ny; // number of tokens in y direction
-    bool use_mrope_pos = false; // use M-RoPE position counting (the whole image is 1 temporal position)
    uint32_t n_tokens() const { return nx * ny; }
    clip_image_f32_batch batch_f32; // preprocessed image patches
    std::string id; // optional user-defined ID, useful for KV cache tracking
-
-    mtmd_image_tokens clone() {
-        return mtmd_image_tokens{
-            nx,
-            ny,
-            use_mrope_pos,
-            batch_f32.clone(),
-            id
-        };
-    }
 };

 mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
@@ -216,13 +169,12 @@ static std::vector<llama_token> mtmd_tokenize_text_internal(
 }

 int32_t mtmd_tokenize(mtmd_context * ctx,
-            mtmd_input_chunks * output,
-            const mtmd_input_text * text,
-            const mtmd_bitmap ** bitmaps,
-            size_t n_bitmaps) {
+                        std::vector<mtmd_input_chunk> & output,
+                        const mtmd_input_text & text,
+                        const std::vector<mtmd_bitmap> & bitmaps) {
    auto vocab = llama_model_get_vocab(ctx->text_model);

-    std::string prompt_modified(text->text);
+    std::string prompt_modified(text.text);
    std::string marker_modified(ctx->image_marker);
    projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);

@@ -234,6 +186,11 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
        marker_modified = "<start_of_image>" + ctx->image_marker + "<end_of_image>";
        string_replace_all(prompt_modified, ctx->image_marker, marker_modified);

+    } else if (proj_type == PROJECTOR_TYPE_GLM_EDGE) {
+        // <|begin_of_image|> ... (image embeddings) ... <|end_of_image|>
+        marker_modified = "<|begin_of_image|>" + ctx->image_marker + "<|end_of_image|>";
+        string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
+
    } else if (proj_type == PROJECTOR_TYPE_IDEFICS3) {
        // https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
        marker_modified = "<fake_token_around_image><global-img>" + ctx->image_marker + "<fake_token_around_image>";
@@ -245,19 +202,14 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
        string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
    }

-    else if (proj_type == PROJECTOR_TYPE_QWEN2VL || proj_type == PROJECTOR_TYPE_QWEN25VL) {
-        // <|vision_start|> ... (image embeddings) ... <|vision_end|>
-        marker_modified = "<|vision_start|>" + ctx->image_marker + "<|vision_end|>";
-        string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
-
-    }
-
    // llava-1.5, llava-1.6, Yi-VL, Yi-34B, granite: don't need to add prefix and suffix
-    // for glm-edge, BOI and EOI token's embeddings are not present in the text model
+    // for glm-edge, we don't need to add because the tokens are already in the returned embeddings
+
+    // TODO @ngxson : glm-edge : remove BOI / EOI tokens embeddings, decode them as normal tokens

    std::vector<std::string> parts = string_split_str(prompt_modified, ctx->image_marker);
-    output->entries.clear();
-    output->entries.reserve(parts.size());
+    output.clear();
+    output.reserve(parts.size());

    size_t i_img = 0;

@@ -268,7 +220,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
            std::move(tokens),
            {},
        };
-        output->entries.emplace_back(std::move(chunk));
+        output.emplace_back(std::move(chunk));
    };

    // utility for splitting batch of multiple images into chunks of batch having single images
@@ -277,7 +229,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx,

        for (auto & entry : batch_f32.entries) {
            mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
-            image_tokens->nx = clip_n_output_tokens(ctx->ctx_clip, entry.get());
+            image_tokens->nx = clip_n_patches_by_img(ctx->ctx_clip, entry.get());
            image_tokens->ny = 1;
            image_tokens->batch_f32.entries.push_back(std::move(entry));
            image_tokens->id = id;
@@ -294,9 +246,9 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
    };

    for (const auto & part : parts) {
-        // printf("tokenizing part: %s\n", part.c_str());
+        //printf("tokenizing part: %s\n", part.c_str());
        bool add_bos = &parts.front() == &part;
-        auto tokens = mtmd_tokenize_text_internal(vocab, part, text->add_special && add_bos, text->parse_special);
+        auto tokens = mtmd_tokenize_text_internal(vocab, part, text.add_special && add_bos, text.parse_special);
        if (tokens.empty()) {
            continue;
        }
@@ -305,22 +257,22 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
            std::move(tokens),
            {},
        };
-        output->entries.emplace_back(std::move(chunk));
+        output.emplace_back(std::move(chunk));

        if (&parts.back() != &part) {
            // add image token to middle of 2 parts

-            if (i_img >= n_bitmaps) {
+            if (i_img >= bitmaps.size()) {
                LOG_ERR("%s: error: not enough images for %d parts\n", __func__, (int)parts.size());
                return 1;
            }

            // convert mtmd_bitmap to clip_image_u8
            clip_image_u8_ptr img_u8(clip_image_u8_init());
-            img_u8->nx = bitmaps[i_img]->nx;
-            img_u8->ny = bitmaps[i_img]->ny;
-            img_u8->buf.resize(bitmaps[i_img]->data.size());
-            std::memcpy(img_u8->buf.data(), bitmaps[i_img]->data.data(), img_u8->nx * img_u8->ny * 3);
+            img_u8->nx = bitmaps[i_img].nx;
+            img_u8->ny = bitmaps[i_img].ny;
+            img_u8->buf.resize(bitmaps[i_img].data.size());
+            std::memcpy(img_u8->buf.data(), bitmaps[i_img].data.data(), img_u8->nx * img_u8->ny * 3);
            clip_image_size img_u8_size{img_u8->nx, img_u8->ny};

            // preprocess image
@@ -333,12 +285,12 @@ int32_t mtmd_tokenize(mtmd_context * ctx,

            if (ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5 || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6) {
                // split batch into chunks of single images
-                auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmaps[i_img]->id);
+                auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmaps[i_img].id);
                GGML_ASSERT(chunks.size() > 0);

                // add overview image
                add_text_chunk({ctx->tok_ov_img_start});
-                output->entries.emplace_back(std::move(chunks.front()));
+                output.emplace_back(std::move(chunks.front()));
                chunks.erase(chunks.begin());
                add_text_chunk({ctx->tok_ov_img_end});

@@ -356,7 +308,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
                            if (ctx->tok_sli_img_start != LLAMA_TOKEN_NULL) {
                                add_text_chunk({ctx->tok_sli_img_start});
                            }
-                            output->entries.emplace_back(std::move(chunks[y * n_col + x]));
+                            output.emplace_back(std::move(chunks[y * n_col + x]));
                            if (ctx->tok_sli_img_end != LLAMA_TOKEN_NULL) {
                                add_text_chunk({ctx->tok_sli_img_end});
                            }
@@ -373,33 +325,30 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
            } else {
                size_t n_tokens = 0;
                for (const auto & entry : batch_f32.entries) {
-                    n_tokens += clip_n_output_tokens(ctx->ctx_clip, entry.get());
+                    n_tokens += clip_n_patches_by_img(ctx->ctx_clip, entry.get());
                }

                mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
-                if (ctx->use_mrope) {
-                    // for Qwen2VL, we need this information for M-RoPE decoding positions
-                    image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_clip, batch_f32.entries[0].get());
-                    image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_clip, batch_f32.entries[0].get());
-                    image_tokens->use_mrope_pos = true;
-                } else {
-                    // other models, we only need the total number of tokens
-                    image_tokens->nx = n_tokens;
-                    image_tokens->ny = 1;
-                }
+                image_tokens->nx = n_tokens;
+                image_tokens->ny = 1; // TODO
                image_tokens->batch_f32 = std::move(batch_f32);
-                image_tokens->id = bitmaps[i_img]->id; // optional
+                image_tokens->id = bitmaps[i_img].id; // optional

                LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx);
                LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny);
                LOG_DBG("batch_f32 size = %d\n", (int)image_tokens->batch_f32.entries.size());

+                if (clip_is_glm(ctx->ctx_clip)) {
+                    // glm-edge
+                    image_tokens->nx += 2; // add 2 for the begin_of_image and end_of_image token embeddings
+                }
+
                mtmd_input_chunk chunk{
                    MTMD_INPUT_CHUNK_TYPE_IMAGE,
                    {},
                    std::move(image_tokens),
                };
-                output->entries.emplace_back(std::move(chunk));
+                output.emplace_back(std::move(chunk));
            }

            i_img++; // move to next image
@@ -409,12 +358,28 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
    return 0;
 }

-static void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens) {
+void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens) { // external linkage now; null is accepted and ignored
    if (image_tokens) {
        delete image_tokens;
    }
 }

+size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens) { // image_tokens is dereferenced unchecked: must be non-null
+    return image_tokens->n_tokens(); // n_tokens() is nx * ny
+}
+
+size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens) { // image_tokens is dereferenced unchecked: must be non-null
+    return image_tokens->nx; // number of tokens in x direction
+}
+
+size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) { // image_tokens is dereferenced unchecked: must be non-null
+    return image_tokens->ny; // number of tokens in y direction
+}
+
+std::string mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) { // image_tokens is dereferenced unchecked: must be non-null
+    return image_tokens->id; // returned by value (copy of the optional user-defined ID)
+}
+
 int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
    int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
    ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
@@ -432,7 +397,7 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
        // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
        const auto & entries = image_tokens->batch_f32.entries;
        for (size_t i = 0; i < entries.size(); i++) {
-            int n_tokens_per_image = clip_n_output_tokens(ctx->ctx_clip, entries[i].get());
+            int n_tokens_per_image = clip_n_patches_by_img(ctx->ctx_clip, entries[i].get());
            ok = clip_image_encode(
                ctx->ctx_clip,
                ctx->n_threads,
@@ -454,18 +419,13 @@ float * mtmd_get_output_embd(mtmd_context * ctx) {
    return ctx->image_embd_v.data();
 }

-size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks) {
+size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks) {
    size_t n_tokens = 0;
-    for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) {
-        auto chunk = mtmd_input_chunks_get(chunks, i);
-        auto chunk_type = mtmd_input_chunk_get_type(chunk);
-        if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
-            size_t n_tokens_text;
-            mtmd_input_chunk_get_tokens_text(chunk, &n_tokens_text);
-            n_tokens += n_tokens_text;
-        } else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
-            auto tokens_image = mtmd_input_chunk_get_tokens_image(chunk);
-            n_tokens += mtmd_image_tokens_get_n_tokens(tokens_image);
+    for (auto & chunk : chunks) {
+        if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+            n_tokens += chunk.tokens_text.size();
+        } else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+            n_tokens += chunk.tokens_image->n_tokens();
        } else {
            GGML_ASSERT(false && "chunk type not supported");
        }
@@ -473,43 +433,22 @@ size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks) {
    return n_tokens;
 }

-llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks) {
-    llama_pos n_pos = 0;
-    for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) {
-        auto chunk = mtmd_input_chunks_get(chunks, i);
-        auto chunk_type = mtmd_input_chunk_get_type(chunk);
-        if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
-            size_t n_tokens_text;
-            mtmd_input_chunk_get_tokens_text(chunk, &n_tokens_text);
-            n_pos += n_tokens_text;
-        } else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
-            auto tokens_image = mtmd_input_chunk_get_tokens_image(chunk);
-            n_pos += mtmd_image_tokens_get_n_pos(tokens_image);
-        } else {
-            GGML_ASSERT(false && "chunk type not supported");
-        }
-    }
-    return n_pos;
-}
-
 // helper struct to make working with embd batch easier
 // note: this will be removed after llama_batch_ext refactoring
 struct decode_embd_batch {
-    int n_pos_per_embd;
-    int n_mmproj_embd;
    std::vector<llama_pos>      pos;
-    std::vector<llama_pos>      pos_view; // used by mrope
    std::vector<int32_t>        n_seq_id;
    std::vector<llama_seq_id>   seq_id_0;
    std::vector<llama_seq_id *> seq_ids;
    std::vector<int8_t>         logits;
    llama_batch batch;
-    decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
-        pos     .resize(n_tokens * n_pos_per_embd);
+    decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+        pos     .resize(n_tokens);
        n_seq_id.resize(n_tokens);
        seq_ids .resize(n_tokens + 1);
        logits  .resize(n_tokens);
        seq_id_0.resize(1);
+        seq_id_0[0] = seq_id;
        seq_ids [n_tokens] = nullptr;
        batch = {
            /*n_tokens       =*/ n_tokens,
@@ -520,232 +459,140 @@ struct decode_embd_batch {
            /*seq_id         =*/ seq_ids.data(),
            /*logits         =*/ logits.data(),
        };
-    }
-
-    void set_position_normal(llama_pos pos_0, llama_seq_id seq_id) {
-        seq_id_0[0] = seq_id;
-        for (int i = 0; i < batch.n_tokens; i++) {
+        for (int i = 0; i < n_tokens; i++) {
            batch.pos     [i] = pos_0 + i;
            batch.n_seq_id[i] = 1;
            batch.seq_id  [i] = seq_id_0.data();
            batch.logits  [i] = false;
        }
    }
-
-    void set_position_mrope(llama_pos pos_0, int nx, int ny, llama_seq_id seq_id) {
-        GGML_ASSERT(n_pos_per_embd == 4);
-        seq_id_0[0] = seq_id;
-        for (int y = 0; y < ny; y++) {
-            for (int x = 0; x < nx; x++) {
-                int i = y * nx + x;
-                pos[i                     ] = pos_0;
-                pos[i + batch.n_tokens    ] = pos_0 + y;
-                pos[i + batch.n_tokens * 2] = pos_0 + x;
-                pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
-            }
-        }
-        for (int i = 0; i < batch.n_tokens; i++) {
-            batch.n_seq_id[i] = 1;
-            batch.seq_id  [i] = seq_id_0.data();
-            batch.logits  [i] = false;
-        }
-    }
-
-    llama_batch get_view(int offset, int n_tokens) {
-        llama_pos * pos_ptr;
-        pos_view.clear();
-        pos_view.resize(n_tokens * n_pos_per_embd);
-        if (n_pos_per_embd > 1) {
-            // mrope
-            // for example, with layout of src: 1234...1234...1234...1234...
-            //       offset 2 will give us dst: 34...34...34...34...
-            for (int i = 0; i < n_pos_per_embd; i++) {
-                auto src = pos.begin() + i * batch.n_tokens + offset;
-                pos_view.insert(pos_view.end(), src, src + n_tokens);
-            }
-            pos_ptr = pos_view.data();
-        } else {
-            // normal
-            pos_ptr = pos.data() + offset;
-        }
-        return {
-            /*n_tokens       =*/ n_tokens,
-            /*tokens         =*/ nullptr,
-            /*embd           =*/ batch.embd     + offset * n_mmproj_embd,
-            /*pos            =*/ pos_ptr,
-            /*n_seq_id       =*/ batch.n_seq_id + offset,
-            /*seq_id         =*/ batch.seq_id   + offset,
-            /*logits         =*/ batch.logits   + offset,
-        };
-    }
 };

-int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
-        struct llama_context * lctx,
-        const mtmd_input_chunk * chunk,
-        llama_pos n_past,
+int32_t mtmd_helper_eval(mtmd_context * ctx, // decodes every chunk in order; returns 0, or the first non-zero mtmd_encode/llama_decode code
+        llama_context * lctx,
+        mtmd_input_chunks & chunks,
+        llama_pos pos0, // position assigned to the first token; later tokens count up from here
        llama_seq_id seq_id,
-        int32_t n_batch,
-        bool logits_last,
-        llama_pos * new_n_past) {
+        int32_t n_batch) { // maximum number of tokens/embeddings submitted per llama_decode call
    int32_t ret;
+    llama_pos n_past = pos0;
    llama_batch text_batch = llama_batch_init(n_batch, 0, 1);
-    auto chunk_type = mtmd_input_chunk_get_type(chunk);
    int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
-    int n_pos_per_embd = mtmd_decode_use_mrope(ctx) ? 4 : 1;

-    if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
-        size_t n_tokens;
-        const auto tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
-        LOG_DBG("decoding text chunk, n_tokens = %zu\n", n_tokens);
-        size_t i = 0;
-        while (i < n_tokens) { // split into batches
-            text_batch.n_tokens = 0; // clear the batch
-            for (; i < n_tokens && text_batch.n_tokens < n_batch; i++) {
-                text_batch.n_tokens++;
-                text_batch.token   [i]    = tokens[i];
-                text_batch.pos     [i]    = n_past++;
-                text_batch.n_seq_id[i]    = 1;
-                text_batch.seq_id  [i][0] = seq_id;
-                text_batch.logits  [i]    = false;
-            }
-            bool is_last_token = (i == n_tokens);
-            if (logits_last && is_last_token) {
-                text_batch.logits[text_batch.n_tokens - 1] = true;
-            }
-            ret = llama_decode(lctx, text_batch);
-            if (ret != 0) {
-                LOG_ERR("failed to decode text\n");
-                llama_batch_free(text_batch);
-                return ret;
-            }
-            *new_n_past += text_batch.n_tokens;
-        }
-
-    } else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
-        const auto image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
-        int64_t t0 = ggml_time_ms();
-        if (ctx->print_timings) {
-            LOG_INF("encoding image or slice...\n");
-        }
-        ret = mtmd_encode(ctx, image_tokens);
-        if (ret != 0) {
-            LOG_ERR("failed to encode image\n");
-            llama_batch_free(text_batch);
-            return ret;
-        }
-        if (ctx->print_timings) {
-            LOG_INF("image/slice encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
-        }
-
-        int32_t n_tokens = mtmd_image_tokens_get_n_tokens(image_tokens);
-        int32_t i_batch = 0;
-        int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch;
-        float * embd = mtmd_get_output_embd(ctx);
-        decode_embd_batch batch_embd(embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
-
-        const int nx = mtmd_image_tokens_get_nx(image_tokens);
-        const int ny = mtmd_image_tokens_get_ny(image_tokens);
-
-        if (mtmd_decode_use_mrope(ctx)) {
-            batch_embd.set_position_mrope(n_past, nx, ny, seq_id);
-        } else {
-            batch_embd.set_position_normal(n_past, seq_id);
-        }
-
-        if (mtmd_decode_use_non_causal(ctx)) {
-            llama_set_causal_attn(lctx, false);
-            // TODO @ngxson : need to make sure only one image is processed at a time, and n_ubatch must be enough to hold the image
-        }
-
-        while (i_batch < n_img_batches) { // split into batches
-            int pos_offset = i_batch*n_batch;
-            int n_tokens_batch = std::min(n_batch, n_tokens - pos_offset);
-            llama_batch batch_embd_view = batch_embd.get_view(pos_offset, n_tokens_batch);
-
-            LOG_INF("decoding image batch %d/%d, n_tokens_batch = %d\n", i_batch+1, n_img_batches, n_tokens_batch);
-
-            int64_t t1 = ggml_time_ms();
-            ret = llama_decode(lctx, batch_embd_view);
-            if (ret != 0) {
-                LOG_ERR("failed to decode image\n");
-                llama_set_causal_attn(lctx, true); // restore causal attn
-                llama_batch_free(text_batch);
-                return ret;
+    for (auto & chunk : chunks) {
+        bool is_last = &chunk == &chunks.back();
+        if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+            size_t i = 0, n_text = chunk.tokens_text.size();
+            while (i < n_text) { // split into batches of at most n_batch tokens
+                int32_t & j = text_batch.n_tokens; // batch-local slot; doubles as the batch's token count
+                for (j = 0; i < n_text && j < n_batch; i++, j++) {
+                    text_batch.token   [j]    = chunk.tokens_text[i];
+                    text_batch.pos     [j]    = n_past++;
+                    text_batch.n_seq_id[j]    = 1;
+                    text_batch.seq_id  [j][0] = seq_id;
+                    text_batch.logits  [j]    = false;
+                }
+                if (is_last && i == n_text) {
+                    // only the final batch of the last input chunk produces logits
+                    text_batch.logits[text_batch.n_tokens - 1] = true;
+                }
+                ret = llama_decode(lctx, text_batch);
+                if (ret != 0) {
+                    LOG_ERR("failed to decode text\n");
+                    llama_batch_free(text_batch);
+                    return ret;
+                }
            }

+        } else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+            GGML_ASSERT(!is_last && "logits for last image chunk is not yet support");
+            GGML_ASSERT(chunk.tokens_image != nullptr);
+            int64_t t0 = ggml_time_ms();
            if (ctx->print_timings) {
-                LOG_INF("image decoded (batch %d/%d) in %" PRId64 " ms\n", i_batch+1, n_img_batches, ggml_time_ms() - t1);
+                LOG_INF("encoding image or slice...\n");
+            }
+            ret = mtmd_encode(ctx, chunk.tokens_image.get());
+            if (ret != 0) {
+                LOG_ERR("failed to encode image\n");
+                llama_batch_free(text_batch);
+                return ret;
+            }
+            if (ctx->print_timings) {
+                LOG_INF("image/slice encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
            }

-            i_batch++;
+            int32_t n_tokens = mtmd_image_tokens_get_n_tokens(chunk.tokens_image.get());
+            int32_t i_batch = 0;
+            int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch;
+            float * embd = mtmd_get_output_embd(ctx);
+
+            if (mtmd_decode_use_non_causal(ctx)) {
+                llama_set_causal_attn(lctx, false);
+                // TODO @ngxson : need to make sure only one image is processed at a time, and n_ubatch must be enough to hold the image
+            }
+
+            while (i_batch < n_img_batches) { // split into batches
+                int32_t pos_offset = i_batch*n_batch;
+                int32_t n_tokens_batch = std::min(n_batch, n_tokens - pos_offset);
+                float * embd_batch = embd + pos_offset*n_mmproj_embd;
+                decode_embd_batch batch_img(embd_batch, n_tokens_batch, n_past, seq_id); // same sequence as the text chunks
+
+                LOG_INF("decoding image batch %d/%d, n_tokens_batch = %d\n", i_batch+1, n_img_batches, n_tokens_batch);
+
+                int64_t t1 = ggml_time_ms();
+                ret = llama_decode(lctx, batch_img.batch);
+                if (ret != 0) {
+                    LOG_ERR("failed to decode image\n");
+                    llama_set_causal_attn(lctx, true); // restore causal attn
+                    llama_batch_free(text_batch);
+                    return ret;
+                }
+
+                if (ctx->print_timings) {
+                    LOG_INF("image decoded (batch %d/%d) in %" PRId64 " ms\n", i_batch+1, n_img_batches, ggml_time_ms() - t1);
+                }
+
+                i_batch++;
+                n_past += n_tokens_batch;
+            }
+
+            if (mtmd_decode_use_non_causal(ctx)) {
+                llama_set_causal_attn(lctx, true);
+            }
+
+        } else {
+            GGML_ASSERT(false && "chunk type not supported");
        }
-
-        n_past += mtmd_image_tokens_get_n_pos(image_tokens);
-        *new_n_past = n_past;
-
-        if (mtmd_decode_use_non_causal(ctx)) {
-            llama_set_causal_attn(lctx, true);
-        }
-
-    } else {
-        GGML_ABORT("chunk type not supported");
    }

+    llama_batch_free(text_batch);
    return 0;
 }

-int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
-                                struct llama_context * lctx,
-                                const mtmd_input_chunks * chunks,
-                                llama_pos n_past,
-                                llama_seq_id seq_id,
-                                int32_t n_batch,
-                                bool logits_last,
-                                llama_pos * new_n_past) {
-    size_t n_chunks = mtmd_input_chunks_size(chunks);
-    if (n_chunks == 0) {
-        LOG_WRN("no chunks to eval\n");
-        return 0;
-    }
-
-    for (size_t i = 0; i < n_chunks; i++) {
-        bool chunk_logits_last = (i == n_chunks - 1) && logits_last;
-        auto chunk = mtmd_input_chunks_get(chunks, i);
-
-        int32_t res = mtmd_helper_eval_chunk_single(ctx, lctx, chunk, n_past, seq_id, n_batch, chunk_logits_last, &n_past);
-        if (res != 0) {
-            LOG_ERR("failed to eval chunk %zu\n", i);
-            return res;
-        }
-        *new_n_past = n_past;
-    }
-
-    return 0;
-}
-
-mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len) {
+int32_t mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len, mtmd_bitmap & output) { // fills output.nx/ny/data; returns 0 on success, 1 on load failure
    clip_image_u8_ptr img_u8(clip_image_u8_init());
    bool ok = clip_image_load_from_bytes(buf, len, img_u8.get());
    if (!ok) {
        LOG_ERR("Unable to load image from buffer\n");
-        return nullptr;
+        return 1;
    }
-    uint32_t nx, ny;
-    unsigned char * data = clip_image_u8_get_data(img_u8.get(), &nx, &ny);
-    return mtmd_bitmap_init(nx, ny, data);
+    unsigned char * data = clip_image_u8_get_data(img_u8.get(), &output.nx, &output.ny);
+    output.data.resize((size_t)output.nx * output.ny * 3); // 3 bytes per pixel; widen first so the product cannot wrap in 32 bits
+    std::memcpy(output.data.data(), data, output.data.size());
+    return 0;
 }

-mtmd_bitmap * mtmd_helper_bitmap_init_from_file(const char * fname) {
+int32_t mtmd_helper_bitmap_init_from_file(const char * fname, mtmd_bitmap & output) { // fills output.nx/ny/data; returns 0 on success, 1 on load failure
    clip_image_u8_ptr img_u8(clip_image_u8_init());
    bool ok = clip_image_load_from_file(fname, img_u8.get());
    if (!ok) {
        LOG_ERR("Unable to load image %s\n", fname);
-        return nullptr;
+        return 1;
    }
-    uint32_t nx, ny;
-    unsigned char * data = clip_image_u8_get_data(img_u8.get(), &nx, &ny);
-    return mtmd_bitmap_init(nx, ny, data);
+    unsigned char * data = clip_image_u8_get_data(img_u8.get(), &output.nx, &output.ny);
+    output.data.resize((size_t)output.nx * output.ny * 3); // 3 bytes per pixel; widen first so the product cannot wrap in 32 bits
+    std::memcpy(output.data.data(), data, output.data.size());
+    return 0;
 }

 bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
@@ -756,182 +603,6 @@ bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
    return false;
 }

-bool mtmd_decode_use_mrope(mtmd_context * ctx) {
-    return ctx->use_mrope;
-}
-
 void mtmd_image_tokens_deleter::operator()(mtmd_image_tokens * val) {
    mtmd_image_tokens_free(val);
 }
-
-
-//
-// public API functions
-//
-
-// mtmd_bitmap
-
-mtmd_bitmap * mtmd_bitmap_init(uint32_t nx,
-                               uint32_t ny,
-                               const unsigned char * data) {
-    mtmd_bitmap * bitmap = new mtmd_bitmap;
-    bitmap->nx = nx;
-    bitmap->ny = ny;
-    size_t data_size = (size_t)nx * ny * 3;
-    bitmap->data.resize(data_size);
-    std::memcpy(bitmap->data.data(), data, data_size);
-    return bitmap;
-}
-
-uint32_t mtmd_bitmap_get_nx(const mtmd_bitmap * bitmap) {
-    return bitmap->nx;
-}
-
-uint32_t mtmd_bitmap_get_ny(const mtmd_bitmap * bitmap) {
-    return bitmap->ny;
-}
-
-const unsigned char * mtmd_bitmap_get_data(const mtmd_bitmap * bitmap) {
-    return bitmap->data.data();
-}
-
-const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap) {
-    return bitmap->id.c_str();
-}
-
-void mtmd_bitmap_set_id(mtmd_bitmap * bitmap, const char * id) {
-    if (id) {
-        bitmap->id = std::string(id);
-    } else {
-        bitmap->id.clear();
-    }
-}
-
-void mtmd_bitmap_free(mtmd_bitmap * bitmap) {
-    if (bitmap) {
-        delete bitmap;
-    }
-}
-
-// mtmd_input_chunks
-
-mtmd_input_chunks * mtmd_input_chunks_init() {
-    return new mtmd_input_chunks;
-}
-
-size_t mtmd_input_chunks_size(const mtmd_input_chunks * chunks) {
-    return chunks->entries.size();
-}
-
-const mtmd_input_chunk * mtmd_input_chunks_get(const mtmd_input_chunks * chunks, size_t idx) {
-    if (idx >= chunks->entries.size()) {
-        return nullptr;
-    }
-    return &chunks->entries[idx];
-}
-
-void mtmd_input_chunks_free(mtmd_input_chunks * chunks) {
-    if (chunks) {
-        delete chunks;
-    }
-}
-
-// mtmd_input_chunk
-
-enum mtmd_input_chunk_type mtmd_input_chunk_get_type(const mtmd_input_chunk * chunk) {
-    return chunk->type;
-}
-
-const llama_token * mtmd_input_chunk_get_tokens_text(const mtmd_input_chunk * chunk, size_t * n_tokens_output) {
-    if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
-        *n_tokens_output = chunk->tokens_text.size();
-        return chunk->tokens_text.data();
-    }
-    *n_tokens_output = 0;
-    return nullptr;
-}
-
-const mtmd_image_tokens * mtmd_input_chunk_get_tokens_image(const mtmd_input_chunk * chunk) {
-    if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
-        return chunk->tokens_image.get();
-    }
-    return nullptr;
-}
-
-mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk) {
-    mtmd_input_chunk * copy = new mtmd_input_chunk{
-        chunk->type,
-        chunk->tokens_text,
-        mtmd_image_tokens_ptr(),
-    };
-    if (chunk->tokens_image) {
-        // copy the image tokens
-        copy->tokens_image = mtmd_image_tokens_ptr(new mtmd_image_tokens());
-        *copy->tokens_image = chunk->tokens_image->clone();
-    }
-    return copy;
-}
-
-void mtmd_input_chunk_free(mtmd_input_chunk * chunk) {
-    if (chunk) {
-        delete chunk;
-    }
-}
-
-// mtmd_image_tokens
-
-size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens) {
-    return image_tokens->n_tokens();
-}
-
-size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens) {
-    return image_tokens->nx;
-}
-
-size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) {
-    return image_tokens->ny;
-}
-
-const char * mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) {
-    return image_tokens->id.c_str();
-}
-
-llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
-    if (image_tokens->use_mrope_pos) {
-        return 1; // for M-RoPE, the whole image is 1 in temporal dimension
-    }
-    return image_tokens->n_tokens();
-}
-
-// test function
-
-mtmd_input_chunks * mtmd_test_create_input_chunks() {
-    mtmd_input_chunks * chunks = mtmd_input_chunks_init();
-    if (!chunks) {
-        return nullptr;
-    }
-
-    // create a text chunk
-    std::vector<llama_token> tokens_text = { 1, 2, 3, 4, 5 };
-    mtmd_input_chunk chunk_text{
-        MTMD_INPUT_CHUNK_TYPE_TEXT,
-        std::move(tokens_text),
-        {},
-    };
-    chunks->entries.emplace_back(std::move(chunk_text));
-
-    // create an image chunk
-    mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
-    image_tokens->nx = 4;
-    image_tokens->ny = 4;
-    image_tokens->batch_f32.entries.resize(16);
-    image_tokens->id = "image_1";
-    mtmd_input_chunk chunk_image{
-        MTMD_INPUT_CHUNK_TYPE_IMAGE,
-        {},
-        std::move(image_tokens),
-    };
-    chunks->entries.emplace_back(std::move(chunk_image));
-
-    return chunks;
-}
--- a/examples/llava/mtmd.h
+++ b/examples/llava/mtmd.h
@@ -0,0 +1,161 @@
+#ifndef MTMD_H
+#define MTMD_H
+
+#include "ggml.h"
+#include "llama.h"
+#include "clip.h"
+
+#include <vector>
+#include <cinttypes>
+#include <memory>
+
+#ifdef LLAMA_SHARED
+#    if defined(_WIN32) && !defined(__MINGW32__)
+#        ifdef LLAMA_BUILD
+#            define MTMD_API __declspec(dllexport)
+#        else
+#            define MTMD_API __declspec(dllimport)
+#        endif
+#    else
+#        define MTMD_API __attribute__ ((visibility ("default")))
+#    endif
+#else
+#    define MTMD_API
+#endif
+
+#ifdef __cplusplus
+
+enum mtmd_input_chunk_type {
+    MTMD_INPUT_CHUNK_TYPE_TEXT,
+    MTMD_INPUT_CHUNK_TYPE_IMAGE,
+};
+
+struct mtmd_context;
+struct mtmd_image_tokens;
+
+// represents raw image data, layout is RGBRGBRGB...
+// length of data must be nx * ny * 3
+struct mtmd_bitmap {
+    uint32_t nx;
+    uint32_t ny;
+    std::vector<unsigned char> data;
+    std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking
+};
+
+struct mtmd_image_tokens_deleter {
+    void operator()(mtmd_image_tokens * val); // forward declaration
+};
+using mtmd_image_tokens_ptr = std::unique_ptr<mtmd_image_tokens, mtmd_image_tokens_deleter>;
+
+struct mtmd_input_chunk {
+    mtmd_input_chunk_type type;
+    std::vector<llama_token> tokens_text;
+    mtmd_image_tokens_ptr tokens_image;
+};
+
+using mtmd_input_chunks = std::vector<mtmd_input_chunk>;
+
+struct mtmd_context_params {
+    bool use_gpu = true;
+    bool print_timings = true;
+    int n_threads = 4;
+    enum ggml_log_level verbosity = GGML_LOG_LEVEL_INFO;
+    const char * image_marker = "<__image__>";
+};
+
+struct mtmd_input_text {
+    std::string text;
+    bool add_special;
+    bool parse_special;
+};
+
+// initialize the mtmd context
+// return nullptr on failure
+MTMD_API mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
+                                                const llama_model * text_model,
+                                                const mtmd_context_params ctx_params);
+
+MTMD_API void mtmd_free(mtmd_context * ctx);
+
+// tokenize an input text prompt and an image
+// the prompt must have the input image marker (default: "<__image__>") in it
+// the marker will be replaced with the image tokens
+// for example:
+//   "here is an image: <__image__>\ndescribe it in detail."
+//   this will give 3 chunks:
+//   1. "here is an image: <start_of_image>"
+//   2. (image tokens)
+//   3. "<end_of_image>\ndescribe it in detail."
+// number of bitmaps must be equal to the number of image markers in the prompt
+// this function is thread-safe (shared ctx)
+// return values:
+//   0 on success
+//   1 on number of images not matching the number of markers
+//   2 on image preprocessing error
+MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx,
+                                std::vector<mtmd_input_chunk> & output,
+                                const mtmd_input_text & text,
+                                const std::vector<mtmd_bitmap> & bitmaps);
+
+// access mtmd_image_tokens
+MTMD_API size_t      mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens);
+MTMD_API size_t      mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens);
+MTMD_API size_t      mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens);
+MTMD_API std::string mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens);
+MTMD_API void        mtmd_image_tokens_free(mtmd_image_tokens * image_tokens);
+
+// returns 0 on success
+MTMD_API int32_t mtmd_encode(mtmd_context * ctx,
+                            const mtmd_image_tokens * image_tokens);
+
+// get output embeddings from the last encode pass
+MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
+
+// whether we need to set non-causal mask before llama_decode
+MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx);
+
+
+
+//
+// helper functions (can be implemented based on other functions)
+//
+
+// helper to count the total number of tokens from a list of chunks, useful to keep track of n_past
+MTMD_API size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks);
+
+// helper function that automatically:
+// 1. run llama_decode() on text chunks
+// 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
+// if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error
+// otherwise, returns 0 on success
+MTMD_API int32_t mtmd_helper_eval(mtmd_context * ctx,
+                                llama_context * lctx,
+                                mtmd_input_chunks & chunks,
+                                llama_pos pos0,
+                                llama_seq_id seq_id,
+                                int32_t n_batch);
+
+// helper function to construct a mtmd_bitmap from a file
+// returns 0 on success
+// this function is thread-safe
+MTMD_API int32_t mtmd_helper_bitmap_init_from_file(const char * fname, mtmd_bitmap & output);
+
+// helper function to construct a mtmd_bitmap from a buffer
+// the buffer must be an image in format supported by stb_image (jpg, png, bmp, gif, etc.)
+// returns 0 on success
+// this function is thread-safe
+MTMD_API int32_t mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len, mtmd_bitmap & output);
+
+// convenient unique_ptr wrappers
+struct mtmd_context_deleter {
+    void operator()(mtmd_context * val) { mtmd_free(val); }
+};
+using mtmd_context_ptr = std::unique_ptr<mtmd_context, mtmd_context_deleter>;
+
+#else
+
+static_assert(false && "C header is not yet supported by this library");
+
+#endif
+
+#endif
--- a/examples/llava/qwen2_vl_surgery.py
+++ b/examples/llava/qwen2_vl_surgery.py
@@ -0,0 +1,217 @@
+import argparse
+from typing import Dict, List, Optional
+
+import torch
+import numpy as np
+from gguf import *
+from transformers import (
+    AutoProcessor,
+    Qwen2VLConfig,
+    Qwen2VLProcessor,
+    Qwen2VLForConditionalGeneration,
+    Qwen2_5_VLConfig, # type: ignore[reportAttributeAccessIssue]
+    Qwen2_5_VLForConditionalGeneration, # type: ignore[reportAttributeAccessIssue]
+)
+
+
+VISION = "clip.vision"
+
+
+def k(raw_key: str, arch: str) -> str:
+    return raw_key.format(arch=arch)
+
+
+def get_n_wa_pattern(fullatt_block_indexes: Optional[List[int]]):
+    # None -> 0; otherwise (first full-attention index + 1), after checking the indexes are evenly spaced
+    if fullatt_block_indexes is None:
+        return 0
+    n_wa = fullatt_block_indexes[0]
+    if any(nxt - prev - 1 != n_wa for prev, nxt in zip(fullatt_block_indexes, fullatt_block_indexes[1:])):
+        raise ValueError(
+            f"window/full attention layer should have fix pattern of "
+            f"for each full-attention layer followed by {n_wa} window-attention layers"
+        )
+    return n_wa + 1
+
+
+class VL2:
+
+    @staticmethod
+    def to_gguf_name(name: str) -> str:
+        og = name
+        name = name.replace("text_model", "t").replace("vision_model", "v")
+        name = name.replace("blocks", "blk").replace("embeddings.", "")
+        name = name.replace("attn.", "attn_")
+        name = name.replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("proj.", "out.")
+        # name = name.replace("layrnorm", "ln").replace("layer_norm", "ln").replace("layernorm", "ln")
+        name = name.replace("norm1", "ln1").replace("norm2", "ln2")
+        name = name.replace("merger.mlp", 'mm')
+        print(f"[to_gguf_name] {og} --> {name}")
+        return name
+
+    @classmethod
+    def find_vision_tensors(cls, qwen2vl, dtype) -> Dict[str, np.ndarray]:
+        vision_model = qwen2vl.visual
+        tensor_map = {}
+        for name, ten in vision_model.state_dict().items():
+            ten = ten.numpy()
+            if 'qkv' in name:
+                if ten.ndim == 2: # weight
+                    c3, _ = ten.shape
+                else:             # bias
+                    c3 = ten.shape[0]
+                assert c3 % 3 == 0
+                c = c3 // 3
+                wq = ten[:c]
+                wk = ten[c: c * 2]
+                wv = ten[c * 2:]
+                tensor_map[cls.to_gguf_name(f"vision_model.{name}").replace("qkv", "q")] = wq
+                tensor_map[cls.to_gguf_name(f"vision_model.{name}").replace("qkv", "k")] = wk
+                tensor_map[cls.to_gguf_name(f"vision_model.{name}").replace("qkv", "v")] = wv
+            elif 'merger' in name:
+                if name.endswith("ln_q.weight"):
+                    tensor_map['v.post_ln.weight'] = ten
+                elif name.endswith("ln_q.bias"):
+                    tensor_map['v.post_ln.bias'] = ten
+                else:
+                    # "merger.mlp.%d.weight/bias" --> "mm.%d.weight/bias"
+                    tensor_map[cls.to_gguf_name(name)] = ten
+            elif 'patch_embed.proj.weight' in name:
+                # NOTE: split Conv3D into Conv2Ds
+                c1, c2, kt, kh, kw = ten.shape
+                assert kt == 2, "Current implmentation only support temporal_patch_size of 2"
+                tensor_map["v.patch_embd.weight"] = ten[:, :, 0, ...]
+                tensor_map["v.patch_embd.weight.1"] = ten[:, :, 1, ...]
+            else:
+                tensor_map[cls.to_gguf_name(f"vision_model.{name}")] = ten
+
+        for new_name, ten in tensor_map.items():
+            if ten.ndim <= 1 or new_name.endswith("_norm.weight"):
+                tensor_map[new_name] = ten.astype(np.float32)
+            else:
+                tensor_map[new_name] = ten.astype(dtype)
+        tensor_map["v.position_embd.weight"] = np.zeros([10, 10], dtype=np.float32)  # dummy tensor, just here as a placeholder
+        return tensor_map
+
+
+class VL25(VL2):
+
+    @staticmethod
+    def to_gguf_name(name: str) -> str:
+        og = name
+        name = name.replace("text_model", "t").replace("vision_model", "v")
+        name = name.replace("blocks", "blk").replace("embeddings.", "")
+        name = name.replace("attn.", "attn_")
+        name = name.replace("mlp.down_proj", "ffn_down").replace("mlp.up_proj", "ffn_up")
+        name = name.replace("mlp.gate_proj", "ffn_gate").replace("proj.", "out.")
+        name = name.replace("norm1", "ln1").replace("norm2", "ln2")
+        name = name.replace("merger.mlp", 'mm')
+        print(f"[vl25][to_gguf_name] {og} --> {name}")
+        return name
+
+
+def main(args):
+    if args.data_type == 'fp32':
+        dtype = torch.float32
+        np_dtype = np.float32
+        ftype = 0
+    elif args.data_type == 'fp16':
+        dtype = torch.float16
+        np_dtype = np.float16
+        ftype = 1
+    else:
+        raise ValueError()
+
+    local_model = False
+    model_path = ""
+    model_name = args.model_name
+    print("model_name: ", model_name)
+    if args.model_type == "qwen2vl":
+        qwen2vl = Qwen2VLForConditionalGeneration.from_pretrained(
+            model_name, torch_dtype=dtype, device_map="cpu"
+        )
+        cfg: Qwen2VLConfig = qwen2vl.config  # type: ignore[reportAssignmentType]
+        vcfg = cfg.vision_config
+    else:
+        qwen2vl = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+            model_name, torch_dtype=dtype, device_map="cpu"
+        )
+        cfg: Qwen2_5_VLConfig = qwen2vl.config  # type: ignore[reportAssignmentType]
+        vcfg = cfg.vision_config
+
+    if os.path.isdir(model_name):
+        local_model = True
+        if model_name.endswith(os.sep):
+            model_name = model_name[:-1]
+        model_path = model_name
+        model_name = os.path.basename(model_name)
+    fname_out = f"{model_name.replace('/', '-').lower()}-vision.gguf"
+
+    fout = GGUFWriter(path=fname_out, arch="clip")
+    fout.add_description("image encoder for Qwen2VL")
+
+    fout.add_file_type(ftype)
+    fout.add_bool("clip.has_text_encoder", False)
+    fout.add_bool("clip.has_vision_encoder", True)
+    fout.add_bool("clip.has_qwen2vl_merger", True)
+
+    print(cfg.vision_config)
+    if 'silu' in cfg.vision_config.hidden_act.lower():
+        fout.add_bool("clip.use_silu", True)
+        fout.add_bool("clip.use_gelu", False)
+    elif 'gelu' in cfg.vision_config.hidden_act.lower():
+        fout.add_bool("clip.use_silu", False)
+        fout.add_bool("clip.use_gelu", 'quick' not in cfg.vision_config.hidden_act.lower())
+    else:
+        raise ValueError()
+
+    if args.model_type == "qwen2.5vl":
+        fout.add_uint32("clip.vision.n_wa_pattern", get_n_wa_pattern(vcfg.fullatt_block_indexes))
+        fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.hidden_size)
+        fout.add_uint32("clip.vision.projection_dim", vcfg.out_hidden_size)
+        fout.add_string("clip.projector_type", "qwen2.5vl_merger")
+    else:
+        fout.add_string("clip.projector_type", "qwen2vl_merger")
+        fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.embed_dim)
+        fout.add_uint32("clip.vision.projection_dim", vcfg.hidden_size)
+
+    if args.model_type == "qwen2.5vl":
+        tensor_map = VL25.find_vision_tensors(qwen2vl, np_dtype)
+    else:
+        tensor_map = VL2.find_vision_tensors(qwen2vl, np_dtype)
+    for name, data in tensor_map.items():
+        fout.add_tensor(name, data)
+
+    fout.add_uint32("clip.vision.patch_size", vcfg.patch_size)
+    fout.add_uint32("clip.vision.image_size", 14 * 40)  # some reasonable size that is divable by (14*2)
+    fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), vcfg.num_heads)
+    fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
+    fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), vcfg.depth)
+    fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), 0)  # not sure what this does, put 0 here as a placeholder
+    fout.add_name(model_name)
+    """
+    HACK: Since vision rope related parameter aren't stored in the `Qwen2VLConfig,
+            it will be hardcoded in the `clip_image_build_graph` from `clip.cpp`.
+    """
+
+    if local_model:
+        processor: Qwen2VLProcessor = AutoProcessor.from_pretrained(model_path)
+    else:
+        processor: Qwen2VLProcessor = AutoProcessor.from_pretrained(model_name)
+    fout.add_array("clip.vision.image_mean", processor.image_processor.image_mean) # type: ignore[reportAttributeAccessIssue]
+    fout.add_array("clip.vision.image_std", processor.image_processor.image_std) # type: ignore[reportAttributeAccessIssue]
+
+    fout.write_header_to_file()
+    fout.write_kv_data_to_file()
+    fout.write_tensors_to_file()
+    fout.close()
+    print("save model as: ", fname_out)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("model_name", nargs='?', default="Qwen/Qwen2-VL-2B-Instruct")
+    parser.add_argument("--model_type", nargs='?', choices=['qwen2vl', 'qwen2.5vl'], default="qwen2vl")
+    parser.add_argument("--data_type", nargs='?', choices=['fp32', 'fp16'], default="fp32")
+    args = parser.parse_args()
+    main(args)
--- a/examples/llava/qwen2vl-cli.cpp
+++ b/examples/llava/qwen2vl-cli.cpp
@@ -27,8 +27,6 @@
 #include <cassert>
 #include <cmath>

-// THIS FILE IS ONLY USED FOR TESTING THE QWEN2VL MODEL
-// IT IS NOT A PRODUCTION CODE

 static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed,
                                     int n_batch, int * n_past, int * st_pos_id, struct clip_image_size * image_size) {
@@ -94,12 +92,20 @@ static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct lla

 static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past, int * st_pos_id) {
    int N = (int) tokens.size();
+    std::vector<llama_pos> pos;
    for (int i = 0; i < N; i += n_batch) {
        int n_eval = (int) tokens.size() - i;
        if (n_eval > n_batch) {
            n_eval = n_batch;
        }
        auto batch = llama_batch_get_one(&tokens[i], n_eval);
+        // TODO: add mrope pos ids somewhere else
+        pos.resize(batch.n_tokens * 4);
+        std::fill(pos.begin(), pos.end(), 0);
+        for (int j = 0; j < batch.n_tokens * 3; j ++) {
+            pos[j] = *st_pos_id + (j % batch.n_tokens);
+        }
+        batch.pos = pos.data();

        if (llama_decode(ctx_llama, batch)) {
            LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
--- a/examples/llava/requirements.txt
+++ b/examples/llava/requirements.txt
--- a/examples/llava/test-1.jpeg
+++ b/examples/llava/test-1.jpeg
--- a/examples/llava/tests.sh
+++ b/examples/llava/tests.sh
@@ -36,6 +36,12 @@ add_test() {
    arr_tmpl+=("$tmpl")
 }

+# same as add_test, but a no-op unless RUN_BIG_TESTS is "true"
+add_test_big() {
+    [ "$RUN_BIG_TESTS" = true ] || return 0
+    add_test "$@"
+}
+
 add_test "llama-mtmd-cli"  "ggml-org/SmolVLM-500M-Instruct-GGUF:Q8_0"
 add_test "llama-mtmd-cli"  "ggml-org/SmolVLM2-2.2B-Instruct-GGUF:Q4_K_M"
 add_test "llama-mtmd-cli"  "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF:Q8_0"
@@ -48,20 +54,11 @@ add_test "llama-mtmd-cli"  "ibm-research/granite-vision-3.2-2b-GGUF:Q4_K_M"
 add_test "llama-mtmd-cli"  "second-state/MiniCPM-Llama3-V-2_5-GGUF:Q2_K"  # model from openbmb is corrupted
 add_test "llama-mtmd-cli"  "openbmb/MiniCPM-V-2_6-gguf:Q2_K"
 add_test "llama-mtmd-cli"  "openbmb/MiniCPM-o-2_6-gguf:Q4_0"
-add_test "llama-mtmd-cli"  "bartowski/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
-add_test "llama-mtmd-cli"  "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
+add_test "llama-qwen2vl-cli"  "bartowski/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
+add_test "llama-qwen2vl-cli"  "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"

 # to test the big models, run: ./tests.sh big
-if [ "$RUN_BIG_TESTS" = true ]; then
-    add_test "llama-mtmd-cli" "ggml-org/pixtral-12b-GGUF:Q4_K_M"
-    add_test "llama-mtmd-cli" "ggml-org/Mistral-Small-3.1-24B-Instruct-2503-GGUF" "mistral-v7"
-    add_test "llama-mtmd-cli" "ggml-org/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
-    add_test "llama-mtmd-cli" "ggml-org/Qwen2-VL-7B-Instruct-GGUF:Q4_K_M"
-    add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
-    add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-7B-Instruct-GGUF:Q4_K_M"
-    # add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-32B-Instruct-GGUF:Q4_K_M" # does not work on my mac M3 Ultra
-    # add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-72B-Instruct-GGUF:Q4_K_M" # too big
-fi
+add_test_big "llama-mtmd-cli" "ggml-org/pixtral-12b-GGUF:Q4_K_M"

 # these models always give the wrong answer, not sure why
 # add_test "llama-mtmd-cli"  "ggml-org/SmolVLM-Instruct-GGUF:Q4_K_M"
--- a/examples/main/CMakeLists.txt
+++ b/examples/main/CMakeLists.txt
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -1,4 +1,4 @@
-# llama.cpp/tools/main
+# llama.cpp/examples/main

 This example program allows you to use various LLaMA language models easily and efficiently. It is specifically designed to work with the [llama.cpp](https://github.com/ggml-org/llama.cpp) project, which provides a plain C/C++ implementation with optional 4-bit quantization support for faster, lower memory inference, and is optimized for desktop CPUs. This program can be used to perform various inference tasks with LLaMA models, including generating text based on user-provided prompts and chat-like interactions with reverse prompts.

--- a/Show More
+++ b/Show More