llama : one-off chat template fix for Mistral-Small-2503 (#13398)

* llama : one-off chat template fix for Mistral-Small-2503
* update readme
* add mistral-v7-tekken
rpc : add rpc_msg_set_tensor_hash_req (#13353)
2026-02-05 13:53:23 +02:00 · 2025-05-09 11:17:51 +02:00 · 2025-05-09 10:31:07 +03:00 · 2025-05-09 09:23:41 +02:00 · 2025-05-09 09:06:37 +02:00 · 2025-05-08 23:45:22 +02:00
540 changed files with 53126 additions and 38689 deletions
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -13,6 +13,7 @@ Checks: >
    -readability-magic-numbers,
    -readability-uppercase-literal-suffix,
    -readability-simplify-boolean-expr,
+    -readability-math-missing-parentheses,
    clang-analyzer-*,
    -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
    performance-*,
--- a/.devops/cpu.Dockerfile
+++ b/.devops/cpu.Dockerfile
@@ -14,9 +14,9 @@ WORKDIR /app
 COPY . .

 RUN if [ "$TARGETARCH" = "amd64" ]; then \
-        cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
+        cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
    elif [ "$TARGETARCH" = "arm64" ]; then \
-        cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=${GGML_CPU_ARM_ARCH}; \
+        cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_CPU_ARM_ARCH=${GGML_CPU_ARM_ARCH}; \
    else \
        echo "Unsupported architecture"; \
        exit 1; \
--- a/.devops/cuda.Dockerfile
+++ b/.devops/cuda.Dockerfile
@@ -21,7 +21,7 @@ COPY . .
 RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
    export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
    fi && \
-    cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
    cmake --build build --config Release -j$(nproc)

 RUN mkdir -p /app/lib && \
--- a/.devops/intel.Dockerfile
+++ b/.devops/intel.Dockerfile
@@ -17,7 +17,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
        && export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
    fi && \
    echo "Building with dynamic libs" && \
-    cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${OPT_SYCL_F16} && \
    cmake --build build --config Release -j$(nproc)

 RUN mkdir -p /app/lib && \
--- a/.devops/llama-cli-cann.Dockerfile
+++ b/.devops/llama-cli-cann.Dockerfile
@@ -1,4 +1,4 @@
-ARG ASCEND_VERSION=8.0.rc2.alpha003-910b-openeuler22.03-py3.8
+ARG ASCEND_VERSION=8.1.RC1.alpha001-910b-openeuler22.03-py3.10

 FROM ascendai/cann:$ASCEND_VERSION AS build

@@ -6,7 +6,7 @@ WORKDIR /app

 COPY . .

-RUN yum install -y gcc g++ cmake make
+RUN yum install -y gcc g++ cmake make libcurl-devel
 ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
 ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
 ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
@@ -22,7 +22,7 @@ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH

 RUN echo "Building with static libs" && \
    source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
-    cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF  && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_TESTS=OFF  && \
    cmake --build build --config Release --target llama-cli

 # TODO: use image with NNRT
--- a/.devops/musa.Dockerfile
+++ b/.devops/musa.Dockerfile
@@ -35,7 +35,7 @@ COPY . .
 RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
        export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
    fi && \
-    cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
    cmake --build build --config Release -j$(nproc)

 RUN mkdir -p /app/lib && \
--- a/.devops/rocm.Dockerfile
+++ b/.devops/rocm.Dockerfile
@@ -17,8 +17,8 @@ FROM ${BASE_ROCM_DEV_CONTAINER} AS build
 # gfx906 is deprecated
 #check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.2.4/reference/system-requirements.html

-#ARG ROCM_DOCKER_ARCH='gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102'
-ARG ROCM_DOCKER_ARCH=gfx1100
+ARG ROCM_DOCKER_ARCH='gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102'
+#ARG ROCM_DOCKER_ARCH=gfx1100

 # Set nvcc architectured
 ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
@@ -40,7 +40,7 @@ WORKDIR /app
 COPY . .

 RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
-    cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON \
+    cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_BUILD_TESTS=OFF \
    && cmake --build build --config Release -j$(nproc)

 RUN mkdir -p /app/lib \
--- a/.devops/vulkan.Dockerfile
+++ b/.devops/vulkan.Dockerfile
@@ -16,7 +16,7 @@ WORKDIR /app

 COPY . .

-RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
+RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1  -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON && \
    cmake --build build --config Release -j$(nproc)

 RUN mkdir -p /app/lib && \
--- a/.editorconfig
+++ b/.editorconfig
@@ -21,15 +21,15 @@ indent_style = tab
 [prompts/*.txt]
 insert_final_newline = unset

-[examples/server/public/*]
+[tools/server/public/*]
 indent_size = 2

-[examples/server/public/deps_*]
+[tools/server/public/deps_*]
 trim_trailing_whitespace = unset
 indent_style = unset
 indent_size = unset

-[examples/server/deps_*]
+[tools/server/deps_*]
 trim_trailing_whitespace = unset
 indent_style = unset
 indent_size = unset
@@ -37,7 +37,7 @@ indent_size = unset
 [examples/llama.swiftui/llama.swiftui.xcodeproj/*]
 indent_style = tab

-[examples/cvector-generator/*.txt]
+[tools/cvector-generator/*.txt]
 trim_trailing_whitespace = unset
 insert_final_newline = unset

--- a/.flake8
+++ b/.flake8
@@ -2,8 +2,9 @@
 max-line-length = 125
 ignore = E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503
 exclude =
-    # Do not traverse examples
+    # Do not traverse examples and tools
    examples,
+    tools,
    # Do not include package initializers
    __init__.py,
    # No need to traverse our git directory
--- a/.github/actions/get-tag-name/action.yml
+++ b/.github/actions/get-tag-name/action.yml
@@ -0,0 +1,30 @@
+# Expects the calling workflow to define env.BRANCH_NAME.
+name: "Determine tag name"
+description: "Determine the tag name to use for a release"
+outputs:
+  name:
+    description: "The name of the tag"
+    value: ${{ steps.tag.outputs.name }}
+
+runs:
+  using: "composite"
+  steps:
+    - name: Determine tag name
+      id: tag
+      shell: bash
+      env:
+        # Hand the branch name to the script through the environment instead of
+        # expanding it into the script text, so a branch name containing quotes
+        # or $(...) cannot be executed as shell code.
+        TAG_BRANCH_NAME: ${{ env.BRANCH_NAME }}
+      run: |
+        # master      -> b<commit count>
+        # other refs  -> <branch with '/' replaced by '-'>-b<commit count>-<7-char hash>
+        BUILD_NUMBER="$(git rev-list --count HEAD)"
+        SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+        if [[ "${TAG_BRANCH_NAME}" == "master" ]]; then
+          echo "name=b${BUILD_NUMBER}" >> "$GITHUB_OUTPUT"
+        else
+          SAFE_NAME=$(echo "${TAG_BRANCH_NAME}" | tr '/' '-')
+          echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> "$GITHUB_OUTPUT"
+        fi
--- a/.github/actions/windows-setup-cuda/action.yml
+++ b/.github/actions/windows-setup-cuda/action.yml
@@ -0,0 +1,67 @@
+name: "Windows - Setup CUDA Toolkit"
+description: "Setup CUDA Toolkit for Windows"
+inputs:
+  cuda_version:
+    description: "CUDA toolkit version"
+    required: true  # only '11.7' and '12.4' have an install step below; any other value installs nothing
+
+runs:
+  using: "composite"
+  steps:
+    - name: Install Cuda Toolkit 11.7
+      if: ${{ inputs.cuda_version == '11.7' }}
+      shell: pwsh
+      run: |
+          mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
+          choco install unzip -y
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-11.7.99-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-11.7.99-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-11.7.99-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-11.7.4.6-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-11.7.91-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-11.7.91-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-11.7.101-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-11.7.91-archive.zip"
+          unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cudart-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvcc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvrtc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libcublas-windows-x86_64-11.7.4.6-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvtx-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\visual_studio_integration-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvprof-windows-x86_64-11.7.101-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cccl-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
+          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
+          echo "CUDA_PATH_V11_7=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
+
+    - name: Install Cuda Toolkit 12.4
+      if: ${{ inputs.cuda_version == '12.4' }}
+      shell: pwsh
+      run: |
+          mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
+          choco install unzip -y
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.4.127-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-12.4.131-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-12.4.127-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.4.5.8-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-12.4.127-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-12.4.127-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-12.4.127-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-12.4.127-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-12.4.127-archive.zip"
+          unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cudart-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvcc-windows-x86_64-12.4.131-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvrtc-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libcublas-windows-x86_64-12.4.5.8-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvtx-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_profiler_api-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\visual_studio_integration-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvprof-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cccl-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
+          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
+          echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
--- a/.github/actions/windows-setup-curl/action.yml
+++ b/.github/actions/windows-setup-curl/action.yml
@@ -0,0 +1,30 @@
+# Downloads a prebuilt win64-mingw curl package from curl.se, unpacks it under
+# RUNNER_TEMP/libcurl and exposes that directory as the `curl_path` output.
+name: 'Windows - Setup CURL'
+description: 'Composite action, to be reused in other workflow'
+inputs:
+  curl_version:
+    description: 'CURL version'
+    required: false
+    default: '8.6.0_6'
+outputs:
+  curl_path:
+    description: "Path to the downloaded libcurl"
+    value: ${{ steps.get_libcurl.outputs.curl_path }}
+
+runs:
+  using: "composite"
+  steps:
+    - name: libCURL
+      id: get_libcurl
+      shell: powershell
+      env:
+        CURL_VERSION: ${{ inputs.curl_version }}
+      run: |
+        curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-win64-mingw.zip"
+        mkdir $env:RUNNER_TEMP/libcurl
+        # --strip-components=1 drops the archive's top-level directory
+        tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl
+        # Windows PowerShell does not write UTF-8 by default; state the encoding explicitly
+        # when appending to GITHUB_OUTPUT.
+        echo "curl_path=$env:RUNNER_TEMP/libcurl" | Out-File -FilePath $env:GITHUB_OUTPUT -Encoding utf8 -Append
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -45,7 +45,9 @@ build:
            - CMakePresets.json
 examples:
    - changed-files:
-        - any-glob-to-any-file: examples/**
+        - any-glob-to-any-file:
+            - examples/**
+            - tools/**
 devops:
    - changed-files:
        - any-glob-to-any-file:
@@ -70,7 +72,7 @@ android:
 server:
    - changed-files:
        - any-glob-to-any-file:
-            - examples/server/**
+            - tools/server/**
 ggml:
    - changed-files:
        - any-glob-to-any-file:
--- a/.github/workflows/bench.yml.disabled
+++ b/.github/workflows/bench.yml.disabled
@@ -27,10 +27,10 @@ on:
  push:
    branches:
      - master
-    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
+    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'tools/server/*.h*', 'tools/server/*.cpp']
  pull_request_target:
    types: [opened, synchronize, reopened]
-    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
+    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'tools/server/*.h*', 'tools/server/*.cpp']
  schedule:
    -  cron: '04 2 * * *'

@@ -69,7 +69,7 @@ jobs:
      - name: Install python env
        id: pipenv
        run: |
-          cd examples/server/bench
+          cd tools/server/bench
          python3 -m venv venv
          source venv/bin/activate
          pip install -r requirements.txt
@@ -79,7 +79,7 @@ jobs:
        run: |
          wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
          tar xzf prometheus*.tar.gz --strip-components=1
-          ./prometheus --config.file=examples/server/bench/prometheus.yml &
+          ./prometheus --config.file=tools/server/bench/prometheus.yml &
          while ! nc -z localhost 9090; do
            sleep 0.1
          done
@@ -92,7 +92,7 @@ jobs:
      - name: Install k6 and xk6-sse
        id: k6_installation
        run: |
-          cd examples/server/bench
+          cd tools/server/bench
          go install go.k6.io/xk6/cmd/xk6@latest
          xk6 build master \
              --with github.com/phymbert/xk6-sse
@@ -104,7 +104,6 @@ jobs:
          cmake -B build \
              -DGGML_NATIVE=OFF \
              -DLLAMA_BUILD_SERVER=ON \
-              -DLLAMA_CURL=ON \
              -DLLAMA_CUBLAS=ON \
              -DCUDAToolkit_ROOT=/usr/local/cuda \
              -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
@@ -117,7 +116,7 @@ jobs:
      - name: Download the dataset
        id: download_dataset
        run: |
-          cd examples/server/bench
+          cd tools/server/bench
          wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

      - name: Server bench
@@ -127,7 +126,7 @@ jobs:
        run: |
          set -eux

-          cd examples/server/bench
+          cd tools/server/bench
          source venv/bin/activate
          python bench.py \
              --runner-label ${{ env.RUNNER_LABEL }} \
@@ -158,9 +157,9 @@ jobs:
          name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          compression-level: 9
          path: |
-            examples/server/bench/*.jpg
-            examples/server/bench/*.json
-            examples/server/bench/*.log
+            tools/server/bench/*.jpg
+            tools/server/bench/*.json
+            tools/server/bench/*.log

      - name: Commit status
        uses: Sibz/github-status-action@v1
@@ -179,17 +178,17 @@ jobs:
        with:
          client_id: ${{secrets.IMGUR_CLIENT_ID}}
          path: |
-            examples/server/bench/prompt_tokens_seconds.jpg
-            examples/server/bench/predicted_tokens_seconds.jpg
-            examples/server/bench/kv_cache_usage_ratio.jpg
-            examples/server/bench/requests_processing.jpg
+            tools/server/bench/prompt_tokens_seconds.jpg
+            tools/server/bench/predicted_tokens_seconds.jpg
+            tools/server/bench/kv_cache_usage_ratio.jpg
+            tools/server/bench/requests_processing.jpg

      - name: Extract mermaid
        id: set_mermaid
        run: |
          set -eux

-          cd examples/server/bench
+          cd tools/server/bench
          PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid)
          echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
          echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV
--- a/.github/workflows/build-linux-cross.yml
+++ b/.github/workflows/build-linux-cross.yml
@@ -0,0 +1,142 @@
+name: Build on Linux using cross-compiler
+on:
+  workflow_dispatch:
+  workflow_call:
+
+jobs:
+  ubuntu-24-riscv64-cpu-cross:
+    runs-on: ubuntu-24.04
+
+    steps:
+      - uses: actions/checkout@v4
+      - name: Setup Riscv
+        run: |
+          sudo dpkg --add-architecture riscv64
+
+          # Add arch-specific repositories for non-amd64 architectures
+          cat << EOF | sudo tee /etc/apt/sources.list.d/riscv64-ports.list
+          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
+          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
+          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
+          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
+          EOF
+
+          sudo apt-get update || true    ;# Prevent failure due to missing URLs.
+
+          sudo apt-get install -y --no-install-recommends \
+                  build-essential \
+                  gcc-14-riscv64-linux-gnu \
+                  g++-14-riscv64-linux-gnu \
+                  libcurl4-openssl-dev:riscv64
+
+      - name: Build
+        run: |
+          cmake -B build -DCMAKE_BUILD_TYPE=Release \
+                         -DGGML_OPENMP=OFF \
+                         -DLLAMA_BUILD_EXAMPLES=ON \
+                         -DLLAMA_BUILD_TOOLS=ON \
+                         -DLLAMA_BUILD_TESTS=OFF \
+                         -DCMAKE_SYSTEM_NAME=Linux \
+                         -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
+                         -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
+                         -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
+                         -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+                         -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
+                         -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
+                         -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
+                         -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
+
+          cmake --build build --config Release -j $(nproc)
+
+  ubuntu-24-riscv64-vulkan-cross:
+    runs-on: ubuntu-24.04
+
+    steps:
+      - uses: actions/checkout@v4
+      - name: Setup Riscv
+        run: |
+          sudo dpkg --add-architecture riscv64
+
+          # Add arch-specific repositories for non-amd64 architectures
+          cat << EOF | sudo tee /etc/apt/sources.list.d/riscv64-ports.list
+          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
+          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
+          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
+          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
+          EOF
+
+          sudo apt-get update || true    ;# Prevent failure due to missing URLs.
+
+          sudo apt-get install -y --no-install-recommends \
+                  build-essential \
+                  glslc \
+                  gcc-14-riscv64-linux-gnu \
+                  g++-14-riscv64-linux-gnu \
+                  libvulkan-dev:riscv64 \
+                  libcurl4-openssl-dev:riscv64
+
+      - name: Build
+        run: |
+          cmake -B build -DCMAKE_BUILD_TYPE=Release \
+                         -DGGML_VULKAN=ON \
+                         -DGGML_OPENMP=OFF \
+                         -DLLAMA_BUILD_EXAMPLES=ON \
+                         -DLLAMA_BUILD_TOOLS=ON \
+                         -DLLAMA_BUILD_TESTS=OFF \
+                         -DCMAKE_SYSTEM_NAME=Linux \
+                         -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
+                         -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
+                         -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
+                         -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+                         -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
+                         -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
+                         -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
+                         -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
+
+          cmake --build build --config Release -j $(nproc)
+
+  ubuntu-24-arm64-vulkan-cross:
+    runs-on: ubuntu-24.04
+
+    steps:
+      - uses: actions/checkout@v4
+      - name: Setup Arm64
+        run: |
+          sudo dpkg --add-architecture arm64
+
+          # Add arch-specific repositories for non-amd64 architectures
+          cat << EOF | sudo tee /etc/apt/sources.list.d/arm64-ports.list
+          deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
+          deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
+          deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
+          deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
+          EOF
+
+          sudo apt-get update || true    ;# Prevent failure due to missing URLs.
+
+          sudo apt-get install -y --no-install-recommends \
+                  build-essential \
+                  glslc \
+                  crossbuild-essential-arm64 \
+                  libvulkan-dev:arm64 \
+                  libcurl4-openssl-dev:arm64
+
+      - name: Build
+        run: |
+          cmake -B build -DCMAKE_BUILD_TYPE=Release \
+                         -DGGML_VULKAN=ON \
+                         -DGGML_OPENMP=OFF \
+                         -DLLAMA_BUILD_EXAMPLES=ON \
+                         -DLLAMA_BUILD_TOOLS=ON \
+                         -DLLAMA_BUILD_TESTS=OFF \
+                         -DCMAKE_SYSTEM_NAME=Linux \
+                         -DCMAKE_SYSTEM_PROCESSOR=aarch64 \
+                         -DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc \
+                         -DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++ \
+                         -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+                         -DCMAKE_FIND_ROOT_PATH=/usr/lib/aarch64-linux-gnu \
+                         -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
+                         -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
+                         -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
+
+          cmake --build build --config Release -j $(nproc)
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -36,13 +36,17 @@ jobs:
      matrix:
        config:
          # Multi-stage build
-          - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: false}
-          - { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
-          - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
-          - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
-          - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
+          # Note: the arm64 images are failing, which prevents the amd64 images from being built
+          # https://github.com/ggml-org/llama.cpp/issues/11888
+          #- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: false }
+          - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
+          - { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
+          - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
+          # Note: the intel images are failing due to an out of disk space error
+          # - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
+          - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
          # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
-          #- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: true }
+          #- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: true }
    steps:
      - name: Check out the repo
        uses: actions/checkout@v4
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -0,0 +1,709 @@
+name: Create Release
+
+on:
+  workflow_dispatch: # allows manual triggering
+    inputs:
+      create_release:
+        description: 'Create new release'
+        required: true
+        type: boolean
+  push: # only pushes to master that touch this workflow or build-relevant sources
+    branches:
+      - master
+    paths: ['.github/workflows/release.yml', '**/CMakeLists.txt', '**/*.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} # keyed by github.ref when head_ref is set, otherwise unique per run (run_id)
+  cancel-in-progress: true # a newer run in the same group cancels the one still in progress
+
+env:
+  BRANCH_NAME: ${{ github.head_ref || github.ref_name }} # head_ref when set, otherwise ref_name
+  CMAKE_ARGS: "-DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=ON -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON" # common configure flags; build steps below append them via env.CMAKE_ARGS
+
+jobs:
+  macOS-arm64:
+    runs-on: macos-14
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: macOS-latest-cmake-arm64
+          evict-old-files: 1d
+
+      - name: Dependencies
+        id: depends
+        continue-on-error: true
+        run: |
+          brew update
+          brew install curl
+
+      - name: Build
+        id: cmake_build
+        run: |
+          sysctl -a
+          cmake -B build \
+            -DCMAKE_BUILD_RPATH="@loader_path" \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DGGML_METAL_USE_BF16=ON \
+            -DGGML_METAL_EMBED_LIBRARY=ON \
+            -DGGML_RPC=ON \
+            ${{ env.CMAKE_ARGS }}
+          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
+
+      - name: Determine tag name
+        id: tag
+        uses: ./.github/actions/get-tag-name
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        run: |
+          cp LICENSE ./build/bin/
+          zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*
+
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
+          name: llama-bin-macos-arm64.zip
+
+  macOS-x64:
+    runs-on: macos-13
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: macOS-latest-cmake-x64
+          evict-old-files: 1d
+
+      - name: Dependencies
+        id: depends
+        continue-on-error: true
+        run: |
+          brew update
+          brew install curl
+
+      - name: Build # configure with the shared release flags (env.CMAKE_ARGS already carries -DGGML_RPC=ON)
+        id: cmake_build
+        run: |
+          sysctl -a
+          # Metal is disabled due to intermittent failures with Github runners not having a GPU:
+          # https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
+          cmake -B build \
+            -DCMAKE_BUILD_RPATH="@loader_path" \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DGGML_METAL=OFF \
+            ${{ env.CMAKE_ARGS }}
+          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
+
+      - name: Determine tag name
+        id: tag
+        uses: ./.github/actions/get-tag-name
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        run: |
+          cp LICENSE ./build/bin/
+          zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
+
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
+          name: llama-bin-macos-x64.zip
+
+  ubuntu-22-cpu:
+    strategy:
+      matrix:
+        include:
+          - build: 'x64'
+            os: ubuntu-22.04
+          - build: 'arm64'
+            os: ubuntu-22.04-arm
+
+    runs-on: ${{ matrix.os }}
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: ccache # compiler cache, one per matrix leg
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: ubuntu-cpu-cmake-${{ matrix.build }} # x64 and arm64 legs must not share (and evict) one cache
+          evict-old-files: 1d
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential libcurl4-openssl-dev
+
+      - name: Build
+        id: cmake_build
+        run: |
+          cmake -B build \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            ${{ env.CMAKE_ARGS }}
+          cmake --build build --config Release -j $(nproc)
+
+      - name: Determine tag name
+        id: tag
+        uses: ./.github/actions/get-tag-name
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        run: |
+          cp LICENSE ./build/bin/
+          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/*
+
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip
+          name: llama-bin-ubuntu-${{ matrix.build }}.zip
+
+  ubuntu-22-vulkan:
+    runs-on: ubuntu-22.04
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: ubuntu-22-cmake-vulkan
+          evict-old-files: 1d
+
+      - name: Dependencies
+        id: depends
+        run: |
+          wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
+          sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
+          sudo apt-get update -y
+          sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libcurl4-openssl-dev
+
+      - name: Build
+        id: cmake_build
+        run: |
+          cmake -B build \
+            -DGGML_VULKAN=ON \
+            ${{ env.CMAKE_ARGS }}
+          cmake --build build --config Release -j $(nproc)
+
+      - name: Determine tag name
+        id: tag
+        uses: ./.github/actions/get-tag-name
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        run: |
+          cp LICENSE ./build/bin/
+          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip ./build/bin/*
+
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip
+          name: llama-bin-ubuntu-vulkan-x64.zip
+
+  windows:
+    runs-on: windows-latest
+
+    env:
+      OPENBLAS_VERSION: 0.3.23
+      VULKAN_VERSION: 1.4.309.0
+
+    strategy:
+      matrix:
+        include:
+          - build: 'cpu-x64'
+            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF'
+          #- build: 'openblas-x64'
+          #  defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
+          - build: 'vulkan-x64'
+            defines: '-DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
+          - build: 'cpu-arm64'
+            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF'
+          - build: 'opencl-adreno-arm64'
+            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: windows-latest-cmake-${{ matrix.build }}
+          variant: ccache
+          evict-old-files: 1d
+
+      - name: Download OpenBLAS
+        id: get_openblas
+        if: ${{ matrix.build == 'openblas-x64' }}
+        run: |
+          curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip"
+          curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE"
+          mkdir $env:RUNNER_TEMP/openblas
+          tar.exe -xvf $env:RUNNER_TEMP/openblas.zip -C $env:RUNNER_TEMP/openblas
+          $vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath)
+          $msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim()))
+          $lib =  $(join-path $msvc 'bin\Hostx64\x64\lib.exe')
+          & $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll
+
+      - name: Install Vulkan SDK
+        id: get_vulkan
+        if: ${{ matrix.build == 'vulkan-x64' }}
+        run: |
+          curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
+          & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
+          Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
+          Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
+
+      - name: Install Ninja
+        id: install_ninja
+        run: |
+          choco install ninja
+
+      - name: Install OpenCL Headers and Libs
+        id: install_opencl
+        if: ${{ matrix.build == 'opencl-adreno-arm64' }}
+        run: |
+          git clone https://github.com/KhronosGroup/OpenCL-Headers
+          cd OpenCL-Headers
+          cmake -B build `
+            -DBUILD_TESTING=OFF `
+            -DOPENCL_HEADERS_BUILD_TESTING=OFF `
+            -DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
+            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
+          cmake --build build --target install
+          git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
+          cd OpenCL-ICD-Loader
+          cmake -B build-arm64-release `
+            -A arm64 `
+            -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" `
+            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
+          cmake --build build-arm64-release --target install --config release
+
+      - name: libCURL
+        id: get_libcurl
+        uses: ./.github/actions/windows-setup-curl
+
+      - name: Build
+        id: cmake_build
+        env:
+          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
+        run: |
+          cmake -S . -B build ${{ matrix.defines }} `
+            -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" `
+            ${{ env.CMAKE_ARGS }}
+          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
+
+      - name: Add libopenblas.dll
+        id: add_libopenblas_dll
+        if: ${{ matrix.build == 'openblas-x64' }}
+        run: |
+          cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll
+          cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt
+
+      - name: Determine tag name
+        id: tag
+        uses: ./.github/actions/get-tag-name
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        env:
+          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
+        run: |
+          Copy-Item $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll
+          7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*
+
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
+          name: llama-bin-win-${{ matrix.build }}.zip
+
+  windows-cuda:
+    runs-on: windows-2019
+
+    strategy:
+      matrix:
+        cuda: ['12.4', '11.7']
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+        with:
+            fetch-depth: 0
+
+      - name: Install ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: windows-cuda-${{ matrix.cuda }}
+          variant: ccache
+          evict-old-files: 1d
+
+      - name: Install Cuda Toolkit
+        uses: ./.github/actions/windows-setup-cuda
+        with:
+          cuda_version: ${{ matrix.cuda }}
+
+      - name: Install Ninja
+        id: install_ninja
+        run: |
+          choco install ninja
+
+      - name: libCURL
+        id: get_libcurl
+        uses: ./.github/actions/windows-setup-curl
+
+      - name: Build
+        id: cmake_build
+        shell: cmd
+        env:
+          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
+        run: |
+          call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
+          cmake -S . -B build -G "Ninja Multi-Config" ^
+            -DGGML_NATIVE=OFF ^
+            -DGGML_BACKEND_DL=ON ^
+            -DGGML_CPU_ALL_VARIANTS=ON ^
+            -DGGML_CUDA=ON ^
+            -DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include" ^
+            ${{ env.CMAKE_ARGS }}
+          set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
+          cmake --build build --config Release -j %NINJA_JOBS% -t ggml
+          cmake --build build --config Release
+
+      - name: Determine tag name
+        id: tag
+        uses: ./.github/actions/get-tag-name
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        env:
+          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
+        run: |
+          cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll
+          7z a llama-${{ steps.tag.outputs.name }}-bin-win-cuda${{ matrix.cuda }}-x64.zip .\build\bin\Release\*
+
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-${{ steps.tag.outputs.name }}-bin-win-cuda${{ matrix.cuda }}-x64.zip
+          name: llama-bin-win-cuda${{ matrix.cuda }}-x64.zip
+
+      - name: Copy and pack Cuda runtime
+        run: |
+          echo "Cuda install location: ${{ env.CUDA_PATH }}"
+          $dst='.\build\bin\cudart\'
+          robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
+          robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
+          7z a cudart-llama-bin-win-cuda${{ matrix.cuda }}-x64.zip $dst\*
+
+      - name: Upload Cuda runtime
+        uses: actions/upload-artifact@v4
+        with:
+          path: cudart-llama-bin-win-cuda${{ matrix.cuda }}-x64.zip
+          name: cudart-llama-bin-win-cuda${{ matrix.cuda }}-x64.zip
+
+  windows-sycl:
+    runs-on: windows-latest
+
+    defaults:
+      run:
+        shell: bash
+
+    env:
+      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b380d914-366b-4b77-a74a-05e3c38b3514/intel-oneapi-base-toolkit-2025.0.0.882_offline.exe
+      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
+      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: windows-latest-cmake-sycl
+          variant: ccache
+          evict-old-files: 1d
+
+      - name: Install
+        run:  |
+          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
+
+      # TODO: add libcurl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
+
+      - name: Build
+        id: cmake_build
+        run:  examples/sycl/win-build-sycl.bat
+
+      - name: Determine tag name
+        id: tag
+        uses: ./.github/actions/get-tag-name
+
+      - name: Build the release package
+        id: pack_artifacts
+        run: |
+          echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
+
+          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
+
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
+
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
+
+          cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
+
+          echo "cp oneAPI running time dll files to ./build/bin done"
+          7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
+
+      - name: Upload the release package
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
+          name: llama-bin-win-sycl-x64.zip
+
+  windows-hip:
+    runs-on: windows-latest
+
+    strategy:
+      matrix:
+        gpu_target: [gfx1100, gfx1101, gfx1030]
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+        with:
+            fetch-depth: 0
+
+      - name: Clone rocWMMA repository
+        id: clone_rocwmma
+        run: |
+          git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1
+
+      - name: ccache # compiler cache, one per GPU target
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: windows-latest-cmake-hip-release-${{ matrix.gpu_target }} # keep each gpu_target leg's cache separate, as windows-cuda does per CUDA version
+          evict-old-files: 1d
+
+      - name: Install
+        id: depends
+        run: |
+          $ErrorActionPreference = "Stop"
+          write-host "Downloading AMD HIP SDK Installer"
+          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
+          write-host "Installing AMD HIP SDK"
+          Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
+          write-host "Completed AMD HIP SDK installation"
+
+      - name: Verify ROCm
+        id: verify
+        run: |
+          & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
+
+      - name: libCURL
+        id: get_libcurl
+        uses: ./.github/actions/windows-setup-curl
+
+      - name: Build
+        id: cmake_build
+        env:
+          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
+        run: |
+          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
+          $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
+          cmake -G "Unix Makefiles" -B build -S . `
+            -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
+            -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
+            -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/" `
+            -DCMAKE_BUILD_TYPE=Release `
+            -DAMDGPU_TARGETS=${{ matrix.gpu_target }} `
+            -DGGML_HIP_ROCWMMA_FATTN=ON `
+            -DGGML_HIP=ON `
+            -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" `
+            ${{ env.CMAKE_ARGS }}
+          cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
+          md "build\bin\rocblas\library\"
+          cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
+          cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
+          cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
+
+      - name: Determine tag name
+        id: tag
+        uses: ./.github/actions/get-tag-name
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        env:
+          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
+        run: |
+          cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\libcurl-x64.dll
+          7z a llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\*
+
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
+          name: llama-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
+
+  ios-xcode-build:
+    runs-on: macos-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Build
+        id: cmake_build
+        run: |
+          sysctl -a
+          cmake -B build -G Xcode \
+            -DGGML_METAL_USE_BF16=ON \
+            -DGGML_METAL_EMBED_LIBRARY=ON \
+            -DLLAMA_CURL=OFF \
+            -DLLAMA_BUILD_EXAMPLES=OFF \
+            -DLLAMA_BUILD_TOOLS=OFF \
+            -DLLAMA_BUILD_TESTS=OFF \
+            -DLLAMA_BUILD_SERVER=OFF \
+            -DCMAKE_SYSTEM_NAME=iOS \
+            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
+            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
+          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
+
+      - name: xcodebuild for swift package
+        id: xcodebuild
+        run: |
+          ./build-xcframework.sh
+
+      - name: Build Xcode project
+        run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build
+
+      - name: Determine tag name
+        id: tag
+        uses: ./.github/actions/get-tag-name
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        run: |
+          zip --symlinks -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
+
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-${{ steps.tag.outputs.name }}-xcframework.zip
+          name: llama-${{ steps.tag.outputs.name }}-xcframework
+
+  release:
+    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+
+    # Fine-grained permissions
+    # https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
+    permissions:
+        contents: write # for creating release
+
+    runs-on: ubuntu-latest
+
+    needs:
+      - ubuntu-22-cpu
+      - ubuntu-22-vulkan
+      - windows
+      - windows-cuda
+      - windows-sycl
+      - windows-hip
+      - macOS-arm64
+      - macOS-x64
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Determine tag name
+        id: tag
+        uses: ./.github/actions/get-tag-name
+
+      - name: Download artifacts
+        id: download-artifact
+        uses: actions/download-artifact@v4
+        with:
+          path: ./artifact
+
+      - name: Move artifacts
+        id: move_artifacts
+        run: mkdir -p ./artifact/release && mv ./artifact/*/*.zip ./artifact/release
+
+      - name: Create release
+        id: create_release
+        uses: ggml-org/action-create-release@v1
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          tag_name: ${{ steps.tag.outputs.name }}
+
+      - name: Upload release # attach every .zip in ./artifact/release to the release created by the previous step
+        id: upload_release
+        uses: actions/github-script@v3
+        with:
+          github-token: ${{secrets.GITHUB_TOKEN}}
+          script: |
+            const path = require('path');
+            const fs = require('fs');
+            const release_id = '${{ steps.create_release.outputs.id }}';
+            for (const file of fs.readdirSync('./artifact/release')) { // synchronous call: nothing to await
+              if (path.extname(file) === '.zip') {
+                console.log('uploadReleaseAsset', file);
+                await github.repos.uploadReleaseAsset({ // uploads run one at a time, in directory order
+                  owner: context.repo.owner,
+                  repo: context.repo.repo,
+                  release_id: release_id,
+                  name: file,
+                  data: fs.readFileSync(`./artifact/release/${file}`) // whole file read into a Buffer
+                });
+              }
+            }
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -15,10 +15,10 @@ on:
  push:
    branches:
      - master
-    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
+    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*']
  pull_request:
    types: [opened, synchronize, reopened]
-    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
+    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*']

 env:
  LLAMA_LOG_COLORS: 1
@@ -74,7 +74,7 @@ jobs:
      - name: Tests dependencies
        id: test_dependencies
        run: |
-          pip install -r examples/server/tests/requirements.txt
+          pip install -r tools/server/tests/requirements.txt

      # Setup nodejs (to be used for verifying bundled index.html)
      - uses: actions/setup-node@v4
@@ -84,14 +84,14 @@ jobs:
      - name: WebUI - Install dependencies
        id: webui_lint
        run: |
-          cd examples/server/webui
+          cd tools/server/webui
          npm ci

      - name: WebUI - Check code format
        id: webui_format
        run: |
          git config --global --add safe.directory $(realpath .)
-          cd examples/server/webui
+          cd tools/server/webui
          git status

          npm run format
@@ -108,7 +108,7 @@ jobs:
        id: verify_server_index_html
        run: |
          git config --global --add safe.directory $(realpath .)
-          cd examples/server/webui
+          cd tools/server/webui
          git status

          npm run build
@@ -129,7 +129,6 @@ jobs:
          cmake -B build \
              -DGGML_NATIVE=OFF \
              -DLLAMA_BUILD_SERVER=ON \
-              -DLLAMA_CURL=ON \
              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
              -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
              -DGGML_OPENMP=OFF ;
@@ -142,7 +141,6 @@ jobs:
          cmake -B build \
              -DGGML_NATIVE=OFF \
              -DLLAMA_BUILD_SERVER=ON \
-              -DLLAMA_CURL=ON \
              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
              -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
@@ -154,7 +152,6 @@ jobs:
          cmake -B build \
              -DGGML_NATIVE=OFF \
              -DLLAMA_BUILD_SERVER=ON \
-              -DLLAMA_CURL=ON \
              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server

@@ -164,21 +161,21 @@ jobs:
        env:
          GITHUB_ACTIONS: "true"
        run: |
-          cd examples/server/tests
+          cd tools/server/tests
          ./tests.sh

      - name: Tests (sanitizers)
        id: server_integration_tests_sanitizers
        if: ${{ matrix.sanitizer != '' }}
        run: |
-          cd examples/server/tests
+          cd tools/server/tests
          LLAMA_SANITIZE=1 ./tests.sh

      - name: Slow tests
        id: server_integration_tests_slow
        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
        run: |
-          cd examples/server/tests
+          cd tools/server/tests
          SLOW_TESTS=1 ./tests.sh


@@ -195,17 +192,14 @@ jobs:

      - name: libCURL
        id: get_libcurl
-        env:
-          CURL_VERSION: 8.6.0_6
-        run: |
-          curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-win64-mingw.zip"
-          mkdir $env:RUNNER_TEMP/libcurl
-          tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl
+        uses: ./.github/actions/windows-setup-curl

      - name: Build
        id: cmake_build
+        env:
+          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
        run: |
-          cmake -B build -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include"
+          cmake -B build -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server

      - name: Python setup
@@ -217,18 +211,20 @@ jobs:
      - name: Tests dependencies
        id: test_dependencies
        run: |
-          pip install -r examples/server/tests/requirements.txt
+          pip install -r tools/server/tests/requirements.txt

      - name: Copy Libcurl
        id: prepare_libcurl
+        env:
+          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
        run: |
-          cp $env:RUNNER_TEMP/libcurl/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll
+          cp $env:CURL_PATH/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll

      - name: Tests
        id: server_integration_tests
        if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
        run: |
-          cd examples/server/tests
+          cd tools/server/tests
          $env:PYTHONIOENCODING = ":replace"
          pytest -v -x -m "not slow"

@@ -236,6 +232,6 @@ jobs:
        id: server_integration_tests_slow
        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
        run: |
-          cd examples/server/tests
+          cd tools/server/tests
          $env:SLOW_TESTS = "1"
          pytest -v -x
--- a/.gitignore
+++ b/.gitignore
@@ -96,11 +96,11 @@ perf-*.txt
 # Examples

 examples/jeopardy/results.txt
-examples/server/*.css.hpp
-examples/server/*.html.hpp
-examples/server/*.js.hpp
-examples/server/*.mjs.hpp
-examples/server/*.gz.hpp
+tools/server/*.css.hpp
+tools/server/*.html.hpp
+tools/server/*.js.hpp
+tools/server/*.mjs.hpp
+tools/server/*.gz.hpp
 !build_64.sh
 !examples/*.bat
 !examples/*/*.kts
@@ -110,7 +110,7 @@ examples/server/*.gz.hpp

 # Server Web UI temporary files
 node_modules
-examples/server/webui/dist
+tools/server/webui/dist

 # Python

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -77,11 +77,12 @@ option(LLAMA_BUILD_COMMON "llama: build common utils library" ${LLAMA_STANDALONE

 # extra artifacts
 option(LLAMA_BUILD_TESTS    "llama: build tests"          ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_TOOLS    "llama: build tools"          ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples"       ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_SERVER   "llama: build server example" ${LLAMA_STANDALONE})

 # 3rd party libs
-option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
+option(LLAMA_CURL       "llama: use libcurl to download model from an URL" ON)
 option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF)

 # Required for relocatable CMake package
@@ -168,6 +169,11 @@ add_subdirectory(src)
 # utils, programs, examples and tests
 #

+if (NOT LLAMA_BUILD_COMMON)
+    message(STATUS "LLAMA_BUILD_COMMON is OFF, disabling LLAMA_CURL")
+    set(LLAMA_CURL OFF)
+endif()
+
 if (LLAMA_BUILD_COMMON)
    add_subdirectory(common)
 endif()
@@ -182,6 +188,10 @@ if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES)
    add_subdirectory(pocs)
 endif()

+if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TOOLS)
+    add_subdirectory(tools)
+endif()
+
 #
 # install
 #
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -38,15 +38,6 @@
        }
    },

-    {
-        "name": "arm64-windows-msvc", "hidden": true,
-        "architecture": { "value": "arm64",    "strategy": "external" },
-        "toolset":      { "value": "host=x64", "strategy": "external" },
-        "cacheVariables": {
-            "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake"
-        }
-    },
-
    {
        "name": "arm64-windows-llvm", "hidden": true,
        "architecture": { "value": "arm64",    "strategy": "external" },
@@ -73,10 +64,6 @@
    { "name": "arm64-apple-clang-release", "inherits": [ "base", "arm64-apple-clang", "reldbg" ] },
    { "name": "arm64-apple-clang+static-release", "inherits": [ "base", "arm64-apple-clang",  "reldbg", "static" ] },

-    { "name": "arm64-windows-msvc-debug", "inherits": [ "base", "arm64-windows-msvc",  "debug"   ] },
-    { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc",  "reldbg" ] },
-    { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc",  "reldbg", "static" ] },
-
    { "name": "x64-windows-llvm-debug", "inherits": [ "base", "x64-windows-llvm", "debug" ] },
    { "name": "x64-windows-llvm-release", "inherits": [ "base", "x64-windows-llvm", "release" ] },
    { "name": "x64-windows-llvm-reldbg", "inherits": [ "base", "x64-windows-llvm", "reldbg" ] },
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -2,7 +2,7 @@

 /ci/ @ggerganov
 /.devops/*.Dockerfile @ngxson
-/examples/server/ @ngxson
+/tools/server/ @ngxson
 /ggml/src/ggml-cuda/fattn* @JohannesGaessler
 /ggml/src/ggml-cuda/mmq.* @JohannesGaessler
 /ggml/src/ggml-cuda/mmv.* @JohannesGaessler
--- a/Makefile
+++ b/Makefile
@@ -780,10 +780,6 @@ ifdef GGML_HIP

 	MK_CPPFLAGS += -DGGML_USE_HIP -DGGML_USE_CUDA

-ifdef GGML_HIP_UMA
-	MK_CPPFLAGS += -DGGML_HIP_UMA
-endif # GGML_HIP_UMA
-
 	MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
 	MK_LDFLAGS += -L$(ROCM_PATH)/lib64 -Wl,-rpath=$(ROCM_PATH)/lib64
 	MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas
@@ -1160,10 +1156,10 @@ $(LIB_COMMON_S): $(OBJ_COMMON)

 # Clean generated server assets
 clean-server-assets:
-	find examples/server -type f -name "*.js.hpp"   -delete
-	find examples/server -type f -name "*.mjs.hpp"  -delete
-	find examples/server -type f -name "*.css.hpp"  -delete
-	find examples/server -type f -name "*.html.hpp" -delete
+	find tools/server -type f -name "*.js.hpp"   -delete
+	find tools/server -type f -name "*.mjs.hpp"  -delete
+	find tools/server -type f -name "*.css.hpp"  -delete
+	find tools/server -type f -name "*.html.hpp" -delete

 # Clean rule
 clean: clean-server-assets
@@ -1183,7 +1179,7 @@ clean: clean-server-assets
 # Helper function that replaces .c, .cpp, and .cu file endings with .o:
 GET_OBJ_FILE = $(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(patsubst %.cu,%.o,$(1))))

-llama-cli: examples/main/main.cpp \
+llama-cli: tools/main/main.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -1191,12 +1187,7 @@ llama-cli: examples/main/main.cpp \
 	@echo '====  Run ./llama-cli -h for help.  ===='
 	@echo

-llama-infill: examples/infill/infill.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-run: examples/run/run.cpp \
+llama-run: tools/run/run.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -1211,7 +1202,7 @@ llama-simple-chat: examples/simple-chat/simple-chat.cpp \
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-llama-tokenize: examples/tokenize/tokenize.cpp \
+llama-tokenize: tools/tokenize/tokenize.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -1221,27 +1212,27 @@ llama-batched: examples/batched/batched.cpp \
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-llama-batched-bench: examples/batched-bench/batched-bench.cpp \
+llama-batched-bench: tools/batched-bench/batched-bench.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-llama-quantize: examples/quantize/quantize.cpp \
+llama-quantize: tools/quantize/quantize.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-llama-quantize-stats: examples/quantize-stats/quantize-stats.cpp \
+llama-quantize-stats: tools/quantize-stats/quantize-stats.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-llama-perplexity: examples/perplexity/perplexity.cpp \
+llama-perplexity: tools/perplexity/perplexity.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-llama-imatrix: examples/imatrix/imatrix.cpp \
+llama-imatrix: tools/imatrix/imatrix.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -1283,7 +1274,7 @@ llama-gguf-hash: examples/gguf-hash/gguf-hash.cpp examples/gguf-hash/deps/sha1/s
 	$(CXX) $(CXXFLAGS) -Iexamples/gguf-hash/deps -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-llama-gguf-split: examples/gguf-split/gguf-split.cpp \
+llama-gguf-split: tools/gguf-split/gguf-split.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -1293,7 +1284,7 @@ llama-eval-callback: examples/eval-callback/eval-callback.cpp \
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp \
+llama-cvector-generator: tools/cvector-generator/cvector-generator.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -1303,12 +1294,12 @@ llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-llama-bench: examples/llama-bench/llama-bench.cpp \
+llama-bench: tools/llama-bench/llama-bench.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-llama-export-lora: examples/export-lora/export-lora.cpp \
+llama-export-lora: tools/export-lora/export-lora.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -1364,17 +1355,17 @@ llama-gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp \
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

 ifdef GGML_RPC
-rpc-server: examples/rpc/rpc-server.cpp \
+rpc-server: tools/rpc/rpc-server.cpp \
 	$(OBJ_GGML)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 endif # GGML_RPC

 llama-server: \
-	examples/server/server.cpp \
-	examples/server/utils.hpp \
-	examples/server/httplib.h \
-	examples/server/index.html.hpp \
-	examples/server/loading.html.hpp \
+	tools/server/server.cpp \
+	tools/server/utils.hpp \
+	tools/server/httplib.h \
+	tools/server/index.html.hpp \
+	tools/server/loading.html.hpp \
 	common/chat.cpp \
 	common/chat.h \
 	common/chat-template.hpp \
@@ -1382,10 +1373,10 @@ llama-server: \
 	common/minja.hpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Itools/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)

-# Portable equivalent of `cd examples/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`:
-examples/server/%.hpp: examples/server/public/% FORCE Makefile
+# Portable equivalent of `cd tools/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`:
+tools/server/%.hpp: tools/server/public/% FORCE Makefile
 	@( export NAME=$(subst .,_,$(subst -,_,$(notdir $<))) && \
 		echo "unsigned char $${NAME}[] = {" && \
 		cat $< | od -v -t x1 -An | sed -E 's/([0-9a-fA-F]+)/0x\1, /g' && \
@@ -1398,36 +1389,36 @@ llama-gen-docs: examples/gen-docs/gen-docs.cpp \
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-libllava.a: examples/llava/llava.cpp \
-	examples/llava/llava.h \
-	examples/llava/clip.cpp \
-	examples/llava/clip.h \
+libllava.a: tools/mtmd/llava.cpp \
+	tools/mtmd/llava.h \
+	tools/mtmd/clip.cpp \
+	tools/mtmd/clip.h \
 	common/stb_image.h \
 	common/base64.hpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual

-llama-llava-cli: examples/llava/llava-cli.cpp \
-	examples/llava/llava.cpp \
-	examples/llava/llava.h \
-	examples/llava/clip.cpp \
-	examples/llava/clip.h \
+llama-llava-cli: tools/mtmd/llava-cli.cpp \
+	tools/mtmd/llava.cpp \
+	tools/mtmd/llava.h \
+	tools/mtmd/clip.cpp \
+	tools/mtmd/clip.h \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual

-llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \
-	examples/llava/llava.cpp \
-	examples/llava/llava.h \
-	examples/llava/clip.cpp \
-	examples/llava/clip.h \
+llama-minicpmv-cli: tools/mtmd/minicpmv-cli.cpp \
+	tools/mtmd/llava.cpp \
+	tools/mtmd/llava.h \
+	tools/mtmd/clip.cpp \
+	tools/mtmd/clip.h \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual

-llama-qwen2vl-cli: examples/llava/qwen2vl-cli.cpp \
-	examples/llava/llava.cpp \
-	examples/llava/llava.h \
-	examples/llava/clip.cpp \
-	examples/llava/clip.h \
+llama-qwen2vl-cli: tools/mtmd/qwen2vl-cli.cpp \
+	tools/mtmd/llava.cpp \
+	tools/mtmd/llava.h \
+	tools/mtmd/clip.cpp \
+	tools/mtmd/clip.h \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual

@@ -1484,12 +1475,12 @@ tests/test-double-float: tests/test-double-float.cpp

 tests/test-json-schema-to-grammar: tests/test-json-schema-to-grammar.cpp \
 	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) -Itools/server -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

 tests/test-chat: tests/test-chat.cpp \
 	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) -Itools/server -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

 tests/test-opt: tests/test-opt.cpp \
--- a/README.md
+++ b/README.md
@@ -9,13 +9,6 @@

 Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++

-> [!IMPORTANT]
-> New `llama.cpp` package location: [ggml-org/llama.cpp](https://github.com/ggml-org/llama.cpp/pkgs/container/llama.cpp)
->
-> Update your container URLs to: `ghcr.io/ggml-org/llama.cpp`
->
-> More info: https://github.com/ggml-org/llama.cpp/discussions/11801
-
 ## Recent API changes

 - [Changelog for `libllama` API](https://github.com/ggml-org/llama.cpp/issues/9289)
@@ -23,8 +16,9 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

 ## Hot topics

- **How to use [MTLResidencySet](https://developer.apple.com/documentation/metal/mtlresidencyset?language=objc) to keep the GPU memory active?** https://github.com/ggml-org/llama.cpp/pull/11427
- **VS Code extension for FIM completions:** https://github.com/ggml-org/llama.vscode
+- **GGML developer experience survey (organized and reviewed by NVIDIA):** [link](https://forms.gle/Gasw3cRgyhNEnrwK9)
+- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141](https://github.com/ggml-org/llama.cpp/pull/13141)); `libllava` will be deprecated
+- VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
 - Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639
 - Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
 - Introducing GGUF-my-LoRA https://github.com/ggml-org/llama.cpp/discussions/10123
@@ -104,6 +98,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [x] [Flan T5](https://huggingface.co/models?search=flan-t5)
 - [x] [Open Elm models](https://huggingface.co/collections/apple/openelm-instruct-models-6619ad295d7ae9f868b759ca)
 - [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b) + [GLMEdge-1.5b](https://huggingface.co/THUDM/glm-edge-1.5b-chat) + [GLMEdge-4b](https://huggingface.co/THUDM/glm-edge-4b-chat)
+- [x] [GLM-4-0414](https://huggingface.co/collections/THUDM/glm-4-0414-67f3cbcb34dd9d252707cb2e)
 - [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)
 - [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct)
 - [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a)
@@ -112,6 +107,8 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM)
 - [x] [QRWKV-6](https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1)
 - [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct)
+- [x] [Trillion-7B-preview](https://huggingface.co/trillionlabs/Trillion-7B-preview)
+- [x] [Ling models](https://huggingface.co/collections/inclusionAI/ling-67c51c85b34a7ea0aba94c32)

 #### Multimodal

@@ -245,6 +242,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [Vulkan](docs/build.md#vulkan) | GPU |
 | [CANN](docs/build.md#cann) | Ascend NPU |
 | [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
+| [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |

 ## Building the project

@@ -263,7 +261,9 @@ The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](htt
 - [Trending](https://huggingface.co/models?library=gguf&sort=trending)
 - [LLaMA](https://huggingface.co/models?sort=trending&search=llama+gguf)

-You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from Hugging Face by using this CLI argument: `-hf <user>/<model>[:quant]`
+You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from [Hugging Face](https://huggingface.co/) or other model hosting sites, such as [ModelScope](https://modelscope.cn/), by using this CLI argument: `-hf <user>/<model>[:quant]`.
+
+By default, the CLI downloads from Hugging Face; you can switch to another endpoint with the environment variable `MODEL_ENDPOINT`. For example, you may opt to download model checkpoints from ModelScope or other model-sharing communities by setting the environment variable, e.g. `MODEL_ENDPOINT=https://www.modelscope.cn/`.

 After downloading a model, use the CLI tools to run it locally - see below.

@@ -276,9 +276,9 @@ The Hugging Face platform provides a variety of online tools for converting, qua
 - Use the [GGUF-editor space](https://huggingface.co/spaces/CISCai/gguf-editor) to edit GGUF meta data in the browser (more info: https://github.com/ggml-org/llama.cpp/discussions/9268)
 - Use the [Inference Endpoints](https://ui.endpoints.huggingface.co/) to directly host `llama.cpp` in the cloud (more info: https://github.com/ggml-org/llama.cpp/discussions/9669)

-To learn more about model quantization, [read this documentation](examples/quantize/README.md)
+To learn more about model quantization, [read this documentation](tools/quantize/README.md)

-## [`llama-cli`](examples/main)
+## [`llama-cli`](tools/main)

 #### A CLI tool for accessing and experimenting with most of `llama.cpp`'s functionality.

@@ -341,7 +341,7 @@ To learn more about model quantization, [read this documentation](examples/quant
    </details>


-## [`llama-server`](examples/server)
+## [`llama-server`](tools/server)

 #### A lightweight, [OpenAI API](https://github.com/openai/openai-openapi) compatible, HTTP server for serving LLMs.

@@ -411,7 +411,7 @@ To learn more about model quantization, [read this documentation](examples/quant
    </details>


-## [`llama-perplexity`](examples/perplexity)
+## [`llama-perplexity`](tools/perplexity)

 #### A tool for measuring the perplexity [^1][^2] (and other quality metrics) of a model over a given text.

@@ -436,10 +436,10 @@ To learn more about model quantization, [read this documentation](examples/quant

    </details>

-[^1]: [examples/perplexity/README.md](./examples/perplexity/README.md)
+[^1]: [tools/perplexity/README.md](./tools/perplexity/README.md)
 [^2]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity)

-## [`llama-bench`](examples/llama-bench)
+## [`llama-bench`](tools/llama-bench)

 #### Benchmark the performance of the inference for various parameters.

@@ -460,7 +460,7 @@ To learn more about model quantization, [read this documentation](examples/quant

    </details>

-## [`llama-run`](examples/run)
+## [`llama-run`](tools/run)

 #### A comprehensive example for running `llama.cpp` models. Useful for inferencing. Used with RamaLama [^3].

@@ -504,8 +504,8 @@ To learn more about model quantization, [read this documentation](examples/quant

 ## Other documentation

- [main (cli)](examples/main/README.md)
- [server](examples/server/README.md)
+- [main (cli)](tools/main/README.md)
+- [server](tools/server/README.md)
 - [GBNF grammars](grammars/README.md)

 #### Development documentation
@@ -528,6 +528,35 @@ If your issue is with model generation quality, then please at least scan the fo
    - [Aligning language models to follow instructions](https://openai.com/research/instruction-following)
    - [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155)

+## XCFramework
+The XCFramework is a precompiled version of the library for iOS, visionOS, tvOS,
+and macOS. It can be used in Swift projects without the need to compile the
+library from source. For example:
+```swift
+// swift-tools-version: 5.10
+// The swift-tools-version declares the minimum version of Swift required to build this package.
+
+import PackageDescription
+
+let package = Package(
+    name: "MyLlamaPackage",
+    targets: [
+        .executableTarget(
+            name: "MyLlamaPackage",
+            dependencies: [
+                "LlamaFramework"
+            ]),
+        .binaryTarget(
+            name: "LlamaFramework",
+            url: "https://github.com/ggml-org/llama.cpp/releases/download/b5046/llama-b5046-xcframework.zip",
+            checksum: "c19be78b5f00d8d29a25da41042cb7afa094cbf6280a225abe614b03b20029ab"
+        )
+    ]
+)
+```
+The above example is using an intermediate build `b5046` of the library. This can be modified
+to use a different version by changing the URL and checksum.
+
 ## Completions
 Command-line completion is available for some environments.

--- a/SECURITY.md
+++ b/SECURITY.md
@@ -40,7 +40,8 @@ To protect sensitive data from potential leaks or unauthorized access, it is cru
 ### Untrusted environments or networks

 If you can't run your models in a secure and isolated environment or if it must be exposed to an untrusted network, make sure to take the following security precautions:
-* Confirm the hash of any downloaded artifact (e.g. pre-trained model weights) matches a known-good value
+* Do not use the RPC backend, [rpc-server](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) and [llama-server](https://github.com/ggml-org/llama.cpp/tree/master/tools/server) functionality (see https://github.com/ggml-org/llama.cpp/pull/13061).
+* Confirm the hash of any downloaded artifact (e.g. pre-trained model weights) matches a known-good value.
 * Encrypt your data if sending it over the network.

 ### Multi-Tenant environments
--- a/build-xcframework.sh
+++ b/build-xcframework.sh
@@ -8,6 +8,7 @@ TVOS_MIN_OS_VERSION=16.4

 BUILD_SHARED_LIBS=OFF
 LLAMA_BUILD_EXAMPLES=OFF
+LLAMA_BUILD_TOOLS=OFF
 LLAMA_BUILD_TESTS=OFF
 LLAMA_BUILD_SERVER=OFF
 GGML_METAL=ON
@@ -31,6 +32,7 @@ COMMON_CMAKE_ARGS=(
    -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
    -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS}
    -DLLAMA_BUILD_EXAMPLES=${LLAMA_BUILD_EXAMPLES}
+    -DLLAMA_BUILD_TOOLS=${LLAMA_BUILD_TOOLS}
    -DLLAMA_BUILD_TESTS=${LLAMA_BUILD_TESTS}
    -DLLAMA_BUILD_SERVER=${LLAMA_BUILD_SERVER}
    -DGGML_METAL_EMBED_LIBRARY=${GGML_METAL_EMBED_LIBRARY}
@@ -41,6 +43,11 @@ COMMON_CMAKE_ARGS=(
    -DGGML_OPENMP=${GGML_OPENMP}
 )

+XCODE_VERSION=$(xcodebuild -version 2>/dev/null | head -n1 | awk '{ print $2 }')
+MAJOR_VERSION=$(echo $XCODE_VERSION | cut -d. -f1)
+MINOR_VERSION=$(echo $XCODE_VERSION | cut -d. -f2)
+echo "Detected Xcode version: $XCODE_VERSION"
+
 check_required_tool() {
    local tool=$1
    local install_message=$2
@@ -325,21 +332,28 @@ combine_static_libraries() {

    # Platform-specific post-processing for device builds
    if [[ "$is_simulator" == "false" ]]; then
-        if command -v vtool &>/dev/null; then
+        if command -v xcrun vtool &>/dev/null; then
            case "$platform" in
                "ios")
                    echo "Marking binary as a framework binary for iOS..."
-                    vtool -set-build-version ios ${IOS_MIN_OS_VERSION} ${IOS_MIN_OS_VERSION} -replace \
+                    xcrun vtool -set-build-version ios ${IOS_MIN_OS_VERSION} ${IOS_MIN_OS_VERSION} -replace \
                        -output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}"
                    ;;
                "visionos")
                    echo "Marking binary as a framework binary for visionOS..."
-                    vtool -set-build-version xros ${VISIONOS_MIN_OS_VERSION} ${VISIONOS_MIN_OS_VERSION} -replace \
+                    if [[ "$MAJOR_VERSION" -gt 16 ]] || [[ "$MAJOR_VERSION" -eq 16 && "$MINOR_VERSION" -gt 2 ]]; then
+                        echo "Xcode version greater than 16.2, using visionOS."
+                        VISION_OS_BUILD_VERSION="visionos"
+                    else
+                        echo "Xcode version less than or equal to 16.2, using xros."
+                        VISION_OS_BUILD_VERSION="xros"
+                    fi
+                    xcrun vtool -set-build-version ${VISION_OS_BUILD_VERSION} ${VISIONOS_MIN_OS_VERSION} ${VISIONOS_MIN_OS_VERSION} -replace \
                        -output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}"
                    ;;
                "tvos")
                    echo "Marking binary as a framework binary for tvOS..."
-                    vtool -set-build-version tvos ${TVOS_MIN_OS_VERSION} ${TVOS_MIN_OS_VERSION} -replace \
+                    xcrun vtool -set-build-version tvos ${TVOS_MIN_OS_VERSION} ${TVOS_MIN_OS_VERSION} -replace \
                        -output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}"
                    ;;
            esac
@@ -399,6 +413,7 @@ cmake -B build-ios-sim -G Xcode \
    -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphonesimulator \
    -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
+    -DLLAMA_CURL=OFF \
    -S .
 cmake --build build-ios-sim --config Release -- -quiet

@@ -411,6 +426,7 @@ cmake -B build-ios-device -G Xcode \
    -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphoneos \
    -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
+    -DLLAMA_CURL=OFF \
    -S .
 cmake --build build-ios-device --config Release -- -quiet

@@ -421,6 +437,7 @@ cmake -B build-macos -G Xcode \
    -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \
    -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
+    -DLLAMA_CURL=OFF \
    -S .
 cmake --build build-macos --config Release -- -quiet

@@ -434,6 +451,7 @@ cmake -B build-visionos -G Xcode \
    -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xros \
    -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
    -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
+    -DLLAMA_CURL=OFF \
    -S .
 cmake --build build-visionos --config Release -- -quiet

@@ -447,6 +465,7 @@ cmake -B build-visionos-sim -G Xcode \
    -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xrsimulator \
    -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
    -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
+    -DLLAMA_CURL=OFF \
    -S .
 cmake --build build-visionos-sim --config Release -- -quiet

@@ -462,6 +481,7 @@ cmake -B build-tvos-sim -G Xcode \
    -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=appletvsimulator \
    -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
+    -DLLAMA_CURL=OFF \
    -S .
 cmake --build build-tvos-sim --config Release -- -quiet

@@ -476,6 +496,7 @@ cmake -B build-tvos-device -G Xcode \
    -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=appletvos \
    -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
+    -DLLAMA_CURL=OFF \
    -S .
 cmake --build build-tvos-device --config Release -- -quiet

--- a/ci/README.md
+++ b/ci/README.md
@@ -26,4 +26,43 @@ GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 # with SYCL support
 source /opt/intel/oneapi/setvars.sh
 GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+
+# with MUSA support
+GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 ```
+
+## Running MUSA CI in a Docker Container
+
+Assuming `$PWD` is the root of the `llama.cpp` repository, follow these steps to set up and run MUSA CI in a Docker container:
+
+### 1. Create a local directory to store cached models, configuration files and venv:
+
+```bash
+mkdir -p $HOME/llama.cpp/ci-cache
+```
+
+### 2. Create a local directory to store CI run results:
+
+```bash
+mkdir -p $HOME/llama.cpp/ci-results
+```
+
+### 3. Start a Docker container and run the CI:
+
+```bash
+docker run --privileged -it \
+    -v $HOME/llama.cpp/ci-cache:/ci-cache \
+    -v $HOME/llama.cpp/ci-results:/ci-results \
+    -v $PWD:/ws -w /ws \
+    mthreads/musa:rc3.1.1-devel-ubuntu22.04
+```
+
+Inside the container, execute the following commands:
+
+```bash
+apt update -y && apt install -y bc cmake ccache git python3.10-venv time unzip wget
+git config --global --add safe.directory /ws
+GG_BUILD_MUSA=1 bash ./ci/run.sh /ci-results /ci-cache
+```
+
+This setup ensures that the CI runs within an isolated Docker environment while maintaining cached files and results across runs.
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -16,6 +16,9 @@
 # # with VULKAN support
 # GG_BUILD_VULKAN=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 #
+# # with MUSA support
+# GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+#

 if [ -z "$2" ]; then
    echo "usage: $0 <output-dir> <mnt-dir>"
@@ -36,7 +39,7 @@ sd=`dirname $0`
 cd $sd/../
 SRC=`pwd`

-CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON"
+CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=OFF"

 if [ ! -z ${GG_BUILD_METAL} ]; then
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON -DGGML_METAL_USE_BF16=ON"
@@ -52,13 +55,24 @@ if [ ! -z ${GG_BUILD_SYCL} ]; then
        echo "source /opt/intel/oneapi/setvars.sh"
        exit 1
    fi
-
+    # Use only main GPU
+    export ONEAPI_DEVICE_SELECTOR="level_zero:0"
+    # Enable sysman for correct memory reporting
+    export ZES_ENABLE_SYSMAN=1
+    # to circumvent precision issues on CPY operations
+    export SYCL_PROGRAM_COMPILE_OPTIONS="-cl-fp32-correctly-rounded-divide-sqrt"
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
 fi

 if [ ! -z ${GG_BUILD_VULKAN} ]; then
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
 fi
+
+if [ ! -z ${GG_BUILD_MUSA} ]; then
+    # Use qy1 by default (MTT S80)
+    MUSA_ARCH=${MUSA_ARCH:-21}
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_MUSA=ON -DMUSA_ARCHITECTURES=${MUSA_ARCH}"
+fi
 ## helpers

 # download a file if it does not exist or if it is outdated
@@ -173,8 +187,8 @@ function gg_run_test_scripts_debug {

    set -e

-    (cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
-    (cd ./examples/quantize   && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+    (cd ./tools/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+    (cd ./tools/quantize   && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log

    set +e
 }
@@ -197,8 +211,8 @@ function gg_run_test_scripts_release {

    set -e

-    (cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
-    (cd ./examples/quantize   && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+    (cd ./tools/gguf-split && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+    (cd ./tools/quantize   && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log

    set +e
 }
@@ -808,7 +822,7 @@ export LLAMA_LOG_PREFIX=1
 export LLAMA_LOG_TIMESTAMPS=1

 if [ -z ${GG_BUILD_LOW_PERF} ]; then
-    # Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt
+    # Create symlink: ./llama.cpp/models-mnt -> $MNT/models
    rm -rf ${SRC}/models-mnt
    mnt_models=${MNT}/models
    mkdir -p ${mnt_models}
@@ -826,8 +840,10 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
 fi

 ret=0
-
-test $ret -eq 0 && gg_run ctest_debug
+if [ -z ${GG_BUILD_SYCL} ]; then
+    # SYCL build breaks with debug build flags
+    test $ret -eq 0 && gg_run ctest_debug
+fi
 test $ret -eq 0 && gg_run ctest_release

 if [ -z ${GG_BUILD_LOW_PERF} ]; then
@@ -835,7 +851,9 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
    test $ret -eq 0 && gg_run rerank_tiny

    if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then
-        test $ret -eq 0 && gg_run test_scripts_debug
+        if [ -z ${GG_BUILD_SYCL} ]; then
+            test $ret -eq 0 && gg_run test_scripts_debug
+        fi
        test $ret -eq 0 && gg_run test_scripts_release
    fi

@@ -846,7 +864,9 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
            test $ret -eq 0 && gg_run pythia_2_8b
            #test $ret -eq 0 && gg_run open_llama_7b_v2
        fi
-        test $ret -eq 0 && gg_run ctest_with_model_debug
+        if [ -z ${GG_BUILD_SYCL} ]; then
+            test $ret -eq 0 && gg_run ctest_with_model_debug
+        fi
        test $ret -eq 0 && gg_run ctest_with_model_release
    fi
 fi
--- a/cmake/arm64-windows-msvc.cmake
+++ b/cmake/arm64-windows-msvc.cmake
@@ -1,6 +0,0 @@
-set( CMAKE_SYSTEM_NAME Windows )
-set( CMAKE_SYSTEM_PROCESSOR arm64 )
-
-set( target arm64-pc-windows-msvc )
-set( CMAKE_C_COMPILER_TARGET   ${target} )
-set( CMAKE_CXX_COMPILER_TARGET ${target} )
--- a/cmake/build-info.cmake
+++ b/cmake/build-info.cmake
@@ -41,14 +41,20 @@ endif()

 if(MSVC)
    set(BUILD_COMPILER "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
-    set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME})
+    if (CMAKE_VS_PLATFORM_NAME)
+        set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME})
+    else()
+        set(BUILD_TARGET "${CMAKE_SYSTEM_NAME} ${CMAKE_SYSTEM_PROCESSOR}")
+    endif()
 else()
    execute_process(
-        COMMAND sh -c "\"$@\" --version | head -1" _ ${CMAKE_C_COMPILER}
+        COMMAND ${CMAKE_C_COMPILER} --version
        OUTPUT_VARIABLE OUT
        OUTPUT_STRIP_TRAILING_WHITESPACE
    )
+    string(REGEX REPLACE " *\n.*" "" OUT "${OUT}")
    set(BUILD_COMPILER ${OUT})
+
    execute_process(
        COMMAND ${CMAKE_C_COMPILER} -dumpmachine
        OUTPUT_VARIABLE OUT
--- a/cmake/x64-windows-llvm.cmake
+++ b/cmake/x64-windows-llvm.cmake
@@ -3,9 +3,3 @@ set( CMAKE_SYSTEM_PROCESSOR x86_64 )

 set( CMAKE_C_COMPILER    clang )
 set( CMAKE_CXX_COMPILER  clang++ )
-
-set( arch_c_flags "-march=native" )
-
-set( CMAKE_C_FLAGS_INIT   "${arch_c_flags}" )
-set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags}" )
-
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -39,7 +39,9 @@ add_custom_command(
    COMMENT "Generating build details from Git"
    COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
            -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
-            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info-gen-cpp.cmake"
+            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+            -DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME} -DCMAKE_SYSTEM_PROCESSOR=${CMAKE_SYSTEM_PROCESSOR}
+            -P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info-gen-cpp.cmake"
    WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
    DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
    VERBATIM
@@ -85,7 +87,10 @@ set(LLAMA_COMMON_EXTRA_LIBS build_info)

 # Use curl to download model url
 if (LLAMA_CURL)
-    find_package(CURL REQUIRED)
+    find_package(CURL)
+    if (NOT CURL_FOUND)
+        message(FATAL_ERROR "Could NOT find CURL. Hint: to disable this feature, set -DLLAMA_CURL=OFF")
+    endif()
    target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
    include_directories(${CURL_INCLUDE_DIRS})
    find_library(CURL_LIBRARY curl REQUIRED)
@@ -114,8 +119,8 @@ if (LLAMA_LLGUIDANCE)

    ExternalProject_Add(llguidance_ext
        GIT_REPOSITORY https://github.com/guidance-ai/llguidance
-        # v0.6.12:
-        GIT_TAG ced1c9023d47ec194fa977932d35ce65c2ebfc09
+        # v0.7.10:
+        GIT_TAG 0309d2a6bf40abda35344a362edc71e06d5009f8
        PREFIX ${CMAKE_BINARY_DIR}/llguidance
        SOURCE_DIR ${LLGUIDANCE_SRC}
        BUILD_IN_SOURCE TRUE
@@ -139,3 +144,27 @@ endif ()
 target_include_directories(${TARGET} PUBLIC .)
 target_compile_features   (${TARGET} PUBLIC cxx_std_17)
 target_link_libraries     (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
+
+
+#
+# copy the license files
+#
+
+# Only when the environment has GITHUB_ACTIONS set to "true": copy the license files next to the built binaries
+if (DEFINED ENV{GITHUB_ACTIONS} AND "$ENV{GITHUB_ACTIONS}" STREQUAL "true")
+    message(STATUS "Running inside GitHub Actions - copying license files")
+
+    # After ${TARGET} is built, copy every entry of licenses/ into the directory of the llama target ($<TARGET_FILE_DIR:llama>)
+    file(GLOB LICENSE_FILES "${CMAKE_SOURCE_DIR}/licenses/*")
+    foreach(LICENSE_FILE ${LICENSE_FILES})
+        get_filename_component(FILENAME ${LICENSE_FILE} NAME)
+        add_custom_command(
+            POST_BUILD
+            TARGET ${TARGET}
+            COMMAND ${CMAKE_COMMAND} -E copy_if_different
+                "${LICENSE_FILE}"
+                "$<TARGET_FILE_DIR:llama>/${FILENAME}"
+            COMMENT "Copying ${FILENAME} to ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}")
+        message(STATUS "Copying ${LICENSE_FILE} to ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${FILENAME}") # configure-time log only; NOTE(review): prints CMAKE_RUNTIME_OUTPUT_DIRECTORY, while the copy goes to $<TARGET_FILE_DIR:llama> - confirm they match
+    endforeach()
+endif()
--- a/common/arg.cpp
+++ b/common/arg.cpp
--- a/common/arg.h
+++ b/common/arg.h
@@ -78,3 +78,12 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e

 // function to be used by test-arg-parser
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
+bool common_has_curl(); // NOTE(review): presumably reports whether this build includes libcurl support - confirm in arg.cpp
+
+struct common_remote_params { // request options accepted by common_remote_get_content()
+    std::vector<std::string> headers; // NOTE(review): presumably extra HTTP request headers, one string per header - confirm
+    long timeout = 0; // CURLOPT_TIMEOUT, in seconds ; 0 means no timeout
+    long max_size = 0; // max size of the response ; unlimited if 0 ; max is 2GB
+};
+// get remote file content, returns <http_code, raw_response_body>
+std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params);
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -125,7 +125,9 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messa
            msgs.push_back(msg);
        }
    } catch (const std::exception & e) {
-        throw std::runtime_error("Failed to parse messages: " + std::string(e.what()) + "; messages = " + messages.dump(2));
+        // @ngxson : disable otherwise it's bloating the API response
+        // printf("%s\n", (std::string("; messages = ") + messages.dump(2)).c_str());
+        throw std::runtime_error("Failed to parse messages: " + std::string(e.what()));
    }

    return msgs;
@@ -1622,7 +1624,7 @@ static common_chat_params common_chat_templates_apply_jinja(
    }

    // Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools)
-    if (src.find("<tool_call>") != std::string::npos && params.json_schema.is_null()) {
+    if (src.find("<tool_call>") != std::string::npos && params.json_schema.is_null() && params.tools.is_array()) {
        return common_chat_params_init_hermes_2_pro(tmpl, params);
    }

--- a/common/common.cpp
+++ b/common/common.cpp
@@ -7,9 +7,6 @@

 #include "common.h"
 #include "log.h"
-// Change JSON_ASSERT from assert() to GGML_ASSERT:
-#define JSON_ASSERT GGML_ASSERT
-#include "json.hpp"
 #include "llama.h"

 #include <algorithm>
@@ -51,47 +48,11 @@
 #include <sys/stat.h>
 #include <unistd.h>
 #endif
-#if defined(LLAMA_USE_CURL)
-#include <curl/curl.h>
-#include <curl/easy.h>
-#include <future>
-#endif

 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

-#if defined(LLAMA_USE_CURL)
-#ifdef __linux__
-#include <linux/limits.h>
-#elif defined(_WIN32)
-#   if !defined(PATH_MAX)
-#   define PATH_MAX MAX_PATH
-#   endif
-#else
-#include <sys/syslimits.h>
-#endif
-#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
-
-//
-// CURL utils
-//
-
-using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
-
-// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one
-struct curl_slist_ptr {
-    struct curl_slist * ptr = nullptr;
-    ~curl_slist_ptr() {
-        if (ptr) {
-            curl_slist_free_all(ptr);
-        }
-    }
-};
-#endif // LLAMA_USE_CURL
-
-using json = nlohmann::ordered_json;
-
 //
 // CPU utils
 //
@@ -869,7 +830,7 @@ std::string fs_get_cache_directory() {
    if (getenv("LLAMA_CACHE")) {
        cache_directory = std::getenv("LLAMA_CACHE");
    } else {
-#ifdef __linux__
+#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)
        if (std::getenv("XDG_CACHE_HOME")) {
            cache_directory = std::getenv("XDG_CACHE_HOME");
        } else {
@@ -879,7 +840,9 @@ std::string fs_get_cache_directory() {
        cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
 #elif defined(_WIN32)
        cache_directory = std::getenv("LOCALAPPDATA");
-#endif // __linux__
+#else
+#  error Unknown architecture
+#endif
        cache_directory = ensure_trailing_slash(cache_directory);
        cache_directory += "llama.cpp";
    }
@@ -900,22 +863,14 @@ std::string fs_get_cache_file(const std::string & filename) {
 //
 // Model utils
 //
+
 struct common_init_result common_init_from_params(common_params & params) {
    common_init_result iparams;
    auto mparams = common_model_params_to_llama(params);

-    llama_model * model = nullptr;
-
-    if (!params.hf_repo.empty() && !params.hf_file.empty()) {
-        model = common_load_model_from_hf(params.hf_repo, params.hf_file, params.model, params.hf_token, mparams);
-    } else if (!params.model_url.empty()) {
-        model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
-    } else {
-        model = llama_model_load_from_file(params.model.c_str(), mparams);
-    }
-
+    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
    if (model == NULL) {
-        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str());
+        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
        return iparams;
    }

@@ -950,7 +905,7 @@ struct common_init_result common_init_from_params(common_params & params) {

    llama_context * lctx = llama_init_from_model(model, cparams);
    if (lctx == NULL) {
-        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
+        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
        llama_model_free(model);
        return iparams;
    }
@@ -1074,6 +1029,19 @@ struct common_init_result common_init_from_params(common_params & params) {
    return iparams;
 }

+std::string get_model_endpoint() {
+    // MODEL_ENDPOINT wins; HF_ENDPOINT is still honored for backward-compatibility. An empty value counts as unset.
+    const char * model_endpoint_env = getenv("MODEL_ENDPOINT");
+    const char * hf_endpoint_env = getenv("HF_ENDPOINT");
+    const char * endpoint_env = (model_endpoint_env && *model_endpoint_env) ? model_endpoint_env : hf_endpoint_env;
+    std::string model_endpoint = "https://huggingface.co/";
+    if (endpoint_env && *endpoint_env) { // must be non-empty: std::string::back() on an empty string is undefined behavior
+        model_endpoint = endpoint_env;
+        if (model_endpoint.back() != '/') model_endpoint += '/';
+    }
+    return model_endpoint;
+}
+
 void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
    llama_clear_adapter_lora(ctx);
    for (auto & la : lora) {
@@ -1089,15 +1057,18 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
    if (!params.devices.empty()) {
        mparams.devices = params.devices.data();
    }
+
    if (params.n_gpu_layers != -1) {
        mparams.n_gpu_layers = params.n_gpu_layers;
    }
+
    mparams.main_gpu        = params.main_gpu;
    mparams.split_mode      = params.split_mode;
    mparams.tensor_split    = params.tensor_split;
    mparams.use_mmap        = params.use_mmap;
    mparams.use_mlock       = params.use_mlock;
    mparams.check_tensors   = params.check_tensors;
+
    if (params.kv_overrides.empty()) {
        mparams.kv_overrides = NULL;
    } else {
@@ -1105,6 +1076,13 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
        mparams.kv_overrides = params.kv_overrides.data();
    }

+    if (params.tensor_buft_overrides.empty()) {
+        mparams.tensor_buft_overrides = NULL;
+    } else {
+        GGML_ASSERT(params.tensor_buft_overrides.back().pattern == nullptr && "Tensor buffer overrides not terminated with empty pattern");
+        mparams.tensor_buft_overrides = params.tensor_buft_overrides.data();
+    }
+
    return mparams;
 }

@@ -1118,7 +1096,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
    cparams.n_threads         = params.cpuparams.n_threads;
    cparams.n_threads_batch   = params.cpuparams_batch.n_threads == -1 ?
                                params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
-    cparams.logits_all        = params.logits_all;
    cparams.embeddings        = params.embedding;
    cparams.rope_scaling_type = params.rope_scaling_type;
    cparams.rope_freq_base    = params.rope_freq_base;
@@ -1164,451 +1141,6 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
    return tpp;
 }

-#ifdef LLAMA_USE_CURL
-
-#define CURL_MAX_RETRY 3
-#define CURL_RETRY_DELAY_SECONDS 2
-
-static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
-    int remaining_attempts = max_attempts;
-
-    while (remaining_attempts > 0) {
-        LOG_INF("%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
-
-        CURLcode res = curl_easy_perform(curl);
-        if (res == CURLE_OK) {
-            return true;
-        }
-
-        int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000;
-        LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
-
-        remaining_attempts--;
-        std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
-    }
-
-    LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
-
-    return false;
-}
-
-static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
-    // Initialize libcurl
-    curl_ptr       curl(curl_easy_init(), &curl_easy_cleanup);
-    curl_slist_ptr http_headers;
-    if (!curl) {
-        LOG_ERR("%s: error initializing libcurl\n", __func__);
-        return false;
-    }
-
-    bool force_download = false;
-
-    // Set the URL, allow to follow http redirection
-    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
-    curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
-
-    // Check if hf-token or bearer-token was specified
-    if (!hf_token.empty()) {
-        std::string auth_header = "Authorization: Bearer " + hf_token;
-        http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
-        curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
-    }
-
-#if defined(_WIN32)
-    // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
-    //   operating system. Currently implemented under MS-Windows.
-    curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
-#endif
-
-    // Check if the file already exists locally
-    auto file_exists = std::filesystem::exists(path);
-
-    // If the file exists, check its JSON metadata companion file.
-    std::string metadata_path = path + ".json";
-    nlohmann::json metadata;
-    std::string etag;
-    std::string last_modified;
-
-    if (file_exists) {
-        // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
-        std::ifstream metadata_in(metadata_path);
-        if (metadata_in.good()) {
-            try {
-                metadata_in >> metadata;
-                LOG_INF("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
-                if (metadata.contains("url") && metadata.at("url").is_string()) {
-                    auto previous_url = metadata.at("url").get<std::string>();
-                    if (previous_url != url) {
-                        LOG_ERR("%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
-                        return false;
-                    }
-                }
-                if (metadata.contains("etag") && metadata.at("etag").is_string()) {
-                    etag = metadata.at("etag");
-                }
-                if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
-                    last_modified = metadata.at("lastModified");
-                }
-            } catch (const nlohmann::json::exception & e) {
-            LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
-                return false;
-            }
-        }
-    } else {
-        LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
-    }
-
-    // Send a HEAD request to retrieve the etag and last-modified headers
-    struct common_load_model_from_url_headers {
-        std::string etag;
-        std::string last_modified;
-    };
-
-    common_load_model_from_url_headers headers;
-
-    {
-        typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
-        auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
-            common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
-
-            static std::regex header_regex("([^:]+): (.*)\r\n");
-            static std::regex etag_regex("ETag", std::regex_constants::icase);
-            static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
-
-            std::string header(buffer, n_items);
-            std::smatch match;
-            if (std::regex_match(header, match, header_regex)) {
-                const std::string & key = match[1];
-                const std::string & value = match[2];
-                if (std::regex_match(key, match, etag_regex)) {
-                    headers->etag = value;
-                } else if (std::regex_match(key, match, last_modified_regex)) {
-                    headers->last_modified = value;
-                }
-            }
-            return n_items;
-        };
-
-        curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
-        curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
-        curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
-        curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
-
-        bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
-        if (!was_perform_successful) {
-            return false;
-        }
-
-        long http_code = 0;
-        curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
-        if (http_code != 200) {
-            // HEAD not supported, we don't know if the file has changed
-            // force trigger downloading
-            force_download = true;
-            LOG_ERR("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
-        }
-    }
-
-    bool should_download = !file_exists || force_download;
-    if (!should_download) {
-        if (!etag.empty() && etag != headers.etag) {
-            LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
-            should_download = true;
-        } else if (!last_modified.empty() && last_modified != headers.last_modified) {
-            LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
-            should_download = true;
-        }
-    }
-    if (should_download) {
-        std::string path_temporary = path + ".downloadInProgress";
-        if (file_exists) {
-            LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
-            if (remove(path.c_str()) != 0) {
-                LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
-                return false;
-            }
-        }
-
-        // Set the output file
-
-        struct FILE_deleter {
-            void operator()(FILE * f) const {
-                fclose(f);
-            }
-        };
-
-        std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "wb"));
-        if (!outfile) {
-            LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path.c_str());
-            return false;
-        }
-
-        typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd);
-        auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t {
-            return fwrite(data, size, nmemb, (FILE *)fd);
-        };
-        curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 0L);
-        curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
-        curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, outfile.get());
-
-        //  display download progress
-        curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L);
-
-        // helper function to hide password in URL
-        auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string {
-            std::size_t protocol_pos = url.find("://");
-            if (protocol_pos == std::string::npos) {
-                return url;  // Malformed URL
-            }
-
-            std::size_t at_pos = url.find('@', protocol_pos + 3);
-            if (at_pos == std::string::npos) {
-                return url;  // No password in URL
-            }
-
-            return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos);
-        };
-
-        // start the download
-        LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
-            llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
-        bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
-        if (!was_perform_successful) {
-            return false;
-        }
-
-        long http_code = 0;
-        curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
-        if (http_code < 200 || http_code >= 400) {
-            LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
-            return false;
-        }
-
-        // Causes file to be closed explicitly here before we rename it.
-        outfile.reset();
-
-        // Write the updated JSON metadata file.
-        metadata.update({
-            {"url", url},
-            {"etag", headers.etag},
-            {"lastModified", headers.last_modified}
-        });
-        std::ofstream(metadata_path) << metadata.dump(4);
-        LOG_INF("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
-
-        if (rename(path_temporary.c_str(), path.c_str()) != 0) {
-            LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
-            return false;
-        }
-    }
-
-    return true;
-}
-
-struct llama_model * common_load_model_from_url(
-        const std::string & model_url,
-        const std::string & local_path,
-        const std::string & hf_token,
-        const struct llama_model_params & params) {
-    // Basic validation of the model_url
-    if (model_url.empty()) {
-        LOG_ERR("%s: invalid model_url\n", __func__);
-        return NULL;
-    }
-
-    if (!common_download_file(model_url, local_path, hf_token)) {
-        return NULL;
-    }
-
-    // check for additional GGUFs split to download
-    int n_split = 0;
-    {
-        struct gguf_init_params gguf_params = {
-            /*.no_alloc = */ true,
-            /*.ctx      = */ NULL,
-        };
-        auto * ctx_gguf = gguf_init_from_file(local_path.c_str(), gguf_params);
-        if (!ctx_gguf) {
-            LOG_ERR("\n%s:  failed to load input GGUF from %s\n", __func__, local_path.c_str());
-            return NULL;
-        }
-
-        auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
-        if (key_n_split >= 0) {
-            n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
-        }
-
-        gguf_free(ctx_gguf);
-    }
-
-    if (n_split > 1) {
-        char split_prefix[PATH_MAX] = {0};
-        char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0};
-
-        // Verify the first split file format
-        // and extract split URL and PATH prefixes
-        {
-            if (!llama_split_prefix(split_prefix, sizeof(split_prefix), local_path.c_str(), 0, n_split)) {
-                LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, local_path.c_str(), n_split);
-                return NULL;
-            }
-
-            if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url.c_str(), 0, n_split)) {
-                LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url.c_str(), n_split);
-                return NULL;
-            }
-        }
-
-        // Prepare download in parallel
-        std::vector<std::future<bool>> futures_download;
-        for (int idx = 1; idx < n_split; idx++) {
-            futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split, hf_token](int download_idx) -> bool {
-                char split_path[PATH_MAX] = {0};
-                llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split);
-
-                char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
-                llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
-
-                return common_download_file(split_url, split_path, hf_token);
-            }, idx));
-        }
-
-        // Wait for all downloads to complete
-        for (auto & f : futures_download) {
-            if (!f.get()) {
-                return NULL;
-            }
-        }
-    }
-
-    return llama_model_load_from_file(local_path.c_str(), params);
-}
-
-struct llama_model * common_load_model_from_hf(
-        const std::string & repo,
-        const std::string & remote_path,
-        const std::string & local_path,
-        const std::string & hf_token,
-        const struct llama_model_params & params) {
-    // construct hugging face model url:
-    //
-    //  --repo ggml-org/models --file tinyllama-1.1b/ggml-model-f16.gguf
-    //    https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf
-    //
-    //  --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf
-    //    https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf
-    //
-
-    std::string model_url = "https://huggingface.co/";
-    model_url += repo;
-    model_url += "/resolve/main/";
-    model_url += remote_path;
-
-    return common_load_model_from_url(model_url, local_path, hf_token, params);
-}
-
-/**
- * Allow getting the HF file from the HF repo with tag (like ollama), for example:
- * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
- * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
- * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
- * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
- *
- * Return pair of <repo, file> (with "repo" already having tag removed)
- *
- * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
- */
-std::pair<std::string, std::string> common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & hf_token) {
-    auto parts = string_split<std::string>(hf_repo_with_tag, ':');
-    std::string tag = parts.size() > 1 ? parts.back() : "latest";
-    std::string hf_repo = parts[0];
-    if (string_split<std::string>(hf_repo, '/').size() != 2) {
-        throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
-    }
-
-    // fetch model info from Hugging Face Hub API
-    json model_info;
-    curl_ptr       curl(curl_easy_init(), &curl_easy_cleanup);
-    curl_slist_ptr http_headers;
-    std::string res_str;
-    std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag;
-    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
-    curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
-    typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
-    auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
-        static_cast<std::string *>(data)->append((char * ) ptr, size * nmemb);
-        return size * nmemb;
-    };
-    curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
-    curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str);
-#if defined(_WIN32)
-    curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
-#endif
-    if (!hf_token.empty()) {
-        std::string auth_header = "Authorization: Bearer " + hf_token;
-        http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
-    }
-    // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
-    http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
-    http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json");
-    curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
-
-    CURLcode res = curl_easy_perform(curl.get());
-
-    if (res != CURLE_OK) {
-        throw std::runtime_error("error: cannot make GET request to HF API");
-    }
-
-    long res_code;
-    curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
-    if (res_code == 200) {
-        model_info = json::parse(res_str);
-    } else if (res_code == 401) {
-        throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
-    } else {
-        throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
-    }
-
-    // check response
-    if (!model_info.contains("ggufFile")) {
-        throw std::runtime_error("error: model does not have ggufFile");
-    }
-    json & gguf_file = model_info.at("ggufFile");
-    if (!gguf_file.contains("rfilename")) {
-        throw std::runtime_error("error: ggufFile does not have rfilename");
-    }
-
-    return std::make_pair(hf_repo, gguf_file.at("rfilename"));
-}
-
-#else
-
-struct llama_model * common_load_model_from_url(
-        const std::string & /*model_url*/,
-        const std::string & /*local_path*/,
-        const std::string & /*hf_token*/,
-        const struct llama_model_params & /*params*/) {
-    LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
-    return nullptr;
-}
-
-struct llama_model * common_load_model_from_hf(
-        const std::string & /*repo*/,
-        const std::string & /*remote_path*/,
-        const std::string & /*local_path*/,
-        const std::string & /*hf_token*/,
-        const struct llama_model_params & /*params*/) {
-    LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
-    return nullptr;
-}
-
-std::pair<std::string, std::string> common_get_hf_file(const std::string &, const std::string &) {
-    LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
-    return std::make_pair("", "");
-}
-
-#endif // LLAMA_USE_CURL
-
 //
 // Batch utils
 //
@@ -2032,26 +1564,3 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c

    return result;
 }
-
-template <>
-json common_grammar_trigger::to_json() const {
-    json out {
-        {"type", (int) type},
-        {"value", value},
-    };
-    if (type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
-        out["token"] = (int) token;
-    }
-    return out;
-}
-
-template <>
-common_grammar_trigger common_grammar_trigger::from_json(const json & in) {
-    common_grammar_trigger out;
-    out.type = (common_grammar_trigger_type) in.at("type").get<int>();
-    out.value = in.at("value").get<std::string>();
-    if (out.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
-        out.token = (llama_token) in.at("token").get<int>();
-    }
-    return out;
-}
--- a/common/common.h
+++ b/common/common.h
@@ -66,7 +66,6 @@ enum llama_example {
    LLAMA_EXAMPLE_COMMON,
    LLAMA_EXAMPLE_SPECULATIVE,
    LLAMA_EXAMPLE_MAIN,
-    LLAMA_EXAMPLE_INFILL,
    LLAMA_EXAMPLE_EMBEDDING,
    LLAMA_EXAMPLE_PERPLEXITY,
    LLAMA_EXAMPLE_RETRIEVAL,
@@ -96,6 +95,7 @@ enum common_sampler_type {
    COMMON_SAMPLER_TYPE_XTC         = 8,
    COMMON_SAMPLER_TYPE_INFILL      = 9,
    COMMON_SAMPLER_TYPE_PENALTIES   = 10,
+    COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
 };

 // dimensionality reduction methods, used by cvector-generator
@@ -121,10 +121,6 @@ struct common_grammar_trigger {
    common_grammar_trigger_type type;
    std::string value;
    llama_token token = LLAMA_TOKEN_NULL;
-
-    // T can only be nlohmann::ordered_json
-    template <class T> T to_json() const;
-    template <class T> static common_grammar_trigger from_json(const T & in);
 };

 // sampling parameters
@@ -165,6 +161,7 @@ struct common_params_sampling {
    std::vector<enum common_sampler_type> samplers = {
        COMMON_SAMPLER_TYPE_PENALTIES,
        COMMON_SAMPLER_TYPE_DRY,
+        COMMON_SAMPLER_TYPE_TOP_N_SIGMA,
        COMMON_SAMPLER_TYPE_TOP_K,
        COMMON_SAMPLER_TYPE_TYPICAL_P,
        COMMON_SAMPLER_TYPE_TOP_P,
@@ -184,6 +181,13 @@ struct common_params_sampling {
    std::string print() const;
 };

+struct common_params_model {
+    std::string path    = ""; // model local path                                           // NOLINT
+    std::string url     = ""; // model url to download                                      // NOLINT
+    std::string hf_repo = ""; // HF repo                                                    // NOLINT
+    std::string hf_file = ""; // HF file                                                    // NOLINT
+};
+
 struct common_params_speculative {
    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading

@@ -197,19 +201,11 @@ struct common_params_speculative {
    struct cpu_params cpuparams;
    struct cpu_params cpuparams_batch;

-    std::string hf_repo = ""; // HF repo                                                     // NOLINT
-    std::string hf_file = ""; // HF file                                                     // NOLINT
-
-    std::string model = "";     // draft model for speculative decoding                      // NOLINT
-    std::string model_url = ""; // model url to download                                     // NOLINT
+    struct common_params_model model;
 };

 struct common_params_vocoder {
-    std::string hf_repo = ""; // HF repo                                                     // NOLINT
-    std::string hf_file = ""; // HF file                                                     // NOLINT
-
-    std::string model     = ""; // model path                                                // NOLINT
-    std::string model_url = ""; // model url to download                                     // NOLINT
+    struct common_params_model model;

    std::string speaker_file = ""; // speaker file path                                      // NOLINT

@@ -267,12 +263,10 @@ struct common_params {
    struct common_params_speculative speculative;
    struct common_params_vocoder     vocoder;

-    std::string model                = ""; // model path                                                    // NOLINT
+    struct common_params_model model;
+
    std::string model_alias          = ""; // model alias                                                   // NOLINT
-    std::string model_url            = ""; // model url to download                                         // NOLINT
    std::string hf_token             = ""; // HF token                                                      // NOLINT
-    std::string hf_repo              = ""; // HF repo                                                       // NOLINT
-    std::string hf_file              = ""; // HF file                                                       // NOLINT
    std::string prompt               = "";                                                                  // NOLINT
    std::string system_prompt        = "";                                                                  // NOLINT
    std::string prompt_file          = ""; // store the external prompt file name                           // NOLINT
@@ -286,6 +280,7 @@ struct common_params {
    std::vector<std::string> in_files;   // all input files
    std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
    std::vector<llama_model_kv_override> kv_overrides;
+    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;

    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
    std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
@@ -329,7 +324,6 @@ struct common_params {
    bool ctx_shift         = true;  // context shift on inifinite text generation

    bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
-    bool logits_all        = false; // return logits for all tokens in the batch
    bool use_mmap          = true;  // use mmap for faster loads
    bool use_mlock         = false; // use mlock to keep model in memory
    bool verbose_prompt    = false; // print prompt tokens before generation
@@ -346,8 +340,10 @@ struct common_params {

    common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;

-    // multimodal models (see examples/llava)
-    std::string mmproj = "";        // path to multimodal projector                                         // NOLINT
+    // multimodal models (see tools/mtmd)
+    struct common_params_model mmproj;
+    bool mmproj_use_gpu = true;     // use GPU for multimodal model
+    bool no_mmproj = false;         // explicitly disable multimodal model
    std::vector<std::string> image; // path to image file(s)

    // embedding
@@ -418,8 +414,8 @@ struct common_params {
    int n_pca_batch = 100;
    int n_pca_iterations = 1000;
    dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
-    std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
-    std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
+    std::string cvector_positive_file = "tools/cvector-generator/positive.txt";
+    std::string cvector_negative_file = "tools/cvector-generator/negative.txt";

    bool spm_infill = false; // suffix/prefix/middle pattern for infill

@@ -546,26 +542,11 @@ struct llama_model_params     common_model_params_to_llama  (      common_params
 struct llama_context_params   common_context_params_to_llama(const common_params & params);
 struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);

-struct llama_model * common_load_model_from_url(
-    const std::string & model_url,
-    const std::string & local_path,
-    const std::string & hf_token,
-    const struct llama_model_params & params);
-
-struct llama_model * common_load_model_from_hf(
-    const std::string & repo,
-    const std::string & remote_path,
-    const std::string & local_path,
-    const std::string & hf_token,
-    const struct llama_model_params & params);
-
-std::pair<std::string, std::string> common_get_hf_file(
-    const std::string & hf_repo_with_tag,
-    const std::string & hf_token);
-
 // clear LoRA adapters from context, then apply new list of adapters
 void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);

+std::string                   get_model_endpoint();
+
 //
 // Batch utils
 //
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -16,6 +16,9 @@ using json = nlohmann::ordered_json;
 static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") {
    auto has_max = max_items != std::numeric_limits<int>::max();

+    if (max_items == 0) {
+        return "";
+    }
    if (min_items == 0 && max_items == 1) {
        return item_rule + "?";
    }
--- a/common/llguidance.cpp
+++ b/common/llguidance.cpp
@@ -11,25 +11,24 @@ struct llama_sampler_llg {
    std::string         grammar_kind;
    std::string         grammar_data;
    LlgTokenizer *      tokenizer;
-    LlgConstraint *     grammar;
-    LlgMaskResult       llg_res;
-    bool                has_llg_res;
+    LlgMatcher *        grammar;
 };

-static LlgConstraint * llama_sampler_llg_new(LlgTokenizer * tokenizer, const char * grammar_kind,
-                                             const char * grammar_data) {
+static LlgMatcher * llama_sampler_llg_new(LlgTokenizer * tokenizer, const char * grammar_kind,
+                                          const char * grammar_data) {
    LlgConstraintInit cinit;
    llg_constraint_init_set_defaults(&cinit, tokenizer);
    const char * log_level = getenv("LLGUIDANCE_LOG_LEVEL");
    if (log_level && *log_level) {
        cinit.log_stderr_level = atoi(log_level);
    }
-    auto c = llg_new_constraint_any(&cinit, grammar_kind, grammar_data);
-    if (llg_get_error(c)) {
-        LOG_ERR("llg error: %s\n", llg_get_error(c));
-        llg_free_constraint(c);
+    auto c = llg_new_matcher(&cinit, grammar_kind, grammar_data);
+    if (llg_matcher_get_error(c)) {
+        LOG_ERR("llg error: %s\n", llg_matcher_get_error(c));
+        llg_free_matcher(c);
        return nullptr;
    }
+
    return c;
 }

@@ -40,39 +39,29 @@ static const char * llama_sampler_llg_name(const llama_sampler * /*smpl*/) {
 static void llama_sampler_llg_accept_impl(llama_sampler * smpl, llama_token token) {
    auto * ctx = (llama_sampler_llg *) smpl->ctx;
    if (ctx->grammar) {
-        LlgCommitResult res;
-        llg_commit_token(ctx->grammar, token, &res);
-        ctx->has_llg_res = false;
+        llg_matcher_consume_token(ctx->grammar, token);
    }
 }

 static void llama_sampler_llg_apply(llama_sampler * smpl, llama_token_data_array * cur_p) {
    auto * ctx = (llama_sampler_llg *) smpl->ctx;
    if (ctx->grammar) {
-        if (!ctx->has_llg_res) {
-            if (llg_compute_mask(ctx->grammar, &ctx->llg_res) == 0) {
-                ctx->has_llg_res = true;
+        const uint32_t * mask = llg_matcher_get_mask(ctx->grammar);
+        if (mask == nullptr) {
+            if (llg_matcher_compute_mask(ctx->grammar) == 0) {
+                mask = llg_matcher_get_mask(ctx->grammar);
            } else {
-                LOG_ERR("llg error: %s\n", llg_get_error(ctx->grammar));
-                llg_free_constraint(ctx->grammar);
+                LOG_ERR("llg error: %s\n", llg_matcher_get_error(ctx->grammar));
+                llg_free_matcher(ctx->grammar);
                ctx->grammar = nullptr;
+                return;
            }
        }
-        if (ctx->has_llg_res) {
-            if (ctx->llg_res.is_stop) {
-                for (size_t i = 0; i < cur_p->size; ++i) {
-                    if (!llama_vocab_is_eog(ctx->vocab, cur_p->data[i].id)) {
-                        cur_p->data[i].logit = -INFINITY;
-                    }
-                }
-            } else {
-                const uint32_t * mask = ctx->llg_res.sample_mask;
-                for (size_t i = 0; i < cur_p->size; ++i) {
-                    auto token = cur_p->data[i].id;
-                    if ((mask[token / 32] & (1 << (token % 32))) == 0) {
-                        cur_p->data[i].logit = -INFINITY;
-                    }
-                }
+
+        for (size_t i = 0; i < cur_p->size; ++i) {
+            auto token = cur_p->data[i].id;
+            if ((mask[token / 32] & (1 << (token % 32))) == 0) {
+                cur_p->data[i].logit = -INFINITY;
            }
        }
    }
@@ -80,14 +69,9 @@ static void llama_sampler_llg_apply(llama_sampler * smpl, llama_token_data_array

 static void llama_sampler_llg_reset(llama_sampler * smpl) {
    auto * ctx = (llama_sampler_llg *) smpl->ctx;
-    if (!ctx->grammar) {
-        return;
+    if (ctx->grammar) {
+        llg_matcher_reset(ctx->grammar);
    }
-
-    auto * grammar_new = llama_sampler_llg_new(ctx->tokenizer, ctx->grammar_kind.c_str(), ctx->grammar_data.c_str());
-    llg_free_constraint(ctx->grammar);
-    ctx->grammar     = grammar_new;
-    ctx->has_llg_res = false;
 }

 static llama_sampler * llama_sampler_llg_clone(const llama_sampler * smpl) {
@@ -102,7 +86,7 @@ static llama_sampler * llama_sampler_llg_clone(const llama_sampler * smpl) {
        if (ctx->grammar) {
            result_ctx->grammar_kind = ctx->grammar_kind;
            result_ctx->grammar_data = ctx->grammar_data;
-            result_ctx->grammar      = llg_clone_constraint(ctx->grammar);
+            result_ctx->grammar      = llg_clone_matcher(ctx->grammar);
            result_ctx->tokenizer    = llg_clone_tokenizer(ctx->tokenizer);
        }
    }
@@ -114,7 +98,7 @@ static void llama_sampler_llg_free(llama_sampler * smpl) {
    const auto * ctx = (llama_sampler_llg *) smpl->ctx;

    if (ctx->grammar) {
-        llg_free_constraint(ctx->grammar);
+        llg_free_matcher(ctx->grammar);
        llg_free_tokenizer(ctx->tokenizer);
    }

@@ -239,9 +223,11 @@ llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab, const char * g
            /* .grammar_data = */ grammar_data,
            /* .tokenizer    = */ tokenizer,
            /* .grammar      = */ llama_sampler_llg_new(tokenizer, grammar_kind, grammar_data),
-            /* .llg_res      = */ {},
-            /* .has_llg_res  = */ false,
        };
+        if (ctx->grammar) {
+            GGML_ASSERT(((size_t) llama_vocab_n_tokens(vocab) + 31) / 32 * 4 ==
+                        llg_matcher_get_mask_byte_size(ctx->grammar));
+        }
    } else {
        *ctx = {
            /* .vocab        = */ vocab,
@@ -249,15 +235,12 @@ llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab, const char * g
            /* .grammar_data = */ {},
            /* .tokenizer    = */ nullptr,
            /* .grammar      = */ nullptr,
-            /* .llg_res      = */ {},
-            /* .has_llg_res  = */ false,
        };
    }

    return llama_sampler_init(
        /* .iface = */ &llama_sampler_llg_i,
-        /* .ctx   = */ ctx
-    );
+        /* .ctx   = */ ctx);
 }

 #else
--- a/common/minja/chat-template.hpp
+++ b/common/minja/chat-template.hpp
@@ -9,10 +9,19 @@
 #pragma once

 #include "minja.hpp"
-#include <json.hpp>
+
+#include <chrono>
+#include <cstddef>
+#include <cstdio>
+#include <exception>
+#include <iomanip>
+#include <memory>
+#include <sstream>
 #include <string>
 #include <vector>

+#include <json.hpp>
+
 using json = nlohmann::ordered_json;

 namespace minja {
@@ -425,7 +434,7 @@ class chat_template {
                        auto obj = json {
                            {"tool_calls", tool_calls},
                        };
-                        if (!content.is_null() && content != "") {
+                        if (!content.is_null() && !content.empty()) {
                            obj["content"] = content;
                        }
                        message["content"] = obj.dump(2);
@@ -435,13 +444,12 @@ class chat_template {
                if (polyfill_tool_responses && role == "tool") {
                    message["role"] = "user";
                    auto obj = json {
-                        {"tool_response", {
-                            {"content", message.at("content")},
-                        }},
+                        {"tool_response", json::object()},
                    };
                    if (message.contains("name")) {
-                        obj["tool_response"]["name"] = message.at("name");
+                        obj["tool_response"]["tool"] = message.at("name");
                    }
+                    obj["tool_response"]["content"] = message.at("content");
                    if (message.contains("tool_call_id")) {
                        obj["tool_response"]["tool_call_id"] = message.at("tool_call_id");
                    }
@@ -510,7 +518,7 @@ class chat_template {
    static nlohmann::ordered_json add_system(const nlohmann::ordered_json & messages, const std::string & system_prompt) {
        json messages_with_system = messages;

-        if (messages_with_system.size() > 0 && messages_with_system[0].at("role") == "system") {
+        if (!messages_with_system.empty() && messages_with_system[0].at("role") == "system") {
            std::string existing_system = messages_with_system.at(0).at("content");
            messages_with_system[0] = json {
                {"role", "system"},
--- a/common/minja/minja.hpp
+++ b/common/minja/minja.hpp
@@ -8,14 +8,26 @@
 // SPDX-License-Identifier: MIT
 #pragma once

+#include <algorithm>
+#include <cctype>
+#include <cstddef>
+#include <cmath>
+#include <exception>
+#include <functional>
 #include <iostream>
-#include <string>
-#include <vector>
-#include <regex>
+#include <iterator>
+#include <limits>
+#include <map>
 #include <memory>
-#include <stdexcept>
+#include <regex>
 #include <sstream>
+#include <string>
+#include <stdexcept>
+#include <unordered_map>
 #include <unordered_set>
+#include <utility>
+#include <vector>
+
 #include <json.hpp>

 using json = nlohmann::ordered_json;
@@ -731,51 +743,51 @@ public:

 struct TextTemplateToken : public TemplateToken {
    std::string text;
-    TextTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, const std::string& t) : TemplateToken(Type::Text, location, pre, post), text(t) {}
+    TextTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post, const std::string& t) : TemplateToken(Type::Text, loc, pre, post), text(t) {}
 };

 struct ExpressionTemplateToken : public TemplateToken {
    std::shared_ptr<Expression> expr;
-    ExpressionTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, std::shared_ptr<Expression> && e) : TemplateToken(Type::Expression, location, pre, post), expr(std::move(e)) {}
+    ExpressionTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post, std::shared_ptr<Expression> && e) : TemplateToken(Type::Expression, loc, pre, post), expr(std::move(e)) {}
 };

 struct IfTemplateToken : public TemplateToken {
    std::shared_ptr<Expression> condition;
-    IfTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, std::shared_ptr<Expression> && c) : TemplateToken(Type::If, location, pre, post), condition(std::move(c)) {}
+    IfTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post, std::shared_ptr<Expression> && c) : TemplateToken(Type::If, loc, pre, post), condition(std::move(c)) {}
 };

 struct ElifTemplateToken : public TemplateToken {
    std::shared_ptr<Expression> condition;
-    ElifTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, std::shared_ptr<Expression> && c) : TemplateToken(Type::Elif, location, pre, post), condition(std::move(c)) {}
+    ElifTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post, std::shared_ptr<Expression> && c) : TemplateToken(Type::Elif, loc, pre, post), condition(std::move(c)) {}
 };

 struct ElseTemplateToken : public TemplateToken {
-    ElseTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::Else, location, pre, post) {}
+    ElseTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::Else, loc, pre, post) {}
 };

 struct EndIfTemplateToken : public TemplateToken {
-    EndIfTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndIf, location, pre, post) {}
+    EndIfTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndIf, loc, pre, post) {}
 };

 struct MacroTemplateToken : public TemplateToken {
    std::shared_ptr<VariableExpr> name;
    Expression::Parameters params;
-    MacroTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, std::shared_ptr<VariableExpr> && n, Expression::Parameters && p)
-      : TemplateToken(Type::Macro, location, pre, post), name(std::move(n)), params(std::move(p)) {}
+    MacroTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post, std::shared_ptr<VariableExpr> && n, Expression::Parameters && p)
+      : TemplateToken(Type::Macro, loc, pre, post), name(std::move(n)), params(std::move(p)) {}
 };

 struct EndMacroTemplateToken : public TemplateToken {
-    EndMacroTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndMacro, location, pre, post) {}
+    EndMacroTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndMacro, loc, pre, post) {}
 };

 struct FilterTemplateToken : public TemplateToken {
    std::shared_ptr<Expression> filter;
-    FilterTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, std::shared_ptr<Expression> && filter)
-      : TemplateToken(Type::Filter, location, pre, post), filter(std::move(filter)) {}
+    FilterTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post, std::shared_ptr<Expression> && filter)
+      : TemplateToken(Type::Filter, loc, pre, post), filter(std::move(filter)) {}
 };

 struct EndFilterTemplateToken : public TemplateToken {
-    EndFilterTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndFilter, location, pre, post) {}
+    EndFilterTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndFilter, loc, pre, post) {}
 };

 struct ForTemplateToken : public TemplateToken {
@@ -783,38 +795,38 @@ struct ForTemplateToken : public TemplateToken {
    std::shared_ptr<Expression> iterable;
    std::shared_ptr<Expression> condition;
    bool recursive;
-    ForTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, const std::vector<std::string> & vns, std::shared_ptr<Expression> && iter,
+    ForTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post, const std::vector<std::string> & vns, std::shared_ptr<Expression> && iter,
      std::shared_ptr<Expression> && c, bool r)
-      : TemplateToken(Type::For, location, pre, post), var_names(vns), iterable(std::move(iter)), condition(std::move(c)), recursive(r) {}
+      : TemplateToken(Type::For, loc, pre, post), var_names(vns), iterable(std::move(iter)), condition(std::move(c)), recursive(r) {}
 };

 struct EndForTemplateToken : public TemplateToken {
-    EndForTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndFor, location, pre, post) {}
+    EndForTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndFor, loc, pre, post) {}
 };

 struct GenerationTemplateToken : public TemplateToken {
-    GenerationTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::Generation, location, pre, post) {}
+    GenerationTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::Generation, loc, pre, post) {}
 };

 struct EndGenerationTemplateToken : public TemplateToken {
-    EndGenerationTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndGeneration, location, pre, post) {}
+    EndGenerationTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndGeneration, loc, pre, post) {}
 };

 struct SetTemplateToken : public TemplateToken {
    std::string ns;
    std::vector<std::string> var_names;
    std::shared_ptr<Expression> value;
-    SetTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, const std::string & ns, const std::vector<std::string> & vns, std::shared_ptr<Expression> && v)
-      : TemplateToken(Type::Set, location, pre, post), ns(ns), var_names(vns), value(std::move(v)) {}
+    SetTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post, const std::string & ns, const std::vector<std::string> & vns, std::shared_ptr<Expression> && v)
+      : TemplateToken(Type::Set, loc, pre, post), ns(ns), var_names(vns), value(std::move(v)) {}
 };

 struct EndSetTemplateToken : public TemplateToken {
-    EndSetTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndSet, location, pre, post) {}
+    EndSetTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndSet, loc, pre, post) {}
 };

 struct CommentTemplateToken : public TemplateToken {
    std::string text;
-    CommentTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, const std::string& t) : TemplateToken(Type::Comment, location, pre, post), text(t) {}
+    CommentTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post, const std::string& t) : TemplateToken(Type::Comment, loc, pre, post), text(t) {}
 };

 enum class LoopControlType { Break, Continue };
@@ -830,7 +842,7 @@ public:

 struct LoopControlTemplateToken : public TemplateToken {
    LoopControlType control_type;
-    LoopControlTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, LoopControlType control_type) : TemplateToken(Type::Break, location, pre, post), control_type(control_type) {}
+    LoopControlTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post, LoopControlType control_type) : TemplateToken(Type::Break, loc, pre, post), control_type(control_type) {}
 };

 class TemplateNode {
@@ -868,8 +880,8 @@ public:
 class SequenceNode : public TemplateNode {
    std::vector<std::shared_ptr<TemplateNode>> children;
 public:
-    SequenceNode(const Location & location, std::vector<std::shared_ptr<TemplateNode>> && c)
-      : TemplateNode(location), children(std::move(c)) {}
+    SequenceNode(const Location & loc, std::vector<std::shared_ptr<TemplateNode>> && c)
+      : TemplateNode(loc), children(std::move(c)) {}
    void do_render(std::ostringstream & out, const std::shared_ptr<Context> & context) const override {
        for (const auto& child : children) child->render(out, context);
    }
@@ -878,7 +890,7 @@ public:
 class TextNode : public TemplateNode {
    std::string text;
 public:
-    TextNode(const Location & location, const std::string& t) : TemplateNode(location), text(t) {}
+    TextNode(const Location & loc, const std::string& t) : TemplateNode(loc), text(t) {}
    void do_render(std::ostringstream & out, const std::shared_ptr<Context> &) const override {
      out << text;
    }
@@ -887,7 +899,7 @@ public:
 class ExpressionNode : public TemplateNode {
    std::shared_ptr<Expression> expr;
 public:
-    ExpressionNode(const Location & location, std::shared_ptr<Expression> && e) : TemplateNode(location), expr(std::move(e)) {}
+    ExpressionNode(const Location & loc, std::shared_ptr<Expression> && e) : TemplateNode(loc), expr(std::move(e)) {}
    void do_render(std::ostringstream & out, const std::shared_ptr<Context> & context) const override {
      if (!expr) throw std::runtime_error("ExpressionNode.expr is null");
      auto result = expr->evaluate(context);
@@ -904,8 +916,8 @@ public:
 class IfNode : public TemplateNode {
    std::vector<std::pair<std::shared_ptr<Expression>, std::shared_ptr<TemplateNode>>> cascade;
 public:
-    IfNode(const Location & location, std::vector<std::pair<std::shared_ptr<Expression>, std::shared_ptr<TemplateNode>>> && c)
-        : TemplateNode(location), cascade(std::move(c)) {}
+    IfNode(const Location & loc, std::vector<std::pair<std::shared_ptr<Expression>, std::shared_ptr<TemplateNode>>> && c)
+        : TemplateNode(loc), cascade(std::move(c)) {}
    void do_render(std::ostringstream & out, const std::shared_ptr<Context> & context) const override {
      for (const auto& branch : cascade) {
          auto enter_branch = true;
@@ -924,7 +936,7 @@ public:
 class LoopControlNode : public TemplateNode {
    LoopControlType control_type_;
  public:
-    LoopControlNode(const Location & location, LoopControlType control_type) : TemplateNode(location), control_type_(control_type) {}
+    LoopControlNode(const Location & loc, LoopControlType control_type) : TemplateNode(loc), control_type_(control_type) {}
    void do_render(std::ostringstream &, const std::shared_ptr<Context> &) const override {
      throw LoopControlException(control_type_);
    }
@@ -938,9 +950,9 @@ class ForNode : public TemplateNode {
    bool recursive;
    std::shared_ptr<TemplateNode> else_body;
 public:
-    ForNode(const Location & location, std::vector<std::string> && var_names, std::shared_ptr<Expression> && iterable,
+    ForNode(const Location & loc, std::vector<std::string> && var_names, std::shared_ptr<Expression> && iterable,
      std::shared_ptr<Expression> && condition, std::shared_ptr<TemplateNode> && body, bool recursive, std::shared_ptr<TemplateNode> && else_body)
-            : TemplateNode(location), var_names(var_names), iterable(std::move(iterable)), condition(std::move(condition)), body(std::move(body)), recursive(recursive), else_body(std::move(else_body)) {}
+            : TemplateNode(loc), var_names(var_names), iterable(std::move(iterable)), condition(std::move(condition)), body(std::move(body)), recursive(recursive), else_body(std::move(else_body)) {}

    void do_render(std::ostringstream & out, const std::shared_ptr<Context> & context) const override {
      // https://jinja.palletsprojects.com/en/3.0.x/templates/#for
@@ -1025,8 +1037,8 @@ class MacroNode : public TemplateNode {
    std::shared_ptr<TemplateNode> body;
    std::unordered_map<std::string, size_t> named_param_positions;
 public:
-    MacroNode(const Location & location, std::shared_ptr<VariableExpr> && n, Expression::Parameters && p, std::shared_ptr<TemplateNode> && b)
-        : TemplateNode(location), name(std::move(n)), params(std::move(p)), body(std::move(b)) {
+    MacroNode(const Location & loc, std::shared_ptr<VariableExpr> && n, Expression::Parameters && p, std::shared_ptr<TemplateNode> && b)
+        : TemplateNode(loc), name(std::move(n)), params(std::move(p)), body(std::move(b)) {
        for (size_t i = 0; i < params.size(); ++i) {
          const auto & name = params[i].first;
          if (!name.empty()) {
@@ -1072,8 +1084,8 @@ class FilterNode : public TemplateNode {
    std::shared_ptr<TemplateNode> body;

 public:
-    FilterNode(const Location & location, std::shared_ptr<Expression> && f, std::shared_ptr<TemplateNode> && b)
-        : TemplateNode(location), filter(std::move(f)), body(std::move(b)) {}
+    FilterNode(const Location & loc, std::shared_ptr<Expression> && f, std::shared_ptr<TemplateNode> && b)
+        : TemplateNode(loc), filter(std::move(f)), body(std::move(b)) {}

    void do_render(std::ostringstream & out, const std::shared_ptr<Context> & context) const override {
        if (!filter) throw std::runtime_error("FilterNode.filter is null");
@@ -1095,8 +1107,8 @@ class SetNode : public TemplateNode {
    std::vector<std::string> var_names;
    std::shared_ptr<Expression> value;
 public:
-    SetNode(const Location & location, const std::string & ns, const std::vector<std::string> & vns, std::shared_ptr<Expression> && v)
-        : TemplateNode(location), ns(ns), var_names(vns), value(std::move(v)) {}
+    SetNode(const Location & loc, const std::string & ns, const std::vector<std::string> & vns, std::shared_ptr<Expression> && v)
+        : TemplateNode(loc), ns(ns), var_names(vns), value(std::move(v)) {}
    void do_render(std::ostringstream &, const std::shared_ptr<Context> & context) const override {
      if (!value) throw std::runtime_error("SetNode.value is null");
      if (!ns.empty()) {
@@ -1118,8 +1130,8 @@ class SetTemplateNode : public TemplateNode {
    std::string name;
    std::shared_ptr<TemplateNode> template_value;
 public:
-    SetTemplateNode(const Location & location, const std::string & name, std::shared_ptr<TemplateNode> && tv)
-        : TemplateNode(location), name(name), template_value(std::move(tv)) {}
+    SetTemplateNode(const Location & loc, const std::string & name, std::shared_ptr<TemplateNode> && tv)
+        : TemplateNode(loc), name(name), template_value(std::move(tv)) {}
    void do_render(std::ostringstream &, const std::shared_ptr<Context> & context) const override {
      if (!template_value) throw std::runtime_error("SetTemplateNode.template_value is null");
      Value value { template_value->render(context) };
@@ -1132,8 +1144,8 @@ class IfExpr : public Expression {
    std::shared_ptr<Expression> then_expr;
    std::shared_ptr<Expression> else_expr;
 public:
-    IfExpr(const Location & location, std::shared_ptr<Expression> && c, std::shared_ptr<Expression> && t, std::shared_ptr<Expression> && e)
-        : Expression(location), condition(std::move(c)), then_expr(std::move(t)), else_expr(std::move(e)) {}
+    IfExpr(const Location & loc, std::shared_ptr<Expression> && c, std::shared_ptr<Expression> && t, std::shared_ptr<Expression> && e)
+        : Expression(loc), condition(std::move(c)), then_expr(std::move(t)), else_expr(std::move(e)) {}
    Value do_evaluate(const std::shared_ptr<Context> & context) const override {
      if (!condition) throw std::runtime_error("IfExpr.condition is null");
      if (!then_expr) throw std::runtime_error("IfExpr.then_expr is null");
@@ -1150,16 +1162,16 @@ public:
 class LiteralExpr : public Expression {
    Value value;
 public:
-    LiteralExpr(const Location & location, const Value& v)
-      : Expression(location), value(v) {}
+    LiteralExpr(const Location & loc, const Value& v)
+      : Expression(loc), value(v) {}
    Value do_evaluate(const std::shared_ptr<Context> &) const override { return value; }
 };

 class ArrayExpr : public Expression {
    std::vector<std::shared_ptr<Expression>> elements;
 public:
-    ArrayExpr(const Location & location, std::vector<std::shared_ptr<Expression>> && e)
-      : Expression(location), elements(std::move(e)) {}
+    ArrayExpr(const Location & loc, std::vector<std::shared_ptr<Expression>> && e)
+      : Expression(loc), elements(std::move(e)) {}
    Value do_evaluate(const std::shared_ptr<Context> & context) const override {
        auto result = Value::array();
        for (const auto& e : elements) {
@@ -1173,8 +1185,8 @@ public:
 class DictExpr : public Expression {
    std::vector<std::pair<std::shared_ptr<Expression>, std::shared_ptr<Expression>>> elements;
 public:
-    DictExpr(const Location & location, std::vector<std::pair<std::shared_ptr<Expression>, std::shared_ptr<Expression>>> && e)
-      : Expression(location), elements(std::move(e)) {}
+    DictExpr(const Location & loc, std::vector<std::pair<std::shared_ptr<Expression>, std::shared_ptr<Expression>>> && e)
+      : Expression(loc), elements(std::move(e)) {}
    Value do_evaluate(const std::shared_ptr<Context> & context) const override {
        auto result = Value::object();
        for (const auto& [key, value] : elements) {
@@ -1189,8 +1201,8 @@ public:
 class SliceExpr : public Expression {
 public:
    std::shared_ptr<Expression> start, end;
-    SliceExpr(const Location & location, std::shared_ptr<Expression> && s, std::shared_ptr<Expression> && e)
-      : Expression(location), start(std::move(s)), end(std::move(e)) {}
+    SliceExpr(const Location & loc, std::shared_ptr<Expression> && s, std::shared_ptr<Expression> && e)
+      : Expression(loc), start(std::move(s)), end(std::move(e)) {}
    Value do_evaluate(const std::shared_ptr<Context> &) const override {
        throw std::runtime_error("SliceExpr not implemented");
    }
@@ -1200,8 +1212,8 @@ class SubscriptExpr : public Expression {
    std::shared_ptr<Expression> base;
    std::shared_ptr<Expression> index;
 public:
-    SubscriptExpr(const Location & location, std::shared_ptr<Expression> && b, std::shared_ptr<Expression> && i)
-        : Expression(location), base(std::move(b)), index(std::move(i)) {}
+    SubscriptExpr(const Location & loc, std::shared_ptr<Expression> && b, std::shared_ptr<Expression> && i)
+        : Expression(loc), base(std::move(b)), index(std::move(i)) {}
    Value do_evaluate(const std::shared_ptr<Context> & context) const override {
        if (!base) throw std::runtime_error("SubscriptExpr.base is null");
        if (!index) throw std::runtime_error("SubscriptExpr.index is null");
@@ -1243,8 +1255,8 @@ public:
    enum class Op { Plus, Minus, LogicalNot, Expansion, ExpansionDict };
    std::shared_ptr<Expression> expr;
    Op op;
-    UnaryOpExpr(const Location & location, std::shared_ptr<Expression> && e, Op o)
-      : Expression(location), expr(std::move(e)), op(o) {}
+    UnaryOpExpr(const Location & loc, std::shared_ptr<Expression> && e, Op o)
+      : Expression(loc), expr(std::move(e)), op(o) {}
    Value do_evaluate(const std::shared_ptr<Context> & context) const override {
        if (!expr) throw std::runtime_error("UnaryOpExpr.expr is null");
        auto e = expr->evaluate(context);
@@ -1269,8 +1281,8 @@ private:
    std::shared_ptr<Expression> right;
    Op op;
 public:
-    BinaryOpExpr(const Location & location, std::shared_ptr<Expression> && l, std::shared_ptr<Expression> && r, Op o)
-        : Expression(location), left(std::move(l)), right(std::move(r)), op(o) {}
+    BinaryOpExpr(const Location & loc, std::shared_ptr<Expression> && l, std::shared_ptr<Expression> && r, Op o)
+        : Expression(loc), left(std::move(l)), right(std::move(r)), op(o) {}
    Value do_evaluate(const std::shared_ptr<Context> & context) const override {
        if (!left) throw std::runtime_error("BinaryOpExpr.left is null");
        if (!right) throw std::runtime_error("BinaryOpExpr.right is null");
@@ -1427,8 +1439,8 @@ class MethodCallExpr : public Expression {
    std::shared_ptr<VariableExpr> method;
    ArgumentsExpression args;
 public:
-    MethodCallExpr(const Location & location, std::shared_ptr<Expression> && obj, std::shared_ptr<VariableExpr> && m, ArgumentsExpression && a)
-        : Expression(location), object(std::move(obj)), method(std::move(m)), args(std::move(a)) {}
+    MethodCallExpr(const Location & loc, std::shared_ptr<Expression> && obj, std::shared_ptr<VariableExpr> && m, ArgumentsExpression && a)
+        : Expression(loc), object(std::move(obj)), method(std::move(m)), args(std::move(a)) {}
    Value do_evaluate(const std::shared_ptr<Context> & context) const override {
        if (!object) throw std::runtime_error("MethodCallExpr.object is null");
        if (!method) throw std::runtime_error("MethodCallExpr.method is null");
@@ -1526,8 +1538,8 @@ class CallExpr : public Expression {
 public:
    std::shared_ptr<Expression> object;
    ArgumentsExpression args;
-    CallExpr(const Location & location, std::shared_ptr<Expression> && obj, ArgumentsExpression && a)
-        : Expression(location), object(std::move(obj)), args(std::move(a)) {}
+    CallExpr(const Location & loc, std::shared_ptr<Expression> && obj, ArgumentsExpression && a)
+        : Expression(loc), object(std::move(obj)), args(std::move(a)) {}
    Value do_evaluate(const std::shared_ptr<Context> & context) const override {
        if (!object) throw std::runtime_error("CallExpr.object is null");
        auto obj = object->evaluate(context);
@@ -1542,8 +1554,8 @@ public:
 class FilterExpr : public Expression {
    std::vector<std::shared_ptr<Expression>> parts;
 public:
-    FilterExpr(const Location & location, std::vector<std::shared_ptr<Expression>> && p)
-      : Expression(location), parts(std::move(p)) {}
+    FilterExpr(const Location & loc, std::vector<std::shared_ptr<Expression>> && p)
+      : Expression(loc), parts(std::move(p)) {}
    Value do_evaluate(const std::shared_ptr<Context> & context) const override {
        Value result;
        bool first = true;
@@ -2460,7 +2472,7 @@ private:
                static std::regex leading_space_regex(R"(^\s+)");
                text = std::regex_replace(text, leading_space_regex, "");
              } else if (options.trim_blocks && (it - 1) != begin && !dynamic_cast<ExpressionTemplateToken*>((*(it - 2)).get())) {
-                if (text.length() > 0 && text[0] == '\n') {
+                if (!text.empty() && text[0] == '\n') {
                  text.erase(0, 1);
                }
              }
@@ -2538,7 +2550,7 @@ public:
        TemplateTokenIterator begin = tokens.begin();
        auto it = begin;
        TemplateTokenIterator end = tokens.end();
-        return parser.parseTemplate(begin, it, end, /* full= */ true);
+        return parser.parseTemplate(begin, it, end, /* fully= */ true);
    }
 };

@@ -2577,7 +2589,7 @@ inline std::shared_ptr<Context> Context::builtins() {
    throw std::runtime_error(args.at("message").get<std::string>());
  }));
  globals.set("tojson", simple_function("tojson", { "value", "indent" }, [](const std::shared_ptr<Context> &, Value & args) {
-    return Value(args.at("value").dump(args.get<int64_t>("indent", -1), /* tojson= */ true));
+    return Value(args.at("value").dump(args.get<int64_t>("indent", -1), /* to_json= */ true));
  }));
  globals.set("items", simple_function("items", { "object" }, [](const std::shared_ptr<Context> &, Value & args) {
    auto items = Value::array();
@@ -2599,21 +2611,25 @@ inline std::shared_ptr<Context> Context::builtins() {
  globals.set("last", simple_function("last", { "items" }, [](const std::shared_ptr<Context> &, Value & args) {
    auto items = args.at("items");
    if (!items.is_array()) throw std::runtime_error("object is not a list");
-    if (items.size() == 0) return Value();
+    if (items.empty()) return Value();
    return items.at(items.size() - 1);
  }));
  globals.set("trim", simple_function("trim", { "text" }, [](const std::shared_ptr<Context> &, Value & args) {
    auto & text = args.at("text");
    return text.is_null() ? text : Value(strip(text.get<std::string>()));
  }));
-  globals.set("lower", simple_function("lower", { "text" }, [](const std::shared_ptr<Context> &, Value & args) {
-    auto text = args.at("text");
-    if (text.is_null()) return text;
-    std::string res;
-    auto str = text.get<std::string>();
-    std::transform(str.begin(), str.end(), std::back_inserter(res), ::tolower);
-    return Value(res);
-  }));
+  auto char_transform_function = [](const std::string & name, const std::function<char(char)> & fn) {
+    return simple_function(name, { "text" }, [=](const std::shared_ptr<Context> &, Value & args) {
+      auto text = args.at("text");
+      if (text.is_null()) return text;
+      std::string res;
+      auto str = text.get<std::string>();
+      std::transform(str.begin(), str.end(), std::back_inserter(res), fn);
+      return Value(res);
+    });
+  };
+  globals.set("lower", char_transform_function("lower", ::tolower));
+  globals.set("upper", char_transform_function("upper", ::toupper));
  globals.set("default", Value::callable([=](const std::shared_ptr<Context> &, ArgumentsValue & args) {
    args.expectArgs("default", {2, 3}, {0, 1});
    auto & value = args.args[0];
@@ -2743,12 +2759,17 @@ inline std::shared_ptr<Context> Context::builtins() {
    return Value::callable([=](const std::shared_ptr<Context> & context, ArgumentsValue & args) {
      args.expectArgs(is_select ? "select" : "reject", {2, (std::numeric_limits<size_t>::max)()}, {0, 0});
      auto & items = args.args[0];
-      if (items.is_null())
+      if (items.is_null()) {
        return Value::array();
-      if (!items.is_array()) throw std::runtime_error("object is not iterable: " + items.dump());
+      }
+      if (!items.is_array()) {
+        throw std::runtime_error("object is not iterable: " + items.dump());
+      }

      auto filter_fn = context->get(args.args[1]);
-      if (filter_fn.is_null()) throw std::runtime_error("Undefined filter: " + args.args[1].dump());
+      if (filter_fn.is_null()) {
+        throw std::runtime_error("Undefined filter: " + args.args[1].dump());
+      }

      auto filter_args = Value::array();
      for (size_t i = 2, n = args.args.size(); i < n; i++) {
@@ -2870,20 +2891,25 @@ inline std::shared_ptr<Context> Context::builtins() {
        auto v = arg.get<int64_t>();
        startEndStep[i] = v;
        param_set[i] = true;
-        }
      }
-      for (auto & [name, value] : args.kwargs) {
-        size_t i;
-        if (name == "start") i = 0;
-        else if (name == "end") i = 1;
-        else if (name == "step") i = 2;
-        else throw std::runtime_error("Unknown argument " + name + " for function range");
+    }
+    for (auto & [name, value] : args.kwargs) {
+      size_t i;
+      if (name == "start") {
+        i = 0;
+      } else if (name == "end") {
+        i = 1;
+      } else if (name == "step") {
+        i = 2;
+      } else {
+        throw std::runtime_error("Unknown argument " + name + " for function range");
+      }

-        if (param_set[i]) {
-          throw std::runtime_error("Duplicate argument " + name + " for function range");
-        }
-        startEndStep[i] = value.get<int64_t>();
-        param_set[i] = true;
+      if (param_set[i]) {
+        throw std::runtime_error("Duplicate argument " + name + " for function range");
+      }
+      startEndStep[i] = value.get<int64_t>();
+      param_set[i] = true;
    }
    if (!param_set[1]) {
      throw std::runtime_error("Missing required argument 'end' for function range");
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -1,6 +1,7 @@
 #include "sampling.h"

 #include "common.h"
+#include "log.h"

 #include <cmath>
 #include <unordered_map>
@@ -208,6 +209,9 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                                                        trigger_patterns_c.data(), trigger_patterns_c.size(),
                                                        trigger_tokens.data(), trigger_tokens.size())
             :      llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
+        if (!grmr) {
+            return nullptr;
+        }
    }

    auto * result = new common_sampler {
@@ -226,51 +230,48 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                params.logit_bias.data()));

    if (params.mirostat == 0) {
-        if (params.top_n_sigma >= 0) {
-            llama_sampler_chain_add(result->chain, llama_sampler_init_top_k        (params.top_k));
-            llama_sampler_chain_add(result->chain, llama_sampler_init_temp         (params.temp));
-            llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma  (params.top_n_sigma));
-        } else {
-            for (const auto & cnstr : params.samplers) {
-                switch (cnstr) {
-                    case COMMON_SAMPLER_TYPE_DRY:
-                        {
-                            std::vector<const char *> c_breakers;
-                            c_breakers.reserve(params.dry_sequence_breakers.size());
-                            for (const auto & str : params.dry_sequence_breakers) {
-                                c_breakers.push_back(str.c_str());
-                            }
-
-                            llama_sampler_chain_add(result->chain, llama_sampler_init_dry      (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
+        for (const auto & cnstr : params.samplers) {
+            switch (cnstr) {
+                case COMMON_SAMPLER_TYPE_DRY:
+                    {
+                        std::vector<const char *> c_breakers;
+                        c_breakers.reserve(params.dry_sequence_breakers.size());
+                        for (const auto & str : params.dry_sequence_breakers) {
+                            c_breakers.push_back(str.c_str());
                        }
-                        break;
-                    case COMMON_SAMPLER_TYPE_TOP_K:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_k    (params.top_k));
-                        break;
-                    case COMMON_SAMPLER_TYPE_TOP_P:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_p    (params.top_p, params.min_keep));
-                        break;
-                    case COMMON_SAMPLER_TYPE_MIN_P:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_min_p    (params.min_p, params.min_keep));
-                        break;
-                    case COMMON_SAMPLER_TYPE_XTC:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_xtc      (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
-                        break;
-                    case COMMON_SAMPLER_TYPE_TYPICAL_P:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_typical  (params.typ_p, params.min_keep));
-                        break;
-                    case COMMON_SAMPLER_TYPE_TEMPERATURE:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
-                        break;
-                    case COMMON_SAMPLER_TYPE_INFILL:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_infill   (vocab));
-                        break;
-                    case COMMON_SAMPLER_TYPE_PENALTIES:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
-                        break;
-                    default:
-                        GGML_ASSERT(false && "unknown sampler type");
-                }
+
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_dry      (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
+                    }
+                    break;
+                case COMMON_SAMPLER_TYPE_TOP_K:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_k       (params.top_k));
+                    break;
+                case COMMON_SAMPLER_TYPE_TOP_P:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_p       (params.top_p, params.min_keep));
+                    break;
+                case COMMON_SAMPLER_TYPE_TOP_N_SIGMA:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
+                    break;
+                case COMMON_SAMPLER_TYPE_MIN_P:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_min_p       (params.min_p, params.min_keep));
+                    break;
+                case COMMON_SAMPLER_TYPE_XTC:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_xtc         (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
+                    break;
+                case COMMON_SAMPLER_TYPE_TYPICAL_P:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_typical     (params.typ_p, params.min_keep));
+                    break;
+                case COMMON_SAMPLER_TYPE_TEMPERATURE:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext    (params.temp, params.dynatemp_range, params.dynatemp_exponent));
+                    break;
+                case COMMON_SAMPLER_TYPE_INFILL:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_infill      (vocab));
+                    break;
+                case COMMON_SAMPLER_TYPE_PENALTIES:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_penalties   (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
+                    break;
+                default:
+                    GGML_ASSERT(false && "unknown sampler type");
            }
        }
        llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
@@ -472,6 +473,7 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
        case COMMON_SAMPLER_TYPE_TOP_K:       return 'k';
        case COMMON_SAMPLER_TYPE_TYPICAL_P:   return 'y';
        case COMMON_SAMPLER_TYPE_TOP_P:       return 'p';
+        case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return 's';
        case COMMON_SAMPLER_TYPE_MIN_P:       return 'm';
        case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
        case COMMON_SAMPLER_TYPE_XTC:         return 'x';
@@ -487,6 +489,7 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
        case COMMON_SAMPLER_TYPE_TOP_K:       return "top_k";
        case COMMON_SAMPLER_TYPE_TYPICAL_P:   return "typ_p";
        case COMMON_SAMPLER_TYPE_TOP_P:       return "top_p";
+        case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return "top_n_sigma";
        case COMMON_SAMPLER_TYPE_MIN_P:       return "min_p";
        case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
        case COMMON_SAMPLER_TYPE_XTC:         return "xtc";
@@ -501,6 +504,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
        { "dry",         COMMON_SAMPLER_TYPE_DRY },
        { "top_k",       COMMON_SAMPLER_TYPE_TOP_K },
        { "top_p",       COMMON_SAMPLER_TYPE_TOP_P },
+        { "top_n_sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
        { "typ_p",       COMMON_SAMPLER_TYPE_TYPICAL_P },
        { "min_p",       COMMON_SAMPLER_TYPE_MIN_P },
        { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
@@ -514,6 +518,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
    std::unordered_map<std::string, common_sampler_type> sampler_alt_name_map {
        { "top-k",       COMMON_SAMPLER_TYPE_TOP_K },
        { "top-p",       COMMON_SAMPLER_TYPE_TOP_P },
+        { "top-n-sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
        { "nucleus",     COMMON_SAMPLER_TYPE_TOP_P },
        { "typical-p",   COMMON_SAMPLER_TYPE_TYPICAL_P },
        { "typical",     COMMON_SAMPLER_TYPE_TYPICAL_P },
@@ -530,14 +535,16 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
        auto sampler = sampler_canonical_name_map.find(name);
        if (sampler != sampler_canonical_name_map.end()) {
            samplers.push_back(sampler->second);
-        } else {
-            if (allow_alt_names) {
-                sampler = sampler_alt_name_map.find(name);
-                if (sampler != sampler_alt_name_map.end()) {
-                    samplers.push_back(sampler->second);
-                }
+            continue;
+        }
+        if (allow_alt_names) {
+            sampler = sampler_alt_name_map.find(name);
+            if (sampler != sampler_alt_name_map.end()) {
+                samplers.push_back(sampler->second);
+                continue;
            }
        }
+        LOG_WRN("%s: unable to match sampler by name '%s'\n", __func__, name.c_str());
    }

    return samplers;
@@ -549,6 +556,7 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_K),       COMMON_SAMPLER_TYPE_TOP_K },
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P),   COMMON_SAMPLER_TYPE_TYPICAL_P },
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P),       COMMON_SAMPLER_TYPE_TOP_P },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_N_SIGMA), COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P),       COMMON_SAMPLER_TYPE_MIN_P },
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC),         COMMON_SAMPLER_TYPE_XTC },
@@ -563,6 +571,8 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
        const auto sampler = sampler_name_map.find(c);
        if (sampler != sampler_name_map.end()) {
            samplers.push_back(sampler->second);
+        } else {
+            LOG_WRN("%s: unable to match sampler by char '%c'\n", __func__, c);
        }
    }

--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -110,6 +110,12 @@ models = [
    {"name": "deepseek-v3",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-V3"},
    {"name": "deepseek-r1-qwen", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"},
    {"name": "gpt-4o",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Xenova/gpt-4o", },
+    {"name": "superbpe",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/UW/OLMo2-8B-SuperBPE-t180k", },
+    {"name": "trillion",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/trillionlabs/Trillion-7B-preview", },
+    {"name": "bailingmoe",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-lite", },
+    {"name": "llama4",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", },
+    {"name": "glm4",             "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", },
+    {"name": "pixtral",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", },
 ]


--- a/convert_lora_to_gguf.py
+++ b/convert_lora_to_gguf.py
@@ -24,7 +24,7 @@ if 'NO_LOCAL_GGUF' not in os.environ:
 import gguf

 # reuse model definitions from convert_hf_to_gguf.py
-from convert_hf_to_gguf import LazyTorchTensor, Model
+from convert_hf_to_gguf import LazyTorchTensor, ModelBase

 logger = logging.getLogger("lora-to-gguf")

@@ -340,11 +340,11 @@ if __name__ == '__main__':
            sys.exit(1)
    else:
        logger.info(f"Loading base model: {dir_base_model.name}")
-        hparams = Model.load_hparams(dir_base_model)
+        hparams = ModelBase.load_hparams(dir_base_model)

    with torch.inference_mode():
        try:
-            model_class = Model.from_model_architecture(hparams["architectures"][0])
+            model_class = ModelBase.from_model_architecture(hparams["architectures"][0])
        except NotImplementedError:
            logger.error(f"Model {hparams['architectures'][0]} is not supported")
            sys.exit(1)
--- a/docs/backend/CUDA-FEDORA.md
+++ b/docs/backend/CUDA-FEDORA.md
@@ -14,9 +14,7 @@ In this guide we setup [Nvidia CUDA](https://docs.nvidia.com/cuda/) in a toolbox
 - [Creating a Fedora Toolbox Environment](#creating-a-fedora-toolbox-environment)
 - [Installing Essential Development Tools](#installing-essential-development-tools)
 - [Adding the CUDA Repository](#adding-the-cuda-repository)
- [Installing `nvidia-driver-libs`](#installing-nvidia-driver-libs)
- [Manually Resolving Package Conflicts](#manually-resolving-package-conflicts)
- [Finalizing the Installation of `nvidia-driver-libs`](#finalizing-the-installation-of-nvidia-driver-libs)
+- [Installing Nvidia Driver Libraries](#installing-nvidia-driver-libraries)
 - [Installing the CUDA Meta-Package](#installing-the-cuda-meta-package)
 - [Configuring the Environment](#configuring-the-environment)
 - [Verifying the Installation](#verifying-the-installation)
@@ -67,7 +65,7 @@ This guide focuses on Fedora hosts, but with small adjustments, it can work for
   sudo dnf distro-sync
   ```

-2. **Install the Default Text Editor (Optional):**
+2. **Install Vim as the default text editor (Optional):**

   ```bash
   sudo dnf install vim-default-editor --allowerasing
@@ -97,36 +95,48 @@ After adding the repository, synchronize the package manager again:
 sudo dnf distro-sync
 ```

-## Installing `nvidia-driver-libs` and `nvidia-driver-cuda-libs`
+## Installing Nvidia Driver Libraries

-We need to detect if the host is supplying the [NVIDIA driver libraries into the toolbox](https://github.com/containers/toolbox/blob/main/src/pkg/nvidia/nvidia.go).
+First, we need to detect if the host is supplying the [NVIDIA driver libraries into the toolbox](https://github.com/containers/toolbox/blob/main/src/pkg/nvidia/nvidia.go):

 ```bash
 ls -la /usr/lib64/libcuda.so.1
 ```

+### If *`libcuda.so.1`* is missing:
+
+```
+ls: cannot access '/usr/lib64/libcuda.so.1': No such file or directory
+```
+
 **Explanation:**
+The host does not supply the CUDA drivers, **install them now:**

- `nvidia-driver-libs` and `nvidia-driver-cuda-libs` contains necessary NVIDIA driver libraries required by CUDA,
-  on hosts with NVIDIA drivers installed the Fedora Container will supply the host libraries.
-
-### Install Nvidia Driver Libraries on Guest (if `libcuda.so.1` was NOT found).
+#### Install the Nvidia Driver Libraries on Guest:

 ```bash
-sudo dnf install nvidia-driver-libs nvidia-driver-cuda-libs
+sudo dnf install nvidia-driver-cuda nvidia-driver-libs nvidia-driver-cuda-libs nvidia-persistenced
 ```

-### Manually Updating the RPM database for host-supplied NVIDIA drivers (if `libcuda.so.1` was found).
+### If *`libcuda.so.1`* exists:
+```
+lrwxrwxrwx. 1 root root 21 Mar 24 11:26 /usr/lib64/libcuda.so.1 -> libcuda.so.570.133.07
+```

-If the installation fails due to conflicts, we'll manually download and install the required packages, excluding conflicting files.
+**Explanation:**
+The host is supplying the CUDA drivers, **we need to update the guest RPM Database accordingly:**

-#### 1. Download `nvidia-driver-libs` and `nvidia-driver-cuda-libs` RPM's (with dependencies)
+#### Update the Toolbox RPM Database to include the Host-Supplied Libraries:
+
+Note: we do not actually install the libraries, we just update the DB so that the guest system knows they are supplied by the host.
+
+##### 1. Download the RPMs (with dependencies) for the `nvidia-` packages that are supplied by the host

 ```bash
-sudo dnf download --destdir=/tmp/nvidia-driver-libs --resolve --arch x86_64 nvidia-driver-libs nvidia-driver-cuda-libs
+sudo dnf download --destdir=/tmp/nvidia-driver-libs --resolve --arch x86_64 nvidia-driver-cuda nvidia-driver-libs nvidia-driver-cuda-libs nvidia-persistenced
 ```

-#### 2. Update the RPM database to assume the installation of these packages.
+##### 2. Update the RPM database to assume the installation of these packages.

 ```bash
 sudo rpm --install --verbose --hash --justdb /tmp/nvidia-driver-libs/*
@@ -134,23 +144,26 @@ sudo rpm --install --verbose --hash --justdb /tmp/nvidia-driver-libs/*

 **Note:**

- The `--justdb` option only updates the RPM database, without touching the filesystem.
+- The `--justdb` option only updates the RPM database, without touching the filesystem elsewhere.

-#### Finalizing the Installation of `nvidia-driver-libs` and `nvidia-driver-cuda-libs`
+##### Check that the RPM Database has been correctly updated:
+
+**Note:** This is the same command as in the *"Install the Nvidia Driver Libraries on Guest"* section, used when *`libcuda.so.1`* was missing.

-After manually installing the dependencies, run:

 ```bash
-sudo dnf install nvidia-driver-libs nvidia-driver-cuda-libs
+sudo dnf install nvidia-driver-cuda nvidia-driver-libs nvidia-driver-cuda-libs nvidia-persistenced
 ```

-You should receive a message indicating the package is already installed:
+*(this time it will not install anything, as the database thinks that these packages are already installed)*

 ```
 Updating and loading repositories:
 Repositories loaded.
-Package "nvidia-driver-libs-3:570.86.10-1.fc41.x86_64" is already installed.
-Package "nvidia-driver-cuda-libs-3:570.86.10-1.fc41.x86_64" is already installed.
+Package "nvidia-driver-cuda-3:570.124.06-1.fc41.x86_64" is already installed.
+Package "nvidia-driver-libs-3:570.124.06-1.fc41.x86_64" is already installed.
+Package "nvidia-driver-cuda-libs-3:570.124.06-1.fc41.x86_64" is already installed.
+Package "nvidia-persistenced-3:570.124.06-1.fc41.x86_64" is already installed.

 Nothing to do.
 ```
@@ -207,9 +220,9 @@ You should see output similar to:
 ```
 nvcc: NVIDIA (R) Cuda compiler driver
 Copyright (c) 2005-2025 NVIDIA Corporation
-Built on Wed_Jan_15_19:20:09_PST_2025
-Cuda compilation tools, release 12.8, V12.8.61
-Build cuda_12.8.r12.8/compiler.35404655_0
+Built on Fri_Feb_21_20:23:50_PST_2025
+Cuda compilation tools, release 12.8, V12.8.93
+Build cuda_12.8.r12.8/compiler.35583870_0
 ```

 This output confirms that the CUDA compiler is accessible and indicates the installed version.
--- a/docs/backend/OPENCL.md
+++ b/docs/backend/OPENCL.md
@@ -145,8 +145,13 @@ A Snapdragon X Elite device with Windows 11 Arm64 is used. Make sure the followi
 * Clang 19
 * Ninja
 * Visual Studio 2022
+* Powershell 7

-Powershell is used for the following instructions.
+Visual Studio provides necessary headers and libraries although it is not directly used for building.
+Alternatively, Visual Studio Build Tools can be installed instead of the full Visual Studio.
+
+Powershell 7 is used for the following commands.
+If an older version of Powershell is used, these commands may not work as they are.

 ### I. Setup Environment

@@ -196,10 +201,9 @@ ninja

 ## Known Issues

- Qwen2.5 0.5B model produces gibberish output with Adreno kernels.
+- Currently OpenCL backend does not work on Adreno 6xx GPUs.

 ## TODO

- Fix Qwen2.5 0.5B
 - Optimization for Q6_K
 - Support and optimization for Q4_K
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -20,7 +20,7 @@
 **oneAPI** is an open ecosystem and a standard-based specification, supporting multiple architectures including but not limited to intel CPUs, GPUs and FPGAs. The key components of the oneAPI ecosystem include:

 - **DPCPP** *(Data Parallel C++)*: The primary oneAPI SYCL implementation, which includes the icpx/icx Compilers.
- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. oneMKL and oneDNN)*.
+- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. Intel oneMKL, oneMath and oneDNN)*.
 - **oneAPI LevelZero**: A high performance low level interface for fine-grained control over intel iGPUs and dGPUs.
 - **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.

@@ -227,16 +227,6 @@ Upon a successful installation, SYCL is enabled for the available intel devices,

 **oneAPI Plugin**: In order to enable SYCL support on Nvidia GPUs, please install the [Codeplay oneAPI Plugin for Nvidia GPUs](https://developer.codeplay.com/products/oneapi/nvidia/download). User should also make sure the plugin version matches the installed base toolkit one *(previous step)* for a seamless "oneAPI on Nvidia GPU" setup.

-
-**oneMKL for cuBlas**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* do not contain the cuBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *cuBLAS* backend enabled is thus required to run it on Nvidia GPUs.
-
-```sh
-git clone https://github.com/oneapi-src/oneMKL
-cd oneMKL
-cmake -B buildWithCublas -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_CUBLAS_BACKEND=ON -DTARGET_DOMAINS=blas
-cmake --build buildWithCublas --config Release
-```
-
 **oneDNN**: The current oneDNN releases *(shipped with the oneAPI base-toolkit)* do not include the NVIDIA backend. Therefore, oneDNN must be compiled from source to enable the NVIDIA target:

 ```sh
@@ -250,16 +240,6 @@ cmake --build build-nvidia --config Release

 **oneAPI Plugin**: In order to enable SYCL support on AMD GPUs, please install the [Codeplay oneAPI Plugin for AMD GPUs](https://developer.codeplay.com/products/oneapi/amd/download). As with Nvidia GPUs, the user should also make sure the plugin version matches the installed base toolkit.

-**oneMKL for rocBlas**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* doesn't contain the rocBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *rocBLAS* backend enabled is thus required to run it on AMD GPUs.
-
-```sh
-git clone https://github.com/oneapi-src/oneMKL
-cd oneMKL
-# Find your HIPTARGET with rocminfo, under the key 'Name:'
-cmake -B buildWithrocBLAS -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_ROCBLAS_BACKEND=ON -DHIPTARGETS=${HIPTARGET} -DTARGET_DOMAINS=blas
-cmake --build buildWithrocBLAS --config Release
-```
-
 3. **Verify installation and environment**

 In order to check the available SYCL devices on the machine, please use the `sycl-ls` command.
@@ -322,15 +302,16 @@ cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -
 cmake --build build --config Release -j -v
 ```

+It is possible to come across some precision issues when running tests that stem from using faster
+instructions, which can be circumvented by setting the environment variable `SYCL_PROGRAM_COMPILE_OPTIONS`
+as `-cl-fp32-correctly-rounded-divide-sqrt`
+
 #### Nvidia GPU

-```sh
-# Export relevant ENV variables
-export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LD_LIBRARY_PATH
-export LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LIBRARY_PATH
-export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithCublas/include:$CPLUS_INCLUDE_DIR
-export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
+The SYCL backend depends on [oneMath](https://github.com/uxlfoundation/oneMath) for Nvidia and AMD devices.
+By default it is automatically built along with the project. A specific build can be provided by setting the CMake flag `-DoneMath_DIR=/path/to/oneMath/install/lib/cmake/oneMath`.

+```sh
 # Build LLAMA with Nvidia BLAS acceleration through SYCL
 # Setting GGML_SYCL_DEVICE_ARCH is optional but can improve performance
 GGML_SYCL_DEVICE_ARCH=sm_80 # Example architecture
@@ -345,14 +326,15 @@ cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=
 cmake --build build --config Release -j -v
 ```

+It is possible to come across some precision issues when running tests that stem from using faster
+instructions, which can be circumvented by passing the `-fno-fast-math` flag to the compiler.
+
 #### AMD GPU

-```sh
-# Export relevant ENV variables
-export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LD_LIBRARY_PATH
-export LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LIBRARY_PATH
-export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithrocBLAS/include:$CPLUS_INCLUDE_DIR
+The SYCL backend depends on [oneMath](https://github.com/uxlfoundation/oneMath) for Nvidia and AMD devices.
+By default it is automatically built along with the project. A specific build can be provided by setting the CMake flag `-DoneMath_DIR=/path/to/oneMath/install/lib/cmake/oneMath`.

+```sh
 # Build LLAMA with rocBLAS acceleration through SYCL

 ## AMD
@@ -443,13 +425,13 @@ Examples:
 - Use device 0:

 ```sh
-ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
+ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
 ```

 - Use multiple devices:

 ```sh
-ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
+ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
 ```

 *Notes:*
@@ -493,6 +475,12 @@ b. Enable oneAPI running environment:
 "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
 ```

+- if you are using Powershell, enable the runtime environment with the following:
+
+```
+cmd.exe "/K" '"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" && powershell'
+```
+
 c. Verify installation

 In the oneAPI command line, run the following to print the available SYCL devices:
@@ -523,13 +511,13 @@ You could download the release package for Windows directly, which including bin

 Choose one of following methods to build from source code.

-1. Script
+#### 1. Script

 ```sh
 .\examples\sycl\win-build-sycl.bat
 ```

-2. CMake
+#### 2. CMake

 On the oneAPI command line window, step into the llama.cpp main directory and run the following:

@@ -558,13 +546,84 @@ cmake --preset x64-windows-sycl-debug
 cmake --build build-x64-windows-sycl-debug -j --target llama-cli
 ```

-3. Visual Studio
+#### 3. Visual Studio

-You can use Visual Studio to open llama.cpp folder as a CMake project. Choose the sycl CMake presets (`x64-windows-sycl-release` or `x64-windows-sycl-debug`) before you compile the project.
+You have two options to use Visual Studio to build llama.cpp:
+- As CMake Project using CMake presets.
+- Creating a Visual Studio solution to handle the project.
+
+**Note**:
+
+All following commands are executed in PowerShell.
+
+##### - Open as a CMake Project
+
+You can use Visual Studio to open the `llama.cpp` folder directly as a CMake project. Before compiling, select one of the SYCL CMake presets:
+
+- `x64-windows-sycl-release`
+
+- `x64-windows-sycl-debug`

 *Notes:*
+- For a minimal experimental setup, you can build only the inference executable using:

- In case of a minimal experimental setup, the user can build the inference executable only through `cmake --build build --config Release -j --target llama-cli`.
+    ```Powershell
+    cmake --build build --config Release -j --target llama-cli
+    ```
+
+##### - Generating a Visual Studio Solution
+
+You can use a Visual Studio solution to build and work on llama.cpp on Windows. You need to convert the CMake Project into a `.sln` file.
+
+If you want to use the Intel C++ Compiler for the entire `llama.cpp` project, run the following command:
+
+```Powershell
+cmake -B build -G "Visual Studio 17 2022" -T "Intel C++ Compiler 2025" -A x64 -DGGML_SYCL=ON -DCMAKE_BUILD_TYPE=Release
+```
+
+If you prefer to use the Intel C++ Compiler only for `ggml-sycl`, ensure that `ggml` and its backend libraries are built as shared libraries (i.e. `-DBUILD_SHARED_LIBS=ON`, which is the default behaviour):
+
+```Powershell
+cmake -B build -G "Visual Studio 17 2022" -A x64 -DGGML_SYCL=ON -DCMAKE_BUILD_TYPE=Release `
+      -DSYCL_INCLUDE_DIR="C:\Program Files (x86)\Intel\oneAPI\compiler\latest\include" `
+      -DSYCL_LIBRARY_DIR="C:\Program Files (x86)\Intel\oneAPI\compiler\latest\lib"
+```
+
+If successful, the build files will have been written to: *path/to/llama.cpp/build*
+Open the project file **build/llama.cpp.sln** with Visual Studio.
+
+Once the Visual Studio solution is created, follow these steps:
+
+1. Open the solution in Visual Studio.
+
+2. Right-click on `ggml-sycl` and select **Properties**.
+
+3. In the left column, expand **C/C++** and select **DPC++**.
+
+4. In the right panel, find **Enable SYCL Offload** and set it to `Yes`.
+
+5. Apply the changes and save.
+
+
+*Navigation Path:*
+
+```
+Properties -> C/C++ -> DPC++ -> Enable SYCL Offload (Yes)
+```
+
+Now, you can build `llama.cpp` with the SYCL backend as a Visual Studio project.
+To do it from menu: `Build -> Build Solution`.
+Once it is completed, final results will be in **build/Release/bin**
+
+*Additional Note*
+
+- You can avoid specifying `SYCL_INCLUDE_DIR` and `SYCL_LIBRARY_DIR` in the CMake command by setting the environment variables:
+
+    - `SYCL_INCLUDE_DIR_HINT`
+
+    - `SYCL_LIBRARY_DIR_HINT`
+
+- The above instructions have been tested with Visual Studio 17 Community edition and oneAPI 2025.0. We expect them to also work with future versions if the instructions are adapted accordingly.

 ### III. Run the inference

@@ -638,13 +697,13 @@ Examples:
 - Use device 0:

 ```
-build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
+build\bin\llama-cli.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
 ```

 - Use multiple devices:

 ```
-build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
+build\bin\llama-cli.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
 ```


--- a/docs/build.md
+++ b/docs/build.md
@@ -132,12 +132,14 @@ You may find the official downloads here: [NVIDIA developer site](https://develo


 #### Compile and run inside a Fedora Toolbox Container
-We also have a [guide](./cuda-fedora.md) for setting up CUDA toolkit in a Fedora [toolbox container](https://containertoolbx.org/).
+We also have a [guide](./backend/CUDA-FEDORA.md) for setting up CUDA toolkit in a Fedora [toolbox container](https://containertoolbx.org/).

 **Recommended for:**
-
- ***Particularly*** *convenient* for users of [Atomic Desktops for Fedora](https://fedoraproject.org/atomic-desktops/); such as: [Silverblue](https://fedoraproject.org/atomic-desktops/silverblue/) and [Kinoite](https://fedoraproject.org/atomic-desktops/kinoite/).
- Toolbox is installed by default: [Fedora Workstation](https://fedoraproject.org/workstation/) or [Fedora KDE Plasma Desktop](https://fedoraproject.org/spins/kde).
+- ***Necessary*** for users of [Atomic Desktops for Fedora](https://fedoraproject.org/atomic-desktops/); such as: [Silverblue](https://fedoraproject.org/atomic-desktops/silverblue/) and [Kinoite](https://fedoraproject.org/atomic-desktops/kinoite/).
+  - (there are no supported CUDA packages for these systems)
+- ***Necessary*** for users that have a host that is not a: [Supported Nvidia CUDA Release Platform](https://developer.nvidia.com/cuda-downloads).
+  - (for example, you may have [Fedora 42 Beta](https://fedoramagazine.org/announcing-fedora-linux-42-beta/) as your host operating system)
+- ***Convenient*** for those running [Fedora Workstation](https://fedoraproject.org/workstation/) or [Fedora KDE Plasma Desktop](https://fedoraproject.org/spins/kde) who want to keep their host system clean.
 - *Optionally* toolbox packages are available: [Arch Linux](https://archlinux.org/), [Red Hat Enterprise Linux >= 8.5](https://www.redhat.com/en/technologies/linux-platforms/enterprise-linux), or [Ubuntu](https://ubuntu.com/download)


@@ -189,7 +191,7 @@ The following compilation options are also available to tweak performance:

 | Option                        | Legal values           | Default | Description                                                                                                                                                                                                                                                                             |
 |-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| GGML_CUDA_FORCE_MMQ           | Boolean                | false   | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower.                       |
+| GGML_CUDA_FORCE_MMQ           | Boolean                | false   | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, CDNA and RDNA3+). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower.                       |
 | GGML_CUDA_FORCE_CUBLAS        | Boolean                | false   | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models                                                                                                                                                                                       |
 | GGML_CUDA_F16                 | Boolean                | false   | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs.                                                           |
 | GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer       | 128     | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial.                                                                         |
@@ -216,6 +218,7 @@ By default, all supported compute capabilities are enabled. To customize this be

 ```bash
 cmake -B build -DGGML_MUSA=ON -DMUSA_ARCHITECTURES="21"
+cmake --build build --config Release
 ```

 This configuration enables only compute capability `2.1` (MTT S80) during compilation, which can help reduce compilation time.
@@ -256,8 +259,6 @@ You can download it from your Linux distro's package manager or from here: [ROCm
      cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
      && cmake --build build --config Release -- -j 16
  ```
-  On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`.
-  However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).

  To enhance flash attention performance on RDNA3+ or CDNA architectures, you can utilize the rocWMMA library by enabling the `-DGGML_HIP_ROCWMMA_FATTN=ON` option. This requires rocWMMA headers to be installed on the build system.

@@ -293,6 +294,10 @@ You can download it from your Linux distro's package manager or from here: [ROCm
 The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
 If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.

+### Unified Memory
+
+On Linux it is possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting the environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1`. However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
+
 ## Vulkan

 **Windows**
@@ -433,6 +438,116 @@ llama_new_context_with_model:       CANN compute buffer size =  1260.81 MiB

 For detailed info, such as model/device supports, CANN install, please refer to [llama.cpp for CANN](./backend/CANN.md).

+## Arm® KleidiAI™
+KleidiAI is a library of optimized microkernels for AI workloads, specifically designed for Arm CPUs. These microkernels enhance performance and can be enabled for use by the CPU backend.
+
+To enable KleidiAI, go to the llama.cpp directory and build using CMake
+```bash
+cmake -B build -DGGML_CPU_KLEIDIAI=ON
+cmake --build build --config Release
+```
+You can verify that KleidiAI is being used by running
+```bash
+./build/bin/llama-cli -m PATH_TO_MODEL -p "What is a car?"
+```
+If KleidiAI is enabled, the output will contain a line similar to:
+```
+load_tensors: CPU_KLEIDIAI model buffer size =  3474.00 MiB
+```
+KleidiAI's microkernels implement optimized tensor operations using Arm CPU features such as dotprod, int8mm and SME. llama.cpp selects the most efficient kernel based on runtime CPU feature detection. However, on platforms that support SME, you must manually enable SME microkernels by setting the environment variable `GGML_KLEIDIAI_SME=1`.
+
+Depending on your build target, other higher priority backends may be enabled by default. To ensure the CPU backend is used, you must disable the higher priority backends either at compile time, e.g. -DGGML_METAL=OFF, or during run-time using the command line option `--device none`.
+
+## OpenCL
+
+This provides GPU acceleration through OpenCL on recent Adreno GPUs.
+More information about the OpenCL backend can be found in [OPENCL.md](./backend/OPENCL.md).
+
+### Android
+
+Assume NDK is available in `$ANDROID_NDK`. First, install OpenCL headers and ICD loader library if not available,
+
+```sh
+mkdir -p ~/dev/llm
+cd ~/dev/llm
+
+git clone https://github.com/KhronosGroup/OpenCL-Headers && \
+cd OpenCL-Headers && \
+cp -r CL $ANDROID_NDK/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
+
+cd ~/dev/llm
+
+git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader && \
+cd OpenCL-ICD-Loader && \
+mkdir build_ndk && cd build_ndk && \
+cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release \
+  -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
+  -DOPENCL_ICD_LOADER_HEADERS_DIR=$ANDROID_NDK/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include \
+  -DANDROID_ABI=arm64-v8a \
+  -DANDROID_PLATFORM=24 \
+  -DANDROID_STL=c++_shared && \
+ninja && \
+cp libOpenCL.so $ANDROID_NDK/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
+```
+
+Then build llama.cpp with OpenCL enabled,
+
+```sh
+cd ~/dev/llm
+
+git clone https://github.com/ggml-org/llama.cpp && \
+cd llama.cpp && \
+mkdir build-android && cd build-android
+
+cmake .. -G Ninja \
+  -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
+  -DANDROID_ABI=arm64-v8a \
+  -DANDROID_PLATFORM=android-28 \
+  -DBUILD_SHARED_LIBS=OFF \
+  -DGGML_OPENCL=ON
+
+ninja
+```
+
+### Windows Arm64
+
+First, install OpenCL headers and ICD loader library if not available,
+
+```powershell
+mkdir -p ~/dev/llm
+
+cd ~/dev/llm
+git clone https://github.com/KhronosGroup/OpenCL-Headers && cd OpenCL-Headers
+mkdir build && cd build
+cmake .. -G Ninja `
+  -DBUILD_TESTING=OFF `
+  -DOPENCL_HEADERS_BUILD_TESTING=OFF `
+  -DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
+  -DCMAKE_INSTALL_PREFIX="$HOME/dev/llm/opencl"
+cmake --build . --target install
+
+cd ~/dev/llm
+git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader && cd OpenCL-ICD-Loader
+mkdir build && cd build
+cmake .. -G Ninja `
+  -DCMAKE_BUILD_TYPE=Release `
+  -DCMAKE_PREFIX_PATH="$HOME/dev/llm/opencl" `
+  -DCMAKE_INSTALL_PREFIX="$HOME/dev/llm/opencl"
+cmake --build . --target install
+```
+
+Then build llama.cpp with OpenCL enabled,
+
+```powershell
+cmake .. -G Ninja `
+  -DCMAKE_TOOLCHAIN_FILE="$HOME/dev/llm/llama.cpp/cmake/arm64-windows-llvm.cmake" `
+  -DCMAKE_BUILD_TYPE=Release `
+  -DCMAKE_PREFIX_PATH="$HOME/dev/llm/opencl" `
+  -DBUILD_SHARED_LIBS=OFF `
+  -DGGML_OPENCL=ON
+ninja
+```
+
 ## Android

 To read documentation for how to build on Android, [click here](./android.md)
--- a/docs/development/HOWTO-add-model.md
+++ b/docs/development/HOWTO-add-model.md
@@ -9,10 +9,10 @@ Adding a model requires few steps:
 After following these steps, you can open PR.

 Also, it is important to check that the examples and main ggml backends (CUDA, METAL, CPU) are working with the new architecture, especially:
- [main](/examples/main/)
- [imatrix](/examples/imatrix/)
- [quantize](/examples/quantize/)
- [server](/examples/server/)
+- [main](/tools/main/)
+- [imatrix](/tools/imatrix/)
+- [quantize](/tools/quantize/)
+- [server](/tools/server/)

 ### 1. Convert the model to GGUF

--- a/docs/install.md
+++ b/docs/install.md
@@ -9,6 +9,13 @@ brew install llama.cpp
 ```
 The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggml-org/llama.cpp/discussions/7668

+## MacPorts
+
+```sh
+sudo port install llama.cpp
+```
+see also: https://ports.macports.org/port/llama.cpp/details/
+
 ## Nix

 On Mac and Linux, the Nix package manager can be used via
--- a/examples/llava/MobileVLM-README.md
+++ b/examples/llava/MobileVLM-README.md
@@ -9,15 +9,15 @@ The implementation is based on llava, and is compatible with llava and mobileVLM
 Notice: The overall process of model inference for both **MobileVLM** and **MobileVLM_V2** models is the same, but the process of model conversion is a little different. Therefore, using **MobileVLM-1.7B** as an example, the different conversion step will be shown.

 ## Usage
-Build with cmake or run `make llama-llava-cli` to build it.

-After building, run: `./llama-llava-cli` to see the usage. For example:
+Build the `llama-mtmd-cli` binary.
+
+After building, run: `./llama-mtmd-cli` to see the usage. For example:

 ```sh
-./llama-llava-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \
+./llama-mtmd-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \
    --mmproj MobileVLM-1.7B/mmproj-model-f16.gguf \
-    --image path/to/an/image.jpg \
-    -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? Answer the question using a single word or phrase. ASSISTANT:"
+    --chat-template deepseek
 ```

 ## Model conversion
@@ -33,13 +33,13 @@ git clone https://huggingface.co/openai/clip-vit-large-patch14-336
 2. Use `llava_surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:

 ```sh
-python ./examples/llava/llava_surgery.py -m path/to/MobileVLM-1.7B
+python ./tools/mtmd/llava_surgery.py -m path/to/MobileVLM-1.7B
 ```

 3. Use `convert_image_encoder_to_gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:

 ```sh
-python ./examples/llava/convert_image_encoder_to_gguf.py \
+python ./tools/mtmd/convert_image_encoder_to_gguf.py \
    -m path/to/clip-vit-large-patch14-336 \
    --llava-projector path/to/MobileVLM-1.7B/llava.projector \
    --output-dir path/to/MobileVLM-1.7B \
@@ -47,7 +47,7 @@ python ./examples/llava/convert_image_encoder_to_gguf.py \
 ```

 ```sh
-python ./examples/llava/convert_image_encoder_to_gguf.py \
+python ./tools/mtmd/convert_image_encoder_to_gguf.py \
    -m path/to/clip-vit-large-patch14-336 \
    --llava-projector path/to/MobileVLM-1.7B_V2/llava.projector \
    --output-dir path/to/MobileVLM-1.7B_V2 \
@@ -69,10 +69,10 @@ Now both the LLaMA part and the image encoder is in the `MobileVLM-1.7B` directo

 ## Android compile and run
 ### compile
-refer to `examples/llava/android/build_64.sh`
+refer to `tools/mtmd/android/build_64.sh`
 ```sh
-mkdir examples/llava/android/build_64
-cd examples/llava/android/build_64
+mkdir tools/mtmd/android/build_64
+cd tools/mtmd/android/build_64
 ../build_64.sh
 ```
 ### run on Android
@@ -82,7 +82,7 @@ refer to `android/adb_run.sh`, modify resources' `name` and `path`
 ### case 1
 **input**
 ```sh
-/data/local/tmp/llama-llava-cli \
+/data/local/tmp/llama-mtmd-cli \
    -m /data/local/tmp/ggml-model-q4_k.gguf \
    --mmproj /data/local/tmp/mmproj-model-f16.gguf \
    -t 4 \
@@ -102,7 +102,7 @@ llama_print_timings:       total time =   34731.93 ms
 ### case 2
 **input**
 ```sh
-/data/local/tmp/llama-llava-cli \
+/data/local/tmp/llama-mtmd-cli \
    -m /data/local/tmp/ggml-model-q4_k.gguf \
    --mmproj /data/local/tmp/mmproj-model-f16.gguf \
    -t 4 \
@@ -123,10 +123,10 @@ llama_print_timings:       total time =   34570.79 ms

 ## Some result on Android with `Snapdragon 778G` chip
 ### MobileVLM-1.7B case
-#### llava-cli release-b2005
+#### mtmd-cli release-b2005
 **input**
 ```sh
-/data/local/tmp/llama-llava-cli \
+/data/local/tmp/llama-mtmd-cli \
    -m /data/local/tmp/ggml-model-q4_k.gguf \
    --mmproj /data/local/tmp/mmproj-model-f16.gguf \
    -t 4 \
@@ -147,7 +147,7 @@ llama_print_timings: prompt eval time =    8119.49 ms /   191 tokens (   42.51 m
 llama_print_timings:        eval time =    1005.75 ms /    14 runs   (   71.84 ms per token,    13.92 tokens per second)
 llama_print_timings:       total time =   28038.34 ms /   205 tokens
 ```
-#### llava-cli latest-version
+#### mtmd-cli latest-version
 **input**

 Just the same as above.
@@ -169,7 +169,7 @@ llama_print_timings:        eval time =   43894.02 ms /    13 runs   ( 3376.46 m
 llama_print_timings:       total time =  865441.76 ms /   204 tokens
 ```
 ### MobileVLM_V2-1.7B case
-#### llava-cli release-2005b
+#### mtmd-cli release-2005b
 **input**

 Just the same as above.
@@ -200,7 +200,7 @@ make GGML_CUDA=1 CUDA_DOCKER_ARCH=sm_87 GGML_CUDA_F16=1 -j 32
 ### case 1
 **input**
 ```sh
-./llama-llava-cli \
+./llama-mtmd-cli \
    -m /data/local/tmp/ggml-model-q4_k.gguf \
    --mmproj /data/local/tmp/mmproj-model-f16.gguf \
    --image /data/local/tmp/demo.jpeg \
@@ -224,7 +224,7 @@ llama_print_timings:       total time =    1352.63 ms /   252 tokens
 ### case 2
 **input**
 ```sh
-./llama-llava-cli \
+./llama-mtmd-cli \
    -m /data/local/tmp/ggml-model-q4_k.gguf \
    --mmproj /data/local/tmp/mmproj-model-f16.gguf \
    -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:" \
--- a/docs/multimodal/gemma3.md
+++ b/docs/multimodal/gemma3.md
@@ -0,0 +1,51 @@
+# Gemma 3 vision
+
+> [!IMPORTANT]
+>
+> This is very experimental and intended for demo purposes only.
+
+## Quick start
+
+You can use the pre-quantized models from the [ggml-org](https://huggingface.co/ggml-org) Hugging Face account:
+
+```bash
+# build
+cmake -B build
+cmake --build build --target llama-mtmd-cli
+
+# alternatively, install from brew (MacOS)
+brew install llama.cpp
+
+# run it
+llama-mtmd-cli -hf ggml-org/gemma-3-4b-it-GGUF
+llama-mtmd-cli -hf ggml-org/gemma-3-12b-it-GGUF
+llama-mtmd-cli -hf ggml-org/gemma-3-27b-it-GGUF
+
+# note: 1B model does not support vision
+```
+
+## How to get mmproj.gguf?
+
+Simply add `--mmproj` when converting the model via `convert_hf_to_gguf.py`:
+
+```bash
+cd gemma-3-4b-it
+python ../llama.cpp/convert_hf_to_gguf.py --outfile model.gguf --outtype f16 --mmproj .
+# output file: mmproj-model.gguf
+```
+
+## How to run it?
+
+What you need:
+- The text model GGUF, which can be converted using `convert_hf_to_gguf.py`
+- The mmproj file from the step above
+- An image file
+
+```bash
+# build
+cmake -B build
+cmake --build build --target llama-mtmd-cli
+
+# run it
+./build/bin/llama-mtmd-cli -m {text_model}.gguf --mmproj mmproj.gguf --image your_image.jpg
+```
--- a/examples/llava/README-glmedge.md
+++ b/examples/llava/README-glmedge.md
@@ -3,12 +3,12 @@
 Currently this implementation supports [glm-edge-v-2b](https://huggingface.co/THUDM/glm-edge-v-2b) and [glm-edge-v-5b](https://huggingface.co/THUDM/glm-edge-v-5b).

 ## Usage
-Build with cmake or run `make llama-llava-cli` to build it.
+Build the `llama-mtmd-cli` binary.

-After building, run: `./llama-llava-cli` to see the usage. For example:
+After building, run: `./llama-mtmd-cli` to see the usage. For example:

 ```sh
-./llama-llava-cli -m model_path/ggml-model-f16.gguf --mmproj model_path/mmproj-model-f16.gguf --image img_path/image.jpg -p "<|system|>\n system prompt <image><|user|>\n prompt <|assistant|>\n"
+./llama-mtmd-cli -m model_path/ggml-model-f16.gguf --mmproj model_path/mmproj-model-f16.gguf
 ```

 **note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so.
@@ -25,13 +25,13 @@ git clone https://huggingface.co/THUDM/glm-edge-v-5b or https://huggingface.co/T
 2. Use `glmedge-surgery.py` to split the GLMV-EDGE model to LLM and multimodel projector constituents:

 ```sh
-python ./examples/llava/glmedge-surgery.py -m ../model_path
+python ./tools/mtmd/glmedge-surgery.py -m ../model_path
 ```

 4. Use `glmedge-convert-image-encoder-to-gguf.py` to convert the GLMV-EDGE image encoder to GGUF:

 ```sh
-python ./examples/llava/glmedge-convert-image-encoder-to-gguf.py -m ../model_path --llava-projector ../model_path/glm.projector --output-dir ../model_path
+python ./tools/mtmd/glmedge-convert-image-encoder-to-gguf.py -m ../model_path --llava-projector ../model_path/glm.projector --output-dir ../model_path
 ```

 5. Use `examples/convert_hf_to_gguf.py` to convert the LLM part of GLMV-EDGE to GGUF:
--- a/examples/llava/README-granitevision.md
+++ b/examples/llava/README-granitevision.md
@@ -176,15 +176,11 @@ Note that currently you cannot quantize the visual encoder because granite visio


 ### 5. Running the Model in Llama cpp
-Build llama cpp normally; you should have a target binary named `llama-llava-cli`, which you can pass two binaries to. As an example, we pass the the llama.cpp banner.
+Build llama cpp normally; you should have a target binary named `llama-mtmd-cli`, which you can pass two binaries to. As an example, we pass the llama.cpp banner.

 ```bash
-$ ./build/bin/llama-llava-cli -m $LLM_GGUF_PATH \
+$ ./build/bin/llama-mtmd-cli -m $LLM_GGUF_PATH \
    --mmproj $VISUAL_GGUF_PATH \
-    --image ./media/llama0-banner.png \
    -c 16384 \
-    -p "<|system|>\nA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n<|user|>\n\<image>\nWhat does the text in this image say?\n<|assistant|>\n" \
    --temp 0
 ```
-
-Sample output: `The text in the image reads "LLAMA C++ Can it run DOOM Llama?"`
--- a/docs/multimodal/llava.md
+++ b/docs/multimodal/llava.md
@@ -11,12 +11,14 @@ For llava-1.6 a variety of prepared gguf models are available as well [7b-34b](h
 After API is confirmed, more models will be supported / uploaded.

 ## Usage
-Build with cmake or run `make llama-llava-cli` to build it.
+Build the `llama-mtmd-cli` binary.

-After building, run: `./llama-llava-cli` to see the usage. For example:
+After building, run: `./llama-mtmd-cli` to see the usage. For example:

 ```sh
-./llama-llava-cli -m ../llava-v1.5-7b/ggml-model-f16.gguf --mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
+./llama-mtmd-cli -m ../llava-v1.5-7b/ggml-model-f16.gguf \
+    --mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf \
+    --chat-template vicuna
 ```

 **note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so.
@@ -35,19 +37,19 @@ git clone https://huggingface.co/openai/clip-vit-large-patch14-336
 2. Install the required Python packages:

 ```sh
-pip install -r examples/llava/requirements.txt
+pip install -r tools/mtmd/requirements.txt
 ```

 3. Use `llava_surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:

 ```sh
-python ./examples/llava/llava_surgery.py -m ../llava-v1.5-7b
+python ./tools/mtmd/llava_surgery.py -m ../llava-v1.5-7b
 ```

 4. Use `convert_image_encoder_to_gguf.py` to convert the LLaVA image encoder to GGUF:

 ```sh
-python ./examples/llava/convert_image_encoder_to_gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
+python ./tools/mtmd/convert_image_encoder_to_gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
 ```

 5. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF:
@@ -67,12 +69,12 @@ git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b
 2) Install the required Python packages:

 ```sh
-pip install -r examples/llava/requirements.txt
+pip install -r tools/mtmd/requirements.txt
 ```

 3) Use `llava_surgery_v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models:
 ```console
-python examples/llava/llava_surgery_v2.py -C -m ../llava-v1.6-vicuna-7b/
+python tools/mtmd/llava_surgery_v2.py -C -m ../llava-v1.6-vicuna-7b/
 ```
 - you will find a llava.projector and a llava.clip file in your model directory

@@ -86,7 +88,7 @@ curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.jso

 5) Create the visual gguf model:
 ```console
-python ./examples/llava/convert_image_encoder_to_gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
+python ./tools/mtmd/convert_image_encoder_to_gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
 ```
 - This is similar to llava-1.5, the difference is that we tell the encoder that we are working with the pure vision model part of CLIP

@@ -97,7 +99,7 @@ python ./examples/convert_legacy_llama.py ../llava-v1.6-vicuna-7b/ --skip-unknow

 7) And finally we can run the llava cli using the 1.6 model version:
 ```console
-./llama-llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096
+./llama-mtmd-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf
 ```

 **note** llava-1.6 needs more context than llava-1.5, at least 3000 is needed (just run it at -c 4096)
@@ -122,17 +124,9 @@ model.language_model.save_pretrained(llm_export_path)

 Then, you can convert the LLM using the `convert_hf_to_gguf.py` script, which handles more LLM architectures.

-## llava-cli templating and llava-1.6 prompting
+## Chat template

-llava-1.5 models all use the same vicuna prompt, here you can just add your image question like `-p "Provide a full description."`
-For llava-1.5 models which are not vicuna (mistral and Yi) you need to adapt system prompt as well as user prompt, for this purpose llava-cli has a basic templating system:
-
-**For Mistral and using llava-cli binary:**
-Add this: `-p "<image>\nUSER:\nProvide a full description.\nASSISTANT:\n"`
-The mistral template for llava-1.6 seems to be no system print and a USER/ASSISTANT role
-
-**For the 34B this should work:**
-Add this: `-e -p <|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n<image>\nProvide a full description.<|im_end|><|im_start|>assistant\n`
+For llava-1.5 and llava-1.6, you need to use the `vicuna` chat template. Simply add `--chat-template vicuna` to activate this template.


 ## How to know if you are running in llava-1.5 or llava-1.6 mode
@@ -147,12 +141,3 @@ When running llava-cli you will see a visual information right before the prompt


 Alternatively just pay notice to how many "tokens" have been used for your prompt, it will also show 1000+ tokens for llava-1.6
-
-
-
-
-## TODO
-
- [x] Support non-CPU backend for the image encoding part.
- [ ] Support different sampling methods.
- [ ] Support more model variants.
--- a/examples/llava/README-minicpmo2.6.md
+++ b/examples/llava/README-minicpmo2.6.md
@@ -29,8 +29,8 @@ cmake --build build --config Release
 Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf) by us)

 ```bash
-python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-o-2_6
-python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-o-2_6 --minicpmv-projector ../MiniCPM-o-2_6/minicpmv.projector --output-dir ../MiniCPM-o-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 4
+python ./tools/mtmd/minicpmv-surgery.py -m ../MiniCPM-o-2_6
+python ./tools/mtmd/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-o-2_6 --minicpmv-projector ../MiniCPM-o-2_6/minicpmv.projector --output-dir ../MiniCPM-o-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 4
 python ./convert_hf_to_gguf.py ../MiniCPM-o-2_6/model

 # quantize int4 version
@@ -40,9 +40,9 @@ python ./convert_hf_to_gguf.py ../MiniCPM-o-2_6/model

 Inference on Linux or Mac
 ```bash
-# run f16 version
-./build/bin/llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
+# run in single-turn mode
+./build/bin/llama-mtmd-cli -m ../MiniCPM-o-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"

-# run quantized int4 version
-./build/bin/llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg  -p "What is in the image?"
+# run in conversation mode
+./build/bin/llama-mtmd-cli -m ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf
 ```
--- a/examples/llava/README-minicpmv2.5.md
+++ b/examples/llava/README-minicpmv2.5.md
@@ -28,8 +28,8 @@ cmake --build build --config Release
 Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) by us)

 ```bash
-python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
-python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2
+python ./tools/mtmd/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
+python ./tools/mtmd/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2
 python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model

 # quantize int4 version
@@ -39,9 +39,9 @@ python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model

 Inference on Linux or Mac
 ```bash
-# run f16 version
-./build/bin/llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
+# run in single-turn mode
+./build/bin/llama-mtmd-cli -m ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"

-# run quantized int4 version
-./build/bin/llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg  -p "What is in the image?"
+# run in conversation mode
+./build/bin/llama-mtmd-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf
 ```
--- a/examples/llava/README-minicpmv2.6.md
+++ b/examples/llava/README-minicpmv2.6.md
@@ -28,8 +28,8 @@ cmake --build build --config Release
 Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) by us)

 ```bash
-python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-V-2_6
-python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-2_6 --minicpmv-projector ../MiniCPM-V-2_6/minicpmv.projector --output-dir ../MiniCPM-V-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 3
+python ./tools/mtmd/minicpmv-surgery.py -m ../MiniCPM-V-2_6
+python ./tools/mtmd/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-2_6 --minicpmv-projector ../MiniCPM-V-2_6/minicpmv.projector --output-dir ../MiniCPM-V-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 3
 python ./convert_hf_to_gguf.py ../MiniCPM-V-2_6/model

 # quantize int4 version
@@ -39,9 +39,9 @@ python ./convert_hf_to_gguf.py ../MiniCPM-V-2_6/model

 Inference on Linux or Mac
 ```bash
-# run f16 version
-./build/bin/llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
+# run in single-turn mode
+./build/bin/llama-mtmd-cli -m ../MiniCPM-V-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"

-# run quantized int4 version
-./build/bin/llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg  -p "What is in the image?"
+# run in conversation mode
+./build/bin/llama-mtmd-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf
 ```
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -12,60 +12,29 @@ llama_add_compile_flags()

 # examples

-include_directories(${CMAKE_CURRENT_SOURCE_DIR})
-
 if (EMSCRIPTEN)
 else()
-    add_subdirectory(batched-bench)
    add_subdirectory(batched)
    add_subdirectory(embedding)
    add_subdirectory(eval-callback)

-    if (NOT WIN32)
-        # disabled on Windows because it uses internal functions not exported with LLAMA_API
-        add_subdirectory(gbnf-validator)
-    endif()
-
    add_subdirectory(gguf-hash)
-    add_subdirectory(gguf-split)
    add_subdirectory(gguf)
    add_subdirectory(gritlm)
-    add_subdirectory(imatrix)
-    add_subdirectory(infill)
-    add_subdirectory(llama-bench)
    add_subdirectory(lookahead)
    add_subdirectory(lookup)
-    add_subdirectory(main)
    add_subdirectory(parallel)
    add_subdirectory(passkey)
-    add_subdirectory(perplexity)
-    add_subdirectory(quantize)
    add_subdirectory(retrieval)
-    if (LLAMA_BUILD_SERVER)
-        add_subdirectory(server)
-    endif()
    add_subdirectory(save-load-state)
-    add_subdirectory(run)
    add_subdirectory(simple)
    add_subdirectory(simple-chat)
    add_subdirectory(speculative)
    add_subdirectory(speculative-simple)
-    add_subdirectory(tokenize)
-    add_subdirectory(tts)
    add_subdirectory(gen-docs)
    if (NOT GGML_BACKEND_DL)
-        # these examples use the backends directly and cannot be built with dynamic loading
        add_subdirectory(convert-llama2c-to-ggml)
-        add_subdirectory(cvector-generator)
-        add_subdirectory(export-lora)
-        if (NOT WIN32)
-            # disabled on Windows because it uses internal functions not exported with LLAMA_API
-            add_subdirectory(quantize-stats)
-        endif()
-        add_subdirectory(llava)
-        if (GGML_RPC)
-            add_subdirectory(rpc)
-        endif()
+        # these examples use the backends directly and cannot be built with dynamic loading
        if (GGML_SYCL)
            add_subdirectory(sycl)
        endif()
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -41,7 +41,7 @@ int main(int argc, char ** argv) {

    llama_model_params model_params = common_model_params_to_llama(params);

-    llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
+    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);

    if (model == NULL) {
        LOG_ERR("%s: error: unable to load model\n" , __func__);
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -35,23 +35,14 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke

 static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
    const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
-    const struct llama_model * model = llama_get_model(ctx);

    // clear previous kv_cache values (irrelevant for embeddings)
    llama_kv_self_clear(ctx);

    // run model
    LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
-    if (llama_model_has_encoder(model) && !llama_model_has_decoder(model)) {
-        // encoder-only model
-        if (llama_encode(ctx, batch) < 0) {
-            LOG_ERR("%s : failed to encode\n", __func__);
-        }
-    } else if (!llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
-        // decoder-only model
-        if (llama_decode(ctx, batch) < 0) {
-            LOG_ERR("%s : failed to decode\n", __func__);
-        }
+    if (llama_encode(ctx, batch) < 0) {
+        LOG_ERR("%s : failed to encode\n", __func__);
    }

    for (int i = 0; i < batch.n_tokens; i++) {
@@ -89,6 +80,13 @@ int main(int argc, char ** argv) {
    common_init();

    params.embedding = true;
+
+    // utilize the full context
+    if (params.n_batch < params.n_ctx) {
+        LOG_WRN("%s: setting batch size to %d\n", __func__, params.n_ctx);
+        params.n_batch = params.n_ctx;
+    }
+
    // For non-causal models, batch size must be equal to ubatch size
    params.n_ubatch = params.n_batch;

@@ -134,7 +132,6 @@ int main(int argc, char ** argv) {

    // max batch size
    const uint64_t n_batch = params.n_batch;
-    GGML_ASSERT(params.n_batch >= params.n_ctx);

    // tokenize the prompts and trim
    std::vector<std::vector<int32_t>> inputs;
--- a/examples/gbnf-validator/CMakeLists.txt
+++ b/examples/gbnf-validator/CMakeLists.txt
@@ -1,5 +0,0 @@
-set(TARGET llama-gbnf-validator)
-add_executable(${TARGET} gbnf-validator.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_17)
--- a/examples/gritlm/gritlm.cpp
+++ b/examples/gritlm/gritlm.cpp
@@ -168,7 +168,7 @@ int main(int argc, char * argv[]) {

    llama_backend_init();

-    llama_model * model = llama_model_load_from_file(params.model.c_str(), mparams);
+    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);

    // create generation context
    llama_context * ctx = llama_init_from_model(model, cparams);
--- a/examples/infill/CMakeLists.txt
+++ b/examples/infill/CMakeLists.txt
@@ -1,5 +0,0 @@
-set(TARGET llama-infill)
-add_executable(${TARGET} infill.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_17)
--- a/examples/infill/README.md
+++ b/examples/infill/README.md
@@ -1,47 +0,0 @@
-# llama.cpp/example/infill
-
-This example shows how to use the infill mode with Code Llama models supporting infill mode.
-Currently the 7B and 13B models support infill mode.
-
-Infill supports most of the options available in the main example.
-
-For further information have a look at the main README.md in llama.cpp/example/main/README.md
-
-## Common Options
-
-In this section, we cover the most commonly used options for running the `infill` program with the LLaMA models:
-
-   `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
-   `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
-   `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
-   `-c N, --ctx-size N`: Set the size of the prompt context. The default is 4096, but if a LLaMA model was built with a longer context, increasing this value will provide better results for longer input/inference.
-   `--spm-infill`: Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this.
-
-## Input Prompts
-
-The `infill` program provides several ways to interact with the LLaMA models using input prompts:
-
-   `--in-prefix PROMPT_BEFORE_CURSOR`: Provide the prefix directly as a command-line option.
-   `--in-suffix PROMPT_AFTER_CURSOR`: Provide the suffix directly as a command-line option.
-   `--interactive-first`: Run the program in interactive mode and wait for input right away. (More on this below.)
-
-## Interaction
-
-The `infill` program offers a seamless way to interact with LLaMA models, allowing users to receive real-time infill suggestions. The interactive mode can be triggered using `--interactive`, and `--interactive-first`
-
-### Interaction Options
-
-   `-i, --interactive`: Run the program in interactive mode, allowing users to get real time code suggestions from model.
-   `--interactive-first`: Run the program in interactive mode and immediately wait for user input before starting the text generation.
-   `--color`: Enable colorized output to differentiate visually distinguishing between prompts, user input, and generated text.
-
-### Example
-
-Download a model that supports infill, for example CodeLlama:
-```console
-scripts/hf.sh --repo TheBloke/CodeLlama-13B-GGUF --file codellama-13b.Q5_K_S.gguf --outdir models
-```
-
-```bash
-./llama-infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n    print(\"hell" --in-suffix "\n   print(\"goodbye world\")\n    "
-```
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -1,590 +0,0 @@
-#include "arg.h"
-#include "common.h"
-#include "console.h"
-#include "sampling.h"
-#include "log.h"
-#include "llama.h"
-
-#include <cassert>
-#include <cinttypes>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <ctime>
-#include <fstream>
-#include <iostream>
-#include <sstream>
-#include <string>
-#include <vector>
-
-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
-#include <signal.h>
-#include <unistd.h>
-#elif defined (_WIN32)
-#define WIN32_LEAN_AND_MEAN
-#ifndef NOMINMAX
-#define NOMINMAX
-#endif
-#include <windows.h>
-#include <signal.h>
-#endif
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-static llama_context           ** g_ctx;
-static llama_model             ** g_model;
-static common_sampler          ** g_smpl;
-static common_params            * g_params;
-static std::vector<llama_token> * g_input_tokens;
-static std::ostringstream       * g_output_ss;
-static std::vector<llama_token> * g_output_tokens;
-
-static bool is_interacting = false;
-
-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
-static void sigint_handler(int signo) {
-    if (signo == SIGINT) {
-        if (!is_interacting) {
-            is_interacting = true;
-        } else {
-            console::cleanup();
-            LOG("\n");
-            common_perf_print(*g_ctx, *g_smpl);
-
-            // make sure all logs are flushed
-            LOG("Interrupted by user\n");
-            common_log_pause(common_log_main());
-
-            _exit(130);
-        }
-    }
-}
-#endif
-
-int main(int argc, char ** argv) {
-    common_params params;
-    g_params = &params;
-
-    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) {
-        return 1;
-    }
-
-    common_init();
-
-    auto & sparams = params.sampling;
-
-    console::init(params.simple_io, params.use_color);
-    atexit([]() { console::cleanup(); });
-
-    if (params.logits_all) {
-        LOG_ERR("\n************\n");
-        LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
-        LOG_ERR("************\n\n");
-
-        return 0;
-    }
-
-    if (params.embedding) {
-        LOG_ERR("\n************\n");
-        LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
-        LOG_ERR("************\n\n");
-
-        return 0;
-    }
-
-    if (params.n_ctx != 0 && params.n_ctx < 8) {
-        LOG_WRN("%s: minimum context size is 8, using minimum size.\n", __func__);
-        params.n_ctx = 8;
-    }
-
-    if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) {
-        LOG_ERR("\n************\n");
-        LOG_ERR("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
-        LOG_ERR("************\n\n");
-
-        return 0;
-    }
-
-    if (params.rope_freq_base != 0.0) {
-        LOG_WRN("%s: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
-    }
-
-    if (params.rope_freq_scale != 0.0) {
-        LOG_WRN("%s: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
-    }
-
-    LOG_INF("%s: llama backend init\n", __func__);
-    llama_backend_init();
-    llama_numa_init(params.numa);
-
-    llama_model * model = nullptr;
-    llama_context * ctx = nullptr;
-    common_sampler * smpl = nullptr;
-
-    g_model = &model;
-    g_ctx = &ctx;
-    g_smpl = &smpl;
-
-    // load the model and apply lora adapter, if any
-    LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
-    common_init_result llama_init = common_init_from_params(params);
-
-    model = llama_init.model.get();
-    ctx = llama_init.context.get();
-
-    if (model == NULL) {
-        LOG_ERR("%s: unable to load model\n", __func__);
-        return 1;
-    }
-
-    const llama_vocab * vocab = llama_model_get_vocab(model);
-
-    const int n_ctx_train = llama_model_n_ctx_train(model);
-    const int n_ctx = llama_n_ctx(ctx);
-    LOG_DBG("n_ctx: %d\n", n_ctx);
-
-    if (n_ctx > n_ctx_train) {
-        LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
-    }
-
-    // print system information
-    {
-        LOG_INF("\n");
-        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
-    }
-    const bool add_bos = llama_vocab_get_add_bos(vocab);
-    GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
-
-    std::vector<llama_token> embd_inp;
-    std::vector<llama_token> embd_end;
-    std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
-    std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);
-
-    GGML_ASSERT(llama_vocab_fim_pre(vocab) >= 0);
-    GGML_ASSERT(llama_vocab_fim_suf(vocab) >= 0);
-
-    inp_pfx.insert(inp_pfx.begin(), llama_vocab_fim_pre(vocab));
-    inp_sfx.insert(inp_sfx.begin(), llama_vocab_fim_suf(vocab));
-
-    embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
-    embd_end = params.spm_infill ? inp_pfx : inp_sfx;
-    if (add_bos) {
-        embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab));
-    }
-    embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
-
-    const llama_token middle_token = llama_vocab_fim_mid(vocab);
-    if (middle_token >= 0) {
-        embd_inp.push_back(middle_token);
-    }
-
-    LOG_DBG("add_bos: %d\n", add_bos);
-    LOG_DBG("prefix: \"%s\"\n", params.input_prefix.c_str());
-    LOG_DBG("suffix: \"%s\"\n", params.input_suffix.c_str());
-    LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str());
-
-    // Should not run without any tokens
-    if (embd_inp.empty()) {
-        embd_inp.push_back(llama_vocab_bos(vocab));
-        LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
-    }
-
-    if ((int) embd_inp.size() > n_ctx - 4) {
-        LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
-        return 1;
-    }
-
-    // number of tokens to keep when resetting context
-    if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size()) {
-        params.n_keep = (int)embd_inp.size();
-    }
-
-    LOG_INF("inp_pfx: %s\n", string_from(ctx, inp_pfx).c_str());
-    LOG_INF("inp_sfx: %s\n", string_from(ctx, inp_sfx).c_str());
-
-    // enable interactive mode if interactive start is specified
-    if (params.interactive_first) {
-        params.interactive = true;
-    }
-
-    if (params.verbose_prompt) {
-        LOG_INF("\n");
-        LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
-        LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
-        for (int i = 0; i < (int) embd_inp.size(); i++) {
-            LOG_INF("%6d -> '%s'\n", embd_inp[i], common_token_to_piece(ctx, embd_inp[i]).c_str());
-        }
-
-        if (params.n_keep > 0) {
-        LOG_INF("%s: static prompt based on n_keep: '", __func__);
-            for (int i = 0; i < params.n_keep; i++) {
-                LOG_CNT("%s", common_token_to_piece(ctx, embd_inp[i]).c_str());
-            }
-            LOG_CNT("'\n");
-        }
-        LOG_INF("\n");
-    }
-
-    if (params.interactive) {
-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
-        struct sigaction sigint_action;
-        sigint_action.sa_handler = sigint_handler;
-        sigemptyset (&sigint_action.sa_mask);
-        sigint_action.sa_flags = 0;
-        sigaction(SIGINT, &sigint_action, NULL);
-#elif defined (_WIN32)
-        auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
-            return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
-        };
-        SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
-#endif
-
-        LOG_INF("%s: interactive mode on.\n", __func__);
-
-        if (params.input_prefix_bos) {
-            LOG_INF("Input prefix with BOS\n");
-        }
-
-        if (!params.input_prefix.empty()) {
-            LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str());
-        }
-
-        if (!params.input_suffix.empty()) {
-            LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str());
-        }
-    }
-    smpl = common_sampler_init(model, sparams);
-
-    LOG_INF("sampler seed: %u\n",     common_sampler_get_seed(smpl));
-    LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
-    LOG_INF("sampler chain: %s\n",    common_sampler_print(smpl).c_str());
-
-    LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
-
-    LOG_INF("\n");
-    LOG_INF("\n#####  Infill mode  #####\n\n");
-    if (params.interactive) {
-        const char *control_message;
-        if (params.multiline_input) {
-            control_message = " - To return control to LLaMA, end your input with '\\'.\n"
-                              " - To return control without starting a new line, end your input with '/'.\n";
-        } else {
-            control_message = " - Press Return to return control to LLaMA.\n"
-                              " - To return control without starting a new line, end your input with '/'.\n"
-                              " - If you want to submit another line, end your input with '\\'.\n";
-        }
-        LOG_INF("== Running in interactive mode. ==\n");
-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
-        LOG_INF(       " - Press Ctrl+C to interject at any time.\n");
-#endif
-        LOG_INF(       "%s\n", control_message);
-
-        is_interacting = params.interactive_first;
-    }
-
-    bool input_echo = true;
-
-    int n_past     = 0;
-    int n_remain   = params.n_predict;
-    int n_consumed = 0;
-
-    std::vector<int>   input_tokens;  g_input_tokens  = &input_tokens;
-    std::vector<int>   output_tokens; g_output_tokens = &output_tokens;
-    std::ostringstream output_ss;     g_output_ss     = &output_ss;
-
-    // the first thing we will do is to output the prompt, so set color accordingly
-    console::set_display(console::prompt);
-
-    std::vector<llama_token> embd;
-
-    while (n_remain != 0 || params.interactive) {
-        // predict
-        if (!embd.empty()) {
-            // Note: n_ctx - 4 here is to match the logic for commandline prompt handling via
-            // --prompt or --file which uses the same value.
-            int max_embd_size = n_ctx - 4;
-
-            // Ensure the input doesn't exceed the context size by truncating embd if necessary.
-            if ((int) embd.size() > max_embd_size) {
-                const int skipped_tokens = (int) embd.size() - max_embd_size;
-                embd.resize(max_embd_size);
-
-                console::set_display(console::error);
-                LOG_WRN("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
-                console::set_display(console::reset);
-            }
-
-            // infinite text generation via context swapping
-            // if we run out of context:
-            // - take the n_keep first tokens from the original prompt (via n_past)
-            // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
-            if (n_past + (int) embd.size() > n_ctx) {
-                if (params.n_predict == -2) {
-                    LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
-                    break;
-                }
-
-                const int n_left    = n_past - params.n_keep - 1;
-                const int n_discard = n_left/2;
-
-                LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
-                    n_past, n_left, n_ctx, params.n_keep, n_discard);
-
-                llama_kv_self_seq_rm (ctx, 0, params.n_keep + 1            , params.n_keep + n_discard + 1);
-                llama_kv_self_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
-
-                n_past -= n_discard;
-
-                LOG_DBG("after swap: n_past = %d\n", n_past);
-
-                LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
-
-            }
-
-            // evaluate tokens in batches
-            // embd is typically prepared beforehand to fit within a batch, but not always
-            for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
-                int n_eval = (int) embd.size() - i;
-                if (n_eval > params.n_batch) {
-                    n_eval = params.n_batch;
-                }
-
-                LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
-
-                if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval))) {
-                    LOG_ERR("%s : failed to eval\n", __func__);
-                    return 1;
-                }
-
-                n_past += n_eval;
-
-                LOG_DBG("n_past = %d\n", n_past);
-            }
-
-        }
-
-        embd.clear();
-
-        if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
-            const llama_token id = common_sampler_sample(smpl, ctx, -1);
-
-            common_sampler_accept(smpl, id, true);
-
-            // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());
-
-            embd.push_back(id);
-
-            // echo this to console
-            input_echo = true;
-
-            // decrement remaining sampling budget
-            --n_remain;
-
-            LOG_DBG("n_remain: %d\n", n_remain);
-        } else {
-            // some user input remains from prompt or interaction, forward it to processing
-            LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
-            while ((int) embd_inp.size() > n_consumed) {
-                embd.push_back(embd_inp[n_consumed]);
-
-                // push the prompt in the sampling context in order to apply repetition penalties later
-                // for the prompt, we don't apply grammar rules
-                common_sampler_accept(smpl, embd_inp[n_consumed], false);
-
-                ++n_consumed;
-                if ((int) embd.size() >= params.n_batch) {
-                    break;
-                }
-            }
-        }
-
-        // display text
-        if (input_echo) {
-            for (auto id : embd) {
-                const std::string token_str = common_token_to_piece(ctx, id);
-                LOG("%s", token_str.c_str());
-
-                if (embd.size() > 1) {
-                    input_tokens.push_back(id);
-                } else {
-                    output_tokens.push_back(id);
-                    output_ss << token_str;
-                }
-            }
-        }
-        // reset color to default if we there is no pending user input
-        if (input_echo && (int) embd_inp.size() == n_consumed) {
-            console::set_display(console::reset);
-        }
-
-        // if not currently processing queued inputs;
-        if ((int) embd_inp.size() <= n_consumed) {
-            // deal with eot token in infill mode
-            if ((common_sampler_last(smpl) == llama_vocab_eot(vocab) || is_interacting) && params.interactive){
-                if (is_interacting && !params.interactive_first) {
-                    // print an eot token
-                    LOG("%s", common_token_to_piece(ctx, llama_vocab_eot(vocab)).c_str());
-                }
-                LOG("\n");
-                console::set_display(console::user_input);
-                std::string buffer;
-                std::string line;
-                bool another_line=true;
-                // set a new prefix via stdin
-                do {
-                    another_line = console::readline(line, params.multiline_input);
-                    buffer += line;
-                } while (another_line);
-                // check if we got an empty line, if so we use the old input
-                if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
-                    params.input_prefix = buffer;
-                }
-                buffer.clear();
-                // set a new suffix via stdin
-                do {
-                    another_line = console::readline(line, params.multiline_input);
-                    buffer += line;
-                } while (another_line);
-                // check if we got an empty line
-                if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
-                    params.input_suffix = buffer;
-                }
-                buffer.clear();
-                // done taking input, reset color
-                console::set_display(console::reset);
-
-                if (params.escape) {
-                    //process escape sequences, for the initial prompt this is done in common.cpp when we load the params, but for the interactive mode we need to do it here
-                    string_process_escapes(params.input_prefix);
-                    string_process_escapes(params.input_suffix);
-                }
-
-                // tokenize new prefix and suffix
-                std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
-                std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);
-
-                inp_pfx.insert(inp_pfx.begin(), llama_vocab_fim_pre(vocab));
-                inp_sfx.insert(inp_sfx.begin(), llama_vocab_fim_suf(vocab));
-
-                embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
-                embd_end = params.spm_infill ? inp_pfx : inp_sfx;
-                if (add_bos) {
-                    embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab));
-                }
-                embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
-
-                if (middle_token >= 0) {
-                    embd_inp.push_back(middle_token);
-                }
-
-                embd.clear();
-                n_remain = params.n_predict;
-                n_past = 0;
-                n_consumed = 0;
-                is_interacting = false;
-            }
-            // deal with end of generation tokens in interactive mode
-            else if (llama_vocab_is_eog(vocab, common_sampler_last(smpl))) {
-                LOG_DBG("found EOS token\n");
-
-                if (params.interactive) {
-
-                    is_interacting = true;
-                    LOG("\n");
-                    console::set_display(console::user_input);
-               }
-            }
-
-            if (n_past > 0 && is_interacting && !params.interactive) {
-                LOG_DBG("waiting for user input\n");
-
-                if (params.input_prefix_bos) {
-                    LOG_DBG("adding input prefix BOS token\n");
-                    embd_inp.push_back(llama_vocab_bos(vocab));
-                }
-
-                std::string buffer;
-                if (!params.input_prefix.empty()) {
-                    LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
-                    buffer += params.input_prefix;
-                    LOG("%s", buffer.c_str());
-                }
-
-                std::string line;
-                bool another_line = true;
-                do {
-                    another_line = console::readline(line, params.multiline_input);
-                    buffer += line;
-                } while (another_line);
-
-                // done taking input, reset color
-                console::set_display(console::reset);
-
-                // Add tokens to embd only if the input buffer is non-empty
-                // Entering a empty line lets the user pass control back
-                if (buffer.length() > 1) {
-                    // append input suffix if any
-                    if (!params.input_suffix.empty()) {
-                        LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
-                        buffer += params.input_suffix;
-                        LOG("%s", params.input_suffix.c_str());
-                    }
-
-                    LOG_DBG("buffer: '%s'\n", buffer.c_str());
-
-                    const size_t original_size = embd_inp.size();
-
-                    const auto line_inp = common_tokenize(ctx, buffer, false);
-                    LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());
-
-                    embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
-
-                    for (size_t i = original_size; i < embd_inp.size(); ++i) {
-                        const llama_token token = embd_inp[i];
-                        output_tokens.push_back(token);
-                        output_ss << common_token_to_piece(ctx, token);
-                    }
-
-                    n_remain -= line_inp.size();
-                    LOG_DBG("n_remain: %d\n", n_remain);
-                } else {
-                    LOG_DBG("empty line, passing control back\n");
-                }
-
-                input_echo = false; // do not echo this again
-            }
-
-            if (n_past > 0) {
-                if (is_interacting) {
-                    common_sampler_reset(smpl);
-                }
-                is_interacting = false;
-            }
-        }
-
-        // end of generation
-        if (!embd.empty() && llama_vocab_is_eog(vocab, embd.back()) && !params.interactive) {
-            break;
-        }
-
-        // In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
-        // We skip this logic when n_predict == -1 (infinite) or -2 (stop at context size).
-        if (params.interactive && n_remain <= 0 && params.n_predict >= 0) {
-            n_remain = params.n_predict;
-            is_interacting = true;
-        }
-    }
-    if (!params.interactive && n_remain <= 0) {
-        LOG("%s", common_token_to_piece(ctx, llama_vocab_eot(vocab)).c_str());
-    }
-
-    LOG("\n");
-    common_perf_print(ctx, smpl);
-
-    common_sampler_free(smpl);
-    llama_backend_free();
-
-    return 0;
-}
--- a/examples/json_schema_to_grammar.py
+++ b/examples/json_schema_to_grammar.py
@@ -10,6 +10,9 @@ from typing import Any, List, Optional, Set, Tuple, Union

 def _build_repetition(item_rule, min_items, max_items, separator_rule=None):

+    if max_items == 0:
+        return ""
+
    if min_items == 0 and max_items == 1:
        return f'{item_rule}?'

--- a/examples/llama.android/llama/build.gradle.kts
+++ b/examples/llama.android/llama/build.gradle.kts
@@ -18,6 +18,7 @@ android {
        }
        externalNativeBuild {
            cmake {
+                arguments += "-DLLAMA_CURL=OFF"
                arguments += "-DLLAMA_BUILD_COMMON=ON"
                arguments += "-DGGML_LLAMAFILE=OFF"
                arguments += "-DCMAKE_BUILD_TYPE=Release"
--- a/examples/llava/CMakeLists.txt
+++ b/examples/llava/CMakeLists.txt
@@ -1,66 +0,0 @@
-add_library(llava OBJECT
-            llava.cpp
-            llava.h
-            clip.cpp
-            clip.h
-            )
-
-target_link_libraries(llava PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
-
-target_include_directories(llava PUBLIC .)
-target_include_directories(llava PUBLIC ../..)
-target_include_directories(llava PUBLIC ../../common)
-
-target_compile_features(llava PRIVATE cxx_std_17)
-
-add_library(llava_static STATIC $<TARGET_OBJECTS:llava>)
-if (BUILD_SHARED_LIBS)
-    set_target_properties(llava PROPERTIES POSITION_INDEPENDENT_CODE ON)
-    target_compile_definitions(llava PRIVATE LLAMA_SHARED LLAMA_BUILD)
-    add_library(llava_shared SHARED $<TARGET_OBJECTS:llava>)
-    target_link_libraries(llava_shared PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
-    install(TARGETS llava_shared LIBRARY)
-endif()
-
-if (NOT MSVC)
-    target_compile_options(llava PRIVATE -Wno-cast-qual) # stb_image.h
-endif()
-
-if(TARGET BUILD_INFO)
-    add_dependencies(llava BUILD_INFO)
-endif()
-
-set(TARGET llama-llava-cli)
-add_executable(${TARGET} llava-cli.cpp)
-set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-cli)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_17)
-
-set(TARGET llama-minicpmv-cli)
-add_executable(${TARGET} minicpmv-cli.cpp)
-set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-minicpmv-cli)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_17)
-
-set(TARGET llama-qwen2vl-cli)
-add_executable(${TARGET} qwen2vl-cli.cpp)
-set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-qwen2vl-cli)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_17)
-
-set(TARGET llama-gemma3-cli)
-add_executable(${TARGET} gemma3-cli.cpp)
-set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-gemma3-cli)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_17)
-
-set(TARGET llama-llava-clip-quantize-cli)
-add_executable(${TARGET} clip-quantize-cli.cpp)
-set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-clip-quantize-cli)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_17)
--- a/examples/llava/README-gemma3.md
+++ b/examples/llava/README-gemma3.md
@@ -1,30 +0,0 @@
-# Gemma 3 vision
-
-> [!IMPORTANT]
->
-> This is very experimental, only used for demo purpose.
-
-## How to get mmproj.gguf?
-
-```bash
-cd gemma-3-4b-it
-python ../llama.cpp/examples/llava/gemma3_convert_encoder_to_gguf.py .
-
-# output file is mmproj.gguf
-```
-
-## How to run it?
-
-What you need:
- The text model GGUF, can be converted using `convert_hf_to_gguf.py`
- The mmproj file from step above
- An image file
-
-```bash
-# build
-cmake -B build
-cmake --build build --target llama-gemma3-cli
-
-# run it
-./build/bin/llama-gemma3-cli -m {text_model}.gguf --mmproj mmproj.gguf --image your_image.jpg
-```
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
--- a/examples/llava/gemma3-cli.cpp
+++ b/examples/llava/gemma3-cli.cpp
@@ -1,341 +0,0 @@
-#include "arg.h"
-#include "log.h"
-#include "common.h"
-#include "sampling.h"
-#include "clip.h"
-#include "stb_image.h"
-#include "llama.h"
-#include "ggml.h"
-#include "console.h"
-
-#include <vector>
-#include <limits.h>
-#include <inttypes.h>
-
-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
-#include <signal.h>
-#include <unistd.h>
-#elif defined (_WIN32)
-#define WIN32_LEAN_AND_MEAN
-#ifndef NOMINMAX
-#define NOMINMAX
-#endif
-#include <windows.h>
-#include <signal.h>
-#endif
-
-static bool g_is_generating = false;
-
-/**
- * Please note that this is NOT a production-ready stuff.
- * It is a playground for trying Gemma 3 vision capabilities.
- * For contributors: please keep this code simple and easy to understand.
- */
-
-static void show_additional_info(int /*argc*/, char ** argv) {
-    LOG(
-        "Experimental CLI for using Gemma 3 vision model\n\n"
-        "Usage: %s [options] -m <model> --mmproj <mmproj> --image <image> -p <prompt>\n\n"
-        "  -m and --mmproj are required\n"
-        "  --image and -p are optional, if NOT provided, the CLI will run in chat mode\n",
-        argv[0]
-    );
-}
-
-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
-static void sigint_handler(int signo) {
-    if (signo == SIGINT) {
-        if (g_is_generating) {
-            g_is_generating = false;
-        } else {
-            console::cleanup();
-            LOG("\nInterrupted by user\n");
-            _exit(130);
-        }
-    }
-}
-#endif
-
-struct gemma3_context {
-    struct clip_ctx    * ctx_clip = NULL;
-    common_init_result   llama_init;
-
-    llama_model       * model;
-    llama_context     * lctx;
-    const llama_vocab * vocab;
-    llama_batch         batch;
-
-    int n_threads    = 1;
-    llama_pos n_past = 0;
-
-    gemma3_context(common_params & params) : llama_init(common_init_from_params(params)) {
-        model = llama_init.model.get();
-        lctx = llama_init.context.get();
-        vocab = llama_model_get_vocab(model);
-        n_threads = params.cpuparams.n_threads;
-        batch = llama_batch_init(params.n_batch, 0, 1);
-        init_clip_model(params);
-    }
-
-    void init_clip_model(common_params & params) {
-        const char * clip_path = params.mmproj.c_str();
-        ctx_clip = clip_model_load(clip_path, params.verbosity > 1);
-    }
-
-    ~gemma3_context() {
-        clip_free(ctx_clip);
-    }
-};
-
-struct decode_embd_batch {
-    std::vector<llama_pos>      pos;
-    std::vector<int32_t>        n_seq_id;
-    std::vector<llama_seq_id>   seq_id_0;
-    std::vector<llama_seq_id *> seq_ids;
-    std::vector<int8_t>         logits;
-    llama_batch batch;
-    decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
-        pos     .resize(n_tokens);
-        n_seq_id.resize(n_tokens);
-        seq_ids .resize(n_tokens + 1);
-        logits  .resize(n_tokens);
-        seq_id_0.resize(1);
-        seq_id_0[0] = seq_id;
-        seq_ids [n_tokens] = nullptr;
-        batch = {
-            /*n_tokens       =*/ n_tokens,
-            /*tokens         =*/ nullptr,
-            /*embd           =*/ embd,
-            /*pos            =*/ pos.data(),
-            /*n_seq_id       =*/ n_seq_id.data(),
-            /*seq_id         =*/ seq_ids.data(),
-            /*logits         =*/ logits.data(),
-        };
-        for (int i = 0; i < n_tokens; i++) {
-            batch.pos     [i] = pos_0 + i;
-            batch.n_seq_id[i] = 1;
-            batch.seq_id  [i] = seq_id_0.data();
-            batch.logits  [i] = false;
-        }
-    }
-};
-
-static int eval_text(gemma3_context & ctx, std::string input, bool logits_last = false) {
-    llama_tokens tokens = common_tokenize(ctx.lctx, input, false, true);
-    common_batch_clear(ctx.batch);
-    for (llama_token & t : tokens) {
-        common_batch_add(ctx.batch, t, ctx.n_past++, {0}, false);
-    }
-    if (logits_last) {
-        ctx.batch.logits[ctx.batch.n_tokens - 1] = true;
-    }
-    // LOG("eval_text (n_tokens = %d): %s\n", (int)tokens.size(), input.c_str());
-    if (llama_decode(ctx.lctx, ctx.batch)) {
-        LOG_ERR("Failed to decode text\n");
-        return 1;
-    }
-    return 0;
-}
-
-static int eval_image(gemma3_context & ctx, std::string & fname) {
-    std::vector<float> image_embd_v;
-    int n_embd = llama_model_n_embd(ctx.model);
-    int n_tokens = 256;
-    image_embd_v.resize(n_tokens * n_embd);
-
-    bool ok;
-    struct clip_image_u8 * img_u8 = clip_image_u8_init();
-    ok = clip_image_load_from_file(fname.c_str(), img_u8);
-    if (!ok) {
-        LOG_ERR("Unable to load image %s\n", fname.c_str());
-        clip_image_u8_free(img_u8);
-        return 2; // non-fatal error
-    }
-
-    clip_image_f32_batch batch_f32;
-    ok = clip_image_preprocess(ctx.ctx_clip, img_u8, &batch_f32);
-    if (!ok) {
-        LOG_ERR("Unable to preprocess image\n");
-        clip_image_f32_batch_free(&batch_f32);
-        clip_image_u8_free(img_u8);
-        return 1;
-    }
-
-    int64_t t0 = ggml_time_ms();
-    LOG("Encoding image %s\n", fname.c_str());
-    ok = clip_image_batch_encode(ctx.ctx_clip, ctx.n_threads, &batch_f32, image_embd_v.data());
-    if (!ok) {
-        LOG_ERR("Unable to encode image\n");
-        clip_image_f32_batch_free(&batch_f32);
-        clip_image_u8_free(img_u8);
-        return 1;
-    }
-    LOG("Image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
-
-    clip_image_f32_batch_free(&batch_f32);
-    clip_image_u8_free(img_u8);
-
-    // decode image embeddings
-    int64_t t1 = ggml_time_ms();
-    eval_text(ctx, "<start_of_image>");
-    llama_set_causal_attn(ctx.lctx, false);
-    decode_embd_batch batch_img(image_embd_v.data(), n_tokens, ctx.n_past, 0);
-    if (llama_decode(ctx.lctx, batch_img.batch)) {
-        LOG_ERR("failed to decode image\n");
-        return 1;
-    }
-    ctx.n_past += n_tokens;
-    llama_set_causal_attn(ctx.lctx, true);
-    eval_text(ctx, "<end_of_image>");
-    LOG("Image decoded in %" PRId64 " ms\n", ggml_time_ms() - t1);
-    return 0;
-}
-
-static int generate_response(gemma3_context & ctx, common_sampler * smpl, int n_predict) {
-    for (int i = 0; i < n_predict; i++) {
-        if (i > n_predict || !g_is_generating) {
-            printf("\n");
-            break;
-        }
-
-        llama_token token_id = common_sampler_sample(smpl, ctx.lctx, -1);
-        common_sampler_accept(smpl, token_id, true);
-
-        if (llama_vocab_is_eog(ctx.vocab, token_id)) {
-            printf("\n");
-            break; // end of generation
-        }
-
-        printf("%s", common_token_to_piece(ctx.lctx, token_id).c_str());
-        fflush(stdout);
-
-        // eval the token
-        common_batch_clear(ctx.batch);
-        common_batch_add(ctx.batch, token_id, ctx.n_past++, {0}, true);
-        if (llama_decode(ctx.lctx, ctx.batch)) {
-            LOG_ERR("failed to decode token\n");
-            return 1;
-        }
-    }
-    return 0;
-}
-
-int main(int argc, char ** argv) {
-    ggml_time_init();
-
-    common_params params;
-    params.sampling.temp = 0.2; // lower temp by default for better quality
-
-    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) {
-        return 1;
-    }
-
-    common_init();
-
-    if (params.mmproj.empty()) {
-        show_additional_info(argc, argv);
-        return 1;
-    }
-
-    gemma3_context ctx(params);
-    printf("%s: %s\n", __func__, params.model.c_str());
-
-    bool is_single_turn = !params.prompt.empty() && !params.image.empty();
-
-    struct common_sampler * smpl = common_sampler_init(ctx.model, params.sampling);
-    int n_predict = params.n_predict < 0 ? INT_MAX : params.n_predict;
-
-    // ctrl+C handling
-    {
-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
-        struct sigaction sigint_action;
-        sigint_action.sa_handler = sigint_handler;
-        sigemptyset (&sigint_action.sa_mask);
-        sigint_action.sa_flags = 0;
-        sigaction(SIGINT, &sigint_action, NULL);
-#elif defined (_WIN32)
-        auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
-            return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
-        };
-        SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
-#endif
-    }
-
-    if (eval_text(ctx, "<bos>")) {
-        return 1;
-    }
-
-    if (is_single_turn) {
-        g_is_generating = true;
-        if (eval_text(ctx, "<start_of_turn>user\n")) {
-            return 1;
-        }
-        for (auto & fname : params.image) {
-            if (eval_image(ctx, fname)) {
-                return 1;
-            }
-        }
-        if (eval_text(ctx, params.prompt + "<end_of_turn><start_of_turn>model\n", true)) {
-            return 1;
-        }
-        if (generate_response(ctx, smpl, n_predict)) {
-            return 1;
-        }
-
-    } else {
-        LOG("\n Running in chat mode, available commands:");
-        LOG("\n   /image <path>    load an image");
-        LOG("\n   /clear           clear the chat history");
-        LOG("\n   /quit or /exit   exit the program");
-        LOG("\n");
-
-        if (eval_text(ctx, "<start_of_turn>user\n")) {
-            return 1;
-        }
-
-        while (true) {
-            g_is_generating = false;
-            LOG("\n> ");
-            console::set_display(console::user_input);
-            std::string line;
-            console::readline(line, false);
-            console::set_display(console::reset);
-            line = string_strip(line);
-            if (line.empty()) {
-                continue;
-            }
-            if (line == "/quit" || line == "/exit") {
-                break;
-            }
-            if (line == "/clear") {
-                ctx.n_past = 0;
-                llama_kv_self_seq_rm(ctx.lctx, 0, 1, -1); // keep BOS
-                LOG("Chat history cleared\n\n");
-                continue;
-            }
-            g_is_generating = true;
-            if (line.find("/image") == 0) {
-                std::string image = line.substr(7);
-                int res = eval_image(ctx, image);
-                if (res == 2) {
-                    continue; // image not found
-                }
-                if (res) {
-                    return 1;
-                }
-                continue;
-            }
-            if (eval_text(ctx, line + "<end_of_turn><start_of_turn>model\n", true)) {
-                return 1;
-            }
-            if (generate_response(ctx, smpl, n_predict)) {
-                return 1;
-            }
-            if (eval_text(ctx, "<end_of_turn><start_of_turn>user\n")) {
-                return 1;
-            }
-        }
-    }
-
-    return 0;
-}
--- a/examples/llava/gemma3_convert_encoder_to_gguf.py
+++ b/examples/llava/gemma3_convert_encoder_to_gguf.py
@@ -1,307 +0,0 @@
-import gguf
-import argparse
-import logging
-import sys
-import torch
-import json
-import os
-import numpy as np
-from typing import cast, ContextManager, Any, Iterator
-from pathlib import Path
-from torch import Tensor
-
-logger = logging.getLogger("gemma3-mmproj")
-
-
-# (copied from convert_hf_to_gguf.py)
-# tree of lazy tensors
-class LazyTorchTensor(gguf.LazyBase):
-    _tensor_type = torch.Tensor
-    # to keep the type-checker happy
-    dtype: torch.dtype
-    shape: torch.Size
-
-    # only used when converting a torch.Tensor to a np.ndarray
-    _dtype_map: dict[torch.dtype, type] = {
-        torch.float16: np.float16,
-        torch.float32: np.float32,
-    }
-
-    # used for safetensors slices
-    # ref: https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/src/lib.rs#L1046
-    # TODO: uncomment U64, U32, and U16, ref: https://github.com/pytorch/pytorch/issues/58734
-    _dtype_str_map: dict[str, torch.dtype] = {
-        "F64": torch.float64,
-        "F32": torch.float32,
-        "BF16": torch.bfloat16,
-        "F16": torch.float16,
-        # "U64": torch.uint64,
-        "I64": torch.int64,
-        # "U32": torch.uint32,
-        "I32": torch.int32,
-        # "U16": torch.uint16,
-        "I16": torch.int16,
-        "U8": torch.uint8,
-        "I8": torch.int8,
-        "BOOL": torch.bool,
-        "F8_E4M3": torch.float8_e4m3fn,
-        "F8_E5M2": torch.float8_e5m2,
-    }
-
-    def numpy(self) -> gguf.LazyNumpyTensor:
-        dtype = self._dtype_map[self.dtype]
-        return gguf.LazyNumpyTensor(
-            meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape),
-            args=(self,),
-            func=(lambda s: s.numpy())
-        )
-
-    @classmethod
-    def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: tuple[int, ...]) -> Tensor:
-        return torch.empty(size=shape, dtype=dtype, device="meta")
-
-    @classmethod
-    def from_safetensors_slice(cls, st_slice: Any) -> Tensor:
-        dtype = cls._dtype_str_map[st_slice.get_dtype()]
-        shape: tuple[int, ...] = tuple(st_slice.get_shape())
-        lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[:])
-        return cast(torch.Tensor, lazy)
-
-    @classmethod
-    def __torch_function__(cls, func, types, args=(), kwargs=None):
-        del types  # unused
-
-        if kwargs is None:
-            kwargs = {}
-
-        if func is torch.Tensor.numpy:
-            return args[0].numpy()
-
-        return cls._wrap_fn(func)(*args, **kwargs)
-
-
-class Gemma3VisionTower:
-    hparams: dict
-    gguf_writer: gguf.GGUFWriter
-    fname_out: Path
-    ftype: gguf.LlamaFileType
-
-    @staticmethod
-    def load_hparams(dir_model: Path):
-        with open(dir_model / "config.json", "r", encoding="utf-8") as f:
-            return json.load(f)
-
-    @staticmethod
-    def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]:
-        part_names: list[str] = []
-        for filename in os.listdir(dir_model):
-            if filename.startswith(prefix) and filename.endswith(suffix):
-                part_names.append(filename)
-        part_names.sort()
-        return part_names
-
-    def __init__(self,
-                 dir_model: Path,
-                 fname_out: Path,
-                 ftype: gguf.LlamaFileType,
-                 is_big_endian: bool,):
-        hparams = Gemma3VisionTower.load_hparams(dir_model)
-        self.hparams = hparams
-        self.fname_out = fname_out
-        self.ftype = ftype
-        endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
-        self.gguf_writer = gguf.GGUFWriter(path=None, arch="clip", endianess=endianess)
-
-        text_config = hparams["text_config"]
-        vision_config = hparams["vision_config"]
-
-        assert hparams["architectures"][0] == "Gemma3ForConditionalGeneration"
-        assert text_config is not None
-        assert vision_config is not None
-
-        self.gguf_writer.add_string ("clip.projector_type",              "gemma3")
-        self.gguf_writer.add_bool   ("clip.has_text_encoder",            False)
-        self.gguf_writer.add_bool   ("clip.has_vision_encoder",          True)
-        self.gguf_writer.add_bool   ("clip.has_llava_projector",         False) # legacy
-        self.gguf_writer.add_uint32 ("clip.vision.image_size",           vision_config["image_size"])
-        self.gguf_writer.add_uint32 ("clip.vision.patch_size",           vision_config["patch_size"])
-        self.gguf_writer.add_uint32 ("clip.vision.embedding_length",     vision_config["hidden_size"])
-        self.gguf_writer.add_uint32 ("clip.vision.feed_forward_length",  vision_config["intermediate_size"])
-        self.gguf_writer.add_uint32 ("clip.vision.projection_dim",       text_config["hidden_size"])
-        self.gguf_writer.add_uint32 ("clip.vision.block_count",          vision_config["num_hidden_layers"])
-        self.gguf_writer.add_uint32 ("clip.vision.attention.head_count", vision_config["num_attention_heads"])
-        self.gguf_writer.add_float32("clip.vision.attention.layer_norm_epsilon", vision_config.get("layer_norm_eps", 1e-6))
-        # default values taken from HF tranformers code
-        self.gguf_writer.add_array  ("clip.vision.image_mean", [0.5, 0.5, 0.5])
-        self.gguf_writer.add_array  ("clip.vision.image_std",  [0.5, 0.5, 0.5])
-        self.gguf_writer.add_bool   ("clip.use_gelu", True)
-
-        # load tensors
-        for name, data_torch in self.get_tensors(dir_model):
-            # convert any unsupported data types to float32
-            if data_torch.dtype not in (torch.float16, torch.float32):
-                data_torch = data_torch.to(torch.float32)
-            self.add_tensor(name, data_torch)
-
-    def get_tensors(self, dir_model: Path) -> Iterator[tuple[str, Tensor]]:
-        part_names = Gemma3VisionTower.get_model_part_names(dir_model, "model", ".safetensors")
-        tensor_names_from_parts: set[str] = set()
-        for part_name in part_names:
-            logger.info(f"gguf: loading model part '{part_name}'")
-            from safetensors import safe_open
-            ctx = cast(ContextManager[Any], safe_open(dir_model / part_name, framework="pt", device="cpu"))
-            with ctx as model_part:
-                tensor_names_from_parts.update(model_part.keys())
-
-                for name in model_part.keys():
-                    data = model_part.get_slice(name)
-                    data = LazyTorchTensor.from_safetensors_slice(data)
-                    yield name, data
-
-    def add_tensor(self, name: str, data_torch: Tensor):
-        is_1d = len(data_torch.shape) == 1
-        is_embd = ".embeddings." in name
-        old_dtype = data_torch.dtype
-        can_quantize = not is_1d and not is_embd
-        data_qtype = gguf.GGMLQuantizationType.F32
-
-        # this is to support old checkpoint
-        # TODO: remove this when we have the final model
-        name = name.replace("vision_model.vision_model.", "vision_tower.vision_model.")
-        name = name.replace("multimodal_projector.", "multi_modal_projector.")
-
-        # filter only vision tensors
-        if not name.startswith("vision_tower.vision_model.") and not name.startswith("multi_modal_projector."):
-            return
-        # prefix
-        name = name.replace("vision_tower.vision_model.encoder.layers.", "v.blk.")
-        name = name.replace("vision_tower.vision_model.", "v.")
-        # projector and input embd
-        name = name.replace(".embeddings.patch_embedding.", ".patch_embd.")
-        name = name.replace(".embeddings.position_embedding.", ".position_embd.")
-        name = name.replace(
-            "multi_modal_projector.mm_input_projection_weight",
-            "mm.input_projection.weight"
-        )
-        name = name.replace(
-            "multi_modal_projector.mm_soft_emb_norm.weight",
-            "mm.soft_emb_norm.weight"
-        )
-        name = name.replace("post_layernorm.", "post_ln.")
-        # each block
-        name = name.replace(".self_attn.k_proj.", ".attn_k.")
-        name = name.replace(".self_attn.v_proj.", ".attn_v.")
-        name = name.replace(".self_attn.q_proj.", ".attn_q.")
-        name = name.replace(".self_attn.out_proj.", ".attn_out.")
-        name = name.replace(".layer_norm1.", ".ln1.")
-        name = name.replace(".layer_norm2.", ".ln2.")
-        name = name.replace(".mlp.fc1.", ".ffn_down.")
-        name = name.replace(".mlp.fc2.", ".ffn_up.")
-
-        if can_quantize:
-            if self.ftype == gguf.LlamaFileType.ALL_F32:
-                data_qtype = gguf.GGMLQuantizationType.F32
-            elif self.ftype == gguf.LlamaFileType.MOSTLY_F16:
-                data_qtype = gguf.GGMLQuantizationType.F16
-            elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
-                data_qtype = gguf.GGMLQuantizationType.BF16
-            elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
-                data_qtype = gguf.GGMLQuantizationType.Q8_0
-            else:
-                raise ValueError(f"Unsupported file type: {self.ftype}")
-
-        # corrent norm value ; only this "soft_emb_norm" need to be corrected as it's part of Gemma projector
-        # the other norm values are part of SigLIP model, and they are already correct
-        # ref code: Gemma3RMSNorm
-        if "soft_emb_norm.weight" in name:
-            logger.info(f"Correcting norm value for '{name}'")
-            data_torch = data_torch + 1
-
-        data = data_torch.numpy()
-
-        try:
-            data = gguf.quants.quantize(data, data_qtype)
-        except Exception as e:
-            logger.error(f"Error quantizing tensor '{name}': {e}, fallback to F16")
-            data_qtype = gguf.GGMLQuantizationType.F16
-            data = gguf.quants.quantize(data, data_qtype)
-
-        # reverse shape to make it similar to the internal ggml dimension order
-        shape_str = f"{{{', '.join(str(n) for n in reversed(data_torch.shape))}}}"
-        logger.info(f"{f'%-32s' % f'{name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
-
-        self.gguf_writer.add_tensor(name, data, raw_dtype=data_qtype)
-
-    def write(self):
-        self.gguf_writer.write_header_to_file(path=self.fname_out)
-        self.gguf_writer.write_kv_data_to_file()
-        self.gguf_writer.write_tensors_to_file(progress=True)
-        self.gguf_writer.close()
-
-def parse_args() -> argparse.Namespace:
-    parser = argparse.ArgumentParser(
-        description="Convert Gemma 3 vision tower safetensors to GGUF format",)
-    parser.add_argument(
-        "--outfile", type=Path, default="mmproj.gguf",
-        help="path to write to",
-    )
-    parser.add_argument(
-        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0"], default="f16",
-        help="output format",
-    )
-    parser.add_argument(
-        "--bigendian", action="store_true",
-        help="model is executed on big endian machine",
-    )
-    parser.add_argument(
-        "model", type=Path,
-        help="directory containing model file",
-        nargs="?",
-    )
-    parser.add_argument(
-        "--verbose", action="store_true",
-        help="increase output verbosity",
-    )
-
-    args = parser.parse_args()
-    if args.model is None:
-        parser.error("the following arguments are required: model")
-    return args
-
-
-def main() -> None:
-    args = parse_args()
-
-    if args.verbose:
-        logging.basicConfig(level=logging.DEBUG)
-    else:
-        logging.basicConfig(level=logging.INFO)
-
-    dir_model = args.model
-
-    if not dir_model.is_dir():
-        logger.error(f'Error: {args.model} is not a directory')
-        sys.exit(1)
-
-    ftype_map: dict[str, gguf.LlamaFileType] = {
-        "f32": gguf.LlamaFileType.ALL_F32,
-        "f16": gguf.LlamaFileType.MOSTLY_F16,
-        "bf16": gguf.LlamaFileType.MOSTLY_BF16,
-        "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
-    }
-
-    logger.info(f"Loading model: {dir_model.name}")
-
-    with torch.inference_mode():
-        gemma3_vision_tower = Gemma3VisionTower(
-            dir_model=dir_model,
-            fname_out=args.outfile,
-            ftype=ftype_map[args.outtype],
-            is_big_endian=args.bigendian,
-        )
-        gemma3_vision_tower.write()
-
-
-if __name__ == '__main__':
-    main()
-
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -1,332 +0,0 @@
-#include "arg.h"
-#include "base64.hpp"
-#include "log.h"
-#include "common.h"
-#include "sampling.h"
-#include "clip.h"
-#include "llava.h"
-#include "llama.h"
-#include "ggml.h"
-
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <vector>
-
-static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
-    int N = (int) tokens.size();
-    for (int i = 0; i < N; i += n_batch) {
-        int n_eval = (int) tokens.size() - i;
-        if (n_eval > n_batch) {
-            n_eval = n_batch;
-        }
-        if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval))) {
-            LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
-            return false;
-        }
-        *n_past += n_eval;
-    }
-    return true;
-}
-
-static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
-    std::vector<llama_token> tokens;
-    tokens.push_back(id);
-    return eval_tokens(ctx_llama, tokens, 1, n_past);
-}
-
-static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
-    std::string              str2     = str;
-    std::vector<llama_token> embd_inp = common_tokenize(ctx_llama, str2, add_bos, true);
-    eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
-    return true;
-}
-
-static const char * sample(struct common_sampler * smpl,
-                           struct llama_context * ctx_llama,
-                           int * n_past) {
-    const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
-    common_sampler_accept(smpl, id, true);
-
-    const llama_model * model = llama_get_model(ctx_llama);
-    const llama_vocab * vocab = llama_model_get_vocab(model);
-
-    static std::string ret;
-    if (llama_vocab_is_eog(vocab, id)) {
-        ret = "</s>";
-    } else {
-        ret = common_token_to_piece(ctx_llama, id);
-    }
-    eval_id(ctx_llama, id, n_past);
-    return ret.c_str();
-}
-
-static const char* IMG_BASE64_TAG_BEGIN = "<img src=\"data:image/jpeg;base64,";
-static const char* IMG_BASE64_TAG_END = "\">";
-
-static void find_image_tag_in_prompt(const std::string& prompt, size_t& begin_out, size_t& end_out) {
-    begin_out = prompt.find(IMG_BASE64_TAG_BEGIN);
-    end_out = prompt.find(IMG_BASE64_TAG_END, (begin_out == std::string::npos) ? 0UL : begin_out);
-}
-
-static bool prompt_contains_image(const std::string& prompt) {
-    size_t begin, end;
-    find_image_tag_in_prompt(prompt, begin, end);
-    return (begin != std::string::npos);
-}
-
-// replaces the base64 image tag in the prompt with `replacement`
-static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip_ctx * ctx_clip, int n_threads, const std::string& prompt) {
-    size_t img_base64_str_start, img_base64_str_end;
-    find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
-    if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
-        LOG_ERR("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
-        return NULL;
-    }
-
-    auto base64_bytes_start = img_base64_str_start + strlen(IMG_BASE64_TAG_BEGIN);
-    auto base64_bytes_count = img_base64_str_end - base64_bytes_start;
-    auto base64_str = prompt.substr(base64_bytes_start, base64_bytes_count );
-
-    auto required_bytes = base64::required_encode_size(base64_str.size());
-    auto img_bytes = std::vector<unsigned char>(required_bytes);
-    base64::decode(base64_str.begin(), base64_str.end(), img_bytes.begin());
-
-    auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
-    if (!embed) {
-        LOG_ERR("%s: could not load image from base64 string.\n", __func__);
-        return NULL;
-    }
-
-    return embed;
-}
-
-static std::string remove_image_from_prompt(const std::string& prompt, const char * replacement = "") {
-    size_t begin, end;
-    find_image_tag_in_prompt(prompt, begin, end);
-    if (begin == std::string::npos || end == std::string::npos) {
-        return prompt;
-    }
-    auto pre = prompt.substr(0, begin);
-    auto post = prompt.substr(end + strlen(IMG_BASE64_TAG_END));
-    return pre + replacement + post;
-}
-
-struct llava_context {
-    struct clip_ctx * ctx_clip = NULL;
-    struct llama_context * ctx_llama = NULL;
-    struct llama_model * model = NULL;
-};
-
-static void print_usage(int, char ** argv) {
-    LOG("\n example usage:\n");
-    LOG("\n     %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
-    LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
-}
-
-static struct llava_image_embed * load_image(llava_context * ctx_llava, common_params * params, const std::string & fname) {
-
-    // load and preprocess the image
-    llava_image_embed * embed = NULL;
-    auto prompt = params->prompt;
-    if (prompt_contains_image(prompt)) {
-        if (!params->image.empty()) {
-            LOG_INF("using base64 encoded image instead of command line image path\n");
-        }
-        embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt);
-        if (!embed) {
-            LOG_ERR("%s: can't load image from prompt\n", __func__);
-            return NULL;
-        }
-        params->prompt = remove_image_from_prompt(prompt);
-    } else {
-        embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->cpuparams.n_threads, fname.c_str());
-        if (!embed) {
-            fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
-            return NULL;
-        }
-    }
-
-    return embed;
-}
-
-static void process_prompt(struct llava_context * ctx_llava, struct llava_image_embed * image_embed, common_params * params, const std::string & prompt) {
-    int n_past = 0;
-
-    const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
-
-    std::string system_prompt, user_prompt;
-    size_t image_pos = prompt.find("<image>");
-    if (image_pos != std::string::npos) {
-        // new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
-        system_prompt = prompt.substr(0, image_pos);
-        user_prompt = prompt.substr(image_pos + std::string("<image>").length());
-        LOG_INF("system_prompt: %s\n", system_prompt.c_str());
-        if (params->verbose_prompt) {
-            auto tmp = common_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
-            for (int i = 0; i < (int) tmp.size(); i++) {
-                LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
-            }
-        }
-        LOG_INF("user_prompt: %s\n", user_prompt.c_str());
-        if (params->verbose_prompt) {
-            auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
-            for (int i = 0; i < (int) tmp.size(); i++) {
-                LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
-            }
-        }
-    } else {
-        // llava-1.5 native mode
-        system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:";
-        user_prompt = prompt + "\nASSISTANT:";
-        if (params->verbose_prompt) {
-            auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
-            for (int i = 0; i < (int) tmp.size(); i++) {
-                LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
-            }
-        }
-    }
-
-    eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, true);
-    llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
-    eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
-
-    // generate the response
-
-    LOG("\n");
-
-    struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling);
-    if (!smpl) {
-        LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
-        exit(1);
-    }
-
-    std::string response = "";
-    for (int i = 0; i < max_tgt_len; i++) {
-        const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past);
-        response += tmp;
-        if (strcmp(tmp, "</s>") == 0) break;
-        if (strstr(tmp, "###")) break; // Yi-VL behavior
-        LOG("%s", tmp);
-        if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
-        if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
-        if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6
-
-        fflush(stdout);
-    }
-
-    common_sampler_free(smpl);
-    LOG("\n");
-}
-
-static struct llama_model * llava_init(common_params * params) {
-    llama_backend_init();
-    llama_numa_init(params->numa);
-
-    llama_model_params model_params = common_model_params_to_llama(*params);
-
-    llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
-    if (model == NULL) {
-        LOG_ERR("%s: unable to load model\n" , __func__);
-        return NULL;
-    }
-    return model;
-}
-
-static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
-    const char * clip_path = params->mmproj.c_str();
-
-    auto prompt = params->prompt;
-    if (prompt.empty()) {
-        prompt = "describe the image in detail.";
-    }
-
-    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
-
-    llama_context_params ctx_params = common_context_params_to_llama(*params);
-    ctx_params.n_ctx           = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
-
-    llama_context * ctx_llama = llama_init_from_model(model, ctx_params);
-
-    if (ctx_llama == NULL) {
-        LOG_ERR("%s: failed to create the llama_context\n" , __func__);
-        return NULL;
-    }
-
-    auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
-
-    ctx_llava->ctx_llama = ctx_llama;
-    ctx_llava->ctx_clip = ctx_clip;
-    ctx_llava->model = model;
-    return ctx_llava;
-}
-
-static void llava_free(struct llava_context * ctx_llava) {
-    if (ctx_llava->ctx_clip) {
-        clip_free(ctx_llava->ctx_clip);
-        ctx_llava->ctx_clip = NULL;
-    }
-
-    llama_free(ctx_llava->ctx_llama);
-    llama_model_free(ctx_llava->model);
-    llama_backend_free();
-}
-
-int main(int argc, char ** argv) {
-    ggml_time_init();
-
-    common_params params;
-
-    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) {
-        return 1;
-    }
-
-    common_init();
-
-    if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
-        print_usage(argc, argv);
-        return 1;
-    }
-
-    auto * model = llava_init(&params);
-    if (model == NULL) {
-        fprintf(stderr, "%s: error: failed to init llava model\n", __func__);
-        return 1;
-    }
-
-    if (prompt_contains_image(params.prompt)) {
-        auto * ctx_llava = llava_init_context(&params, model);
-
-        auto * image_embed = load_image(ctx_llava, &params, "");
-
-        // process the prompt
-        process_prompt(ctx_llava, image_embed, &params, params.prompt);
-
-        llama_perf_context_print(ctx_llava->ctx_llama);
-        llava_image_embed_free(image_embed);
-        ctx_llava->model = NULL;
-        llava_free(ctx_llava);
-    } else {
-        for (auto & image : params.image) {
-            auto * ctx_llava = llava_init_context(&params, model);
-
-            auto * image_embed = load_image(ctx_llava, &params, image);
-            if (!image_embed) {
-                LOG_ERR("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str());
-                return 1;
-            }
-
-            // process the prompt
-            process_prompt(ctx_llava, image_embed, &params, params.prompt);
-
-            llama_perf_context_print(ctx_llava->ctx_llama);
-            llava_image_embed_free(image_embed);
-            ctx_llava->model = NULL;
-            llava_free(ctx_llava);
-        }
-    }
-
-    llama_model_free(model);
-
-    return 0;
-}
--- a/examples/llava/minicpmv-cli.cpp
+++ b/examples/llava/minicpmv-cli.cpp
@@ -1,354 +0,0 @@
-#include "arg.h"
-#include "log.h"
-#include "common.h"
-#include "sampling.h"
-#include "clip.h"
-#include "llava.h"
-#include "llama.h"
-#include "ggml.h"
-
-#include <algorithm>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <vector>
-#include <iostream> // TODO: remove me
-
-struct llava_context {
-    struct clip_ctx * ctx_clip = NULL;
-    struct llama_context * ctx_llama = NULL;
-    struct llama_model * model = NULL;
-};
-
-static void show_additional_info(int /*argc*/, char ** argv) {
-    LOG("\nexample usage:\n\n%s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
-    LOG("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n");
-}
-
-static struct llama_model * llava_init(common_params * params) {
-    llama_backend_init();
-    llama_numa_init(params->numa);
-
-    llama_model_params model_params = common_model_params_to_llama(*params);
-
-    llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
-    if (model == NULL) {
-        LOG_ERR("%s: unable to load model\n" , __func__);
-        return NULL;
-    }
-    return model;
-}
-
-static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
-    auto prompt = params->prompt;
-    if (prompt.empty()) {
-        prompt = "describe the image in detail.";
-    }
-
-    llama_context_params ctx_params = common_context_params_to_llama(*params);
-    if (params->n_ctx < 2048) {
-        // warn user here, "Image processing requires at least 2048 context, setting context to 2048"
-        LOG_WRN("%s: Image processing requires at least 2048 context, setting context to 2048\n" , __func__);
-        ctx_params.n_ctx = 2048;
-    } else {
-        ctx_params.n_ctx = params->n_ctx;
-    }
-
-    llama_context * ctx_llama = llama_init_from_model(model, ctx_params);
-
-    if (ctx_llama == NULL) {
-        LOG_ERR("%s: failed to create the llama_context\n" , __func__);
-        return NULL;
-    }
-
-    auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
-
-    ctx_llava->ctx_llama = ctx_llama;
-    ctx_llava->model = model;
-    return ctx_llava;
-}
-
-static void llava_free(struct llava_context * ctx_llava) {
-    if (ctx_llava->ctx_clip) {
-        clip_free(ctx_llava->ctx_clip);
-        ctx_llava->ctx_clip = NULL;
-    }
-
-    llama_free(ctx_llava->ctx_llama);
-    llama_model_free(ctx_llava->model);
-    llama_backend_free();
-}
-
-static struct clip_ctx * clip_init_context(common_params * params) {
-    const char * clip_path = params->mmproj.c_str();
-
-    auto prompt = params->prompt;
-    if (prompt.empty()) {
-        prompt = "describe the image in detail.";
-    }
-    struct clip_context_params clip_params = {
-        /* use_gpu */   params->n_gpu_layers != 0,
-        /* verbosity */ params->verbosity,
-    };
-    auto * ctx_clip = clip_init(clip_path, clip_params);
-    return ctx_clip;
-}
-
-static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
-    int N = (int) tokens.size();
-    for (int i = 0; i < N; i += n_batch) {
-        int n_eval = (int) tokens.size() - i;
-        if (n_eval > n_batch) {
-            n_eval = n_batch;
-        }
-        if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval))) {
-            LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
-            return false;
-        }
-        *n_past += n_eval;
-    }
-    return true;
-}
-
-static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
-    std::vector<llama_token> tokens;
-    tokens.push_back(id);
-    return eval_tokens(ctx_llama, tokens, 1, n_past);
-}
-
-static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
-    std::string              str2     = str;
-    std::vector<llama_token> embd_inp = common_tokenize(ctx_llama, str2, add_bos, true);
-    return eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
-}
-
-static void process_eval_image_embed(struct llava_context * ctx_llava, const struct llava_image_embed * embeds, int n_batch, int * n_past, int idx) {
-    float * image_embed = (float *)malloc(clip_embd_nbytes(ctx_llava->ctx_clip));
-    std::memcpy(image_embed, embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip));
-
-    auto * slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed));
-    slice_embed->embed = image_embed;
-    slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip);
-    llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past);
-    llava_image_embed_free(slice_embed);
-}
-
-static void process_image(struct llava_context * ctx_llava, struct llava_image_embed * embeds, common_params * params, int &n_past) {
-    std::string system_prompt;
-    int idx = 0;
-    int num_image_embeds = embeds->n_image_pos / clip_n_patches(ctx_llava->ctx_clip);
-    int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
-    if (has_minicpmv_projector == 2) {
-        system_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n";
-    }
-    else if (has_minicpmv_projector == 3) {
-        system_prompt = "<|im_start|>user\n";
-    }
-    else if (has_minicpmv_projector == 4) {
-        system_prompt = "<|im_start|>user\n";
-    }
-    LOG_INF("%s: image token past: %d\n", __func__, n_past);
-    eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, false);
-    process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
-    eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
-    if (num_image_embeds > 1) {
-        if (has_minicpmv_projector == 2) {
-            size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip);
-            eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
-            for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) {
-                for (size_t j = 0; j < num_image_embeds_col; ++j) {
-                    eval_string(ctx_llava->ctx_llama, std::string("<image>").c_str(), params->n_batch, &n_past, false);
-                    process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
-                    eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
-                    if (j == num_image_embeds_col - 1) {
-                        eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
-                    }
-                }
-            }
-            eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
-        }
-        else if (has_minicpmv_projector == 3 || has_minicpmv_projector == 4) {
-            size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip);
-            for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) {
-                for (size_t j = 0; j < num_image_embeds_col; ++j) {
-                    eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
-                    process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
-                    eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
-                    if (j == num_image_embeds_col - 1) {
-                        eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
-                    }
-                }
-            }
-        }
-    }
-    LOG_INF("%s: image token past: %d\n", __func__, n_past);
-}
-
-static const char * sample(struct common_sampler * smpl,
-                           struct llama_context * ctx_llama,
-                           int * n_past) {
-    const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
-    common_sampler_accept(smpl, id, true);
-
-    const llama_model * model = llama_get_model(ctx_llama);
-    const llama_vocab * vocab = llama_model_get_vocab(model);
-
-    static std::string ret;
-    if (llama_vocab_is_eog(vocab, id)) {
-        ret = "</s>";
-    } else {
-        ret = common_token_to_piece(ctx_llama, id);
-    }
-    eval_id(ctx_llama, id, n_past);
-    return ret.c_str();
-}
-
-static struct llava_context * minicpmv_init(common_params * params, const std::string & fname, int &n_past){
-    auto * ctx_clip = clip_init_context(params);
-    auto * embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str());
-    if (!embeds) {
-        LOG_ERR("failed to load image %s. Terminating\n\n", fname.c_str());
-        return NULL;
-    }
-
-    // process the prompt
-    if (params->prompt.empty() && params->interactive == false) {
-        LOG_ERR("prompt should be given or interactive mode should be on");
-        return NULL;
-    }
-
-    auto * model = llava_init(params);
-    if (model == NULL) {
-        fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__);
-        return NULL;
-    }
-    const int64_t t_llava_init_start_us = ggml_time_us();
-    auto * ctx_llava = llava_init_context(params, model);
-    ctx_llava->ctx_clip = ctx_clip;
-    const int64_t t_llava_init_end_us = ggml_time_us();
-    float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0;
-    LOG_INF("%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);
-
-    const int64_t t_process_image_start_us = ggml_time_us();
-    process_image(ctx_llava, embeds, params, n_past);
-    const int64_t t_process_image_end_us = ggml_time_us();
-    float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0;
-    LOG_INF("%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);
-
-    llava_image_embed_free(embeds);
-    return ctx_llava;
-}
-
-static struct common_sampler * llama_init(struct llava_context * ctx_llava, common_params * params, const std::string & prompt, int & n_past, bool is_first = false){
-    std::string user_prompt = prompt;
-    int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
-    if (!is_first) {
-        if (has_minicpmv_projector == 2) {
-            user_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + prompt;
-        }
-        else if (has_minicpmv_projector == 3) {
-            user_prompt = "<|im_start|>user\n" + prompt;
-        }
-        else if (has_minicpmv_projector == 4) {
-            user_prompt = "<|im_start|>user\n" + prompt;
-        }
-    }
-
-    eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
-    if (has_minicpmv_projector == 2) {
-        eval_string(ctx_llava->ctx_llama, "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", params->n_batch, &n_past, false);
-    }
-    else if (has_minicpmv_projector == 3) {
-        eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false);
-    }
-    else if (has_minicpmv_projector == 4) {
-        eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false);
-    }
-
-    // generate the response
-
-    LOG_INF("\n");
-
-    struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling);
-    return smpl;
-}
-
-static const char * llama_loop(struct llava_context * ctx_llava,struct common_sampler * smpl, int &n_past){
-
-    const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past);
-    return tmp;
-}
-
-int main(int argc, char ** argv) {
-    ggml_time_init();
-
-    common_params params;
-
-    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) {
-        return 1;
-    }
-
-    common_init();
-
-    if (params.mmproj.empty() || (params.image.empty())) {
-        show_additional_info(argc, argv);
-        return 1;
-    }
-
-    for (auto & image : params.image) {
-        int n_past = 0;
-        auto * ctx_llava = minicpmv_init(&params, image, n_past);
-
-        if (!params.prompt.empty()) {
-            LOG("<user>%s\n", params.prompt.c_str());
-            LOG("<assistant>");
-            auto * smpl = llama_init(ctx_llava, &params, params.prompt, n_past, true);
-            const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
-            std::string response;
-            bool have_tmp = false;
-            for (int i = 0; i < max_tgt_len; i++) {
-                const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
-                response += tmp;
-                if (strcmp(tmp, "</s>") == 0){
-                    if (!have_tmp) {
-                        continue;
-                    }
-                    break;
-                }
-                if (strstr(tmp, "###")) break; // Yi-VL behavior
-                have_tmp = true;
-                printf("%s", tmp);
-                if (strstr(response.c_str(), "<user>")) break; // minicpm-v
-
-                fflush(stdout);
-            }
-            common_sampler_free(smpl);
-        }else {
-            while (true) {
-                LOG("<user>");
-                std::string prompt;
-                std::getline(std::cin, prompt);
-                LOG("<assistant>");
-                auto * smpl = llama_init(ctx_llava, &params, prompt, n_past, true);
-                const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
-                std::string response;
-                for (int i = 0; i < max_tgt_len; i++) {
-                    const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
-                    response += tmp;
-                    if (strcmp(tmp, "</s>") == 0) break;
-                    printf("%s", tmp);// mistral llava-1.6
-                    if (strstr(response.c_str(), "<user>")) break; // minicpm-v
-                    fflush(stdout);
-                }
-                common_sampler_free(smpl);
-            }
-        }
-        printf("\n");
-        llama_perf_context_print(ctx_llava->ctx_llama);
-
-        ctx_llava->model = NULL;
-        llava_free(ctx_llava);
-    }
-
-    return 0;
-}
--- a/examples/llava/qwen2_vl_surgery.py
+++ b/examples/llava/qwen2_vl_surgery.py
@@ -1,165 +0,0 @@
-import argparse
-from typing import Dict
-
-import torch
-import numpy as np
-from gguf import *
-from transformers import (
-    Qwen2VLForConditionalGeneration,
-    Qwen2VLProcessor,
-    AutoProcessor,
-    Qwen2VLConfig
-)
-
-
-VISION = "clip.vision"
-
-
-def k(raw_key: str, arch: str) -> str:
-    return raw_key.format(arch=arch)
-
-
-def to_gguf_name(name: str) -> str:
-    og = name
-    name = name.replace("text_model", "t").replace("vision_model", "v")
-    name = name.replace("blocks", "blk").replace("embeddings.", "")
-    name = name.replace("attn.", "attn_")
-    name = name.replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("proj.", "out.")
-    # name = name.replace("layrnorm", "ln").replace("layer_norm", "ln").replace("layernorm", "ln")
-    name = name.replace("norm1", "ln1").replace("norm2", "ln2")
-    name = name.replace("merger.mlp", 'mm')
-    print(f"[to_gguf_name] {og} --> {name}")
-    return name
-
-
-def find_vision_tensors(qwen2vl, dtype) -> Dict[str, np.ndarray]:
-    vision_model = qwen2vl.visual
-    tensor_map = {}
-    for name, ten in vision_model.state_dict().items():
-        ten = ten.numpy()
-        if 'qkv' in name:
-            if ten.ndim == 2: # weight
-                c3, _ = ten.shape
-            else:             # bias
-                c3 = ten.shape[0]
-            assert c3 % 3 == 0
-            c = c3 // 3
-            wq = ten[:c]
-            wk = ten[c: c * 2]
-            wv = ten[c * 2:]
-            tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "q")] = wq
-            tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "k")] = wk
-            tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "v")] = wv
-        elif 'merger' in name:
-            if name.endswith("ln_q.weight"):
-                tensor_map['v.post_ln.weight'] = ten
-            elif name.endswith("ln_q.bias"):
-                tensor_map['v.post_ln.bias'] = ten
-            else:
-                # "merger.mlp.%d.weight/bias" --> "mm.%d.weight/bias"
-                tensor_map[to_gguf_name(name)] = ten
-        elif 'patch_embed.proj.weight' in name:
-            # NOTE: split Conv3D into Conv2Ds
-            c1, c2, kt, kh, kw = ten.shape
-            assert kt == 2, "Current implmentation only support temporal_patch_size of 2"
-            tensor_map["v.patch_embd.weight"] = ten[:, :, 0, ...]
-            tensor_map["v.patch_embd.weight.1"] = ten[:, :, 1, ...]
-        else:
-            tensor_map[to_gguf_name(f"vision_model.{name}")] = ten
-
-    for new_name, ten in tensor_map.items():
-        if ten.ndim <= 1 or new_name.endswith("_norm.weight"):
-            tensor_map[new_name] = ten.astype(np.float32)
-        else:
-            tensor_map[new_name] = ten.astype(dtype)
-    tensor_map["v.position_embd.weight"] = np.zeros([10, 10], dtype=np.float32)  # dummy tensor, just here as a placeholder
-    return tensor_map
-
-
-def main(args):
-    if args.data_type == 'fp32':
-        dtype = torch.float32
-        np_dtype = np.float32
-        ftype = 0
-    elif args.data_type == 'fp16':
-        dtype = torch.float32
-        np_dtype = np.float16
-        ftype = 1
-    else:
-        raise ValueError()
-
-    local_model = False
-    model_path = ""
-    model_name = args.model_name
-    print("model_name: ", model_name)
-    qwen2vl = Qwen2VLForConditionalGeneration.from_pretrained(
-        model_name, torch_dtype=dtype, device_map="cpu"
-    )
-    cfg: Qwen2VLConfig = qwen2vl.config  # type: ignore[reportAssignmentType]
-    vcfg = cfg.vision_config
-
-    if os.path.isdir(model_name):
-        local_model = True
-        if model_name.endswith(os.sep):
-            model_name = model_name[:-1]
-        model_path = model_name
-        model_name = os.path.basename(model_name)
-    fname_out = f"{model_name.replace('/', '-').lower()}-vision.gguf"
-
-    fout = GGUFWriter(path=fname_out, arch="clip")
-    fout.add_description("image encoder for Qwen2VL")
-
-    fout.add_file_type(ftype)
-    fout.add_bool("clip.has_text_encoder", False)
-    fout.add_bool("clip.has_vision_encoder", True)
-    fout.add_bool("clip.has_qwen2vl_merger", True)
-    fout.add_string("clip.projector_type", "qwen2vl_merger")
-
-    print(cfg.vision_config)
-    if 'silu' in cfg.vision_config.hidden_act.lower():
-        fout.add_bool("clip.use_silu", True)
-        fout.add_bool("clip.use_gelu", False)
-    elif 'gelu' in cfg.vision_config.hidden_act.lower():
-        fout.add_bool("clip.use_silu", False)
-        fout.add_bool("clip.use_gelu", 'quick' not in cfg.vision_config.hidden_act.lower())
-    else:
-        raise ValueError()
-
-    tensor_map = find_vision_tensors(qwen2vl, np_dtype)
-    for name, data in tensor_map.items():
-        fout.add_tensor(name, data)
-
-    fout.add_uint32("clip.vision.patch_size", vcfg.patch_size)
-    fout.add_uint32("clip.vision.image_size", 14 * 40)  # some reasonable size that is divable by (14*2)
-    fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.embed_dim)
-    fout.add_uint32("clip.vision.projection_dim", vcfg.hidden_size)
-    fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), vcfg.num_heads)
-    fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
-    fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), vcfg.depth)
-    fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), 0)  # not sure what this does, put 0 here as a placeholder
-    fout.add_name(model_name)
-    """
-    HACK: Since vision rope related parameter aren't stored in the `Qwen2VLConfig,
-            it will be hardcoded in the `clip_image_build_graph` from `clip.cpp`.
-    """
-
-    if local_model:
-        processor: Qwen2VLProcessor = AutoProcessor.from_pretrained(model_path)
-    else:
-        processor: Qwen2VLProcessor = AutoProcessor.from_pretrained(model_name)
-    fout.add_array("clip.vision.image_mean", processor.image_processor.image_mean) # type: ignore[reportAttributeAccessIssue]
-    fout.add_array("clip.vision.image_std", processor.image_processor.image_std) # type: ignore[reportAttributeAccessIssue]
-
-    fout.write_header_to_file()
-    fout.write_kv_data_to_file()
-    fout.write_tensors_to_file()
-    fout.close()
-    print("save model as: ", fname_out)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("model_name", nargs='?', default="Qwen/Qwen2-VL-2B-Instruct")
-    parser.add_argument("--data_type", nargs='?', choices=['fp32', 'fp16'], default="fp32")
-    args = parser.parse_args()
-    main(args)
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -106,6 +106,8 @@ int main(int argc, char ** argv) {

    common_params params;

+    params.n_predict = 128;
+
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
        return 1;
    }
@@ -405,7 +407,7 @@ int main(int argc, char ** argv) {
        params.prompt_file = "used built-in defaults";
    }
    LOG_INF("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str());
-    LOG_INF("Model and path used:  \033[32m%s\033[0m\n\n", params.model.c_str());
+    LOG_INF("Model and path used:  \033[32m%s\033[0m\n\n", params.model.path.c_str());

    LOG_INF("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt              ) / (t_main_end - t_main_start) * 1e6);
    LOG_INF("Total gen tokens:    %6d, speed: %5.2f t/s\n", n_total_gen,    (double) (n_total_gen                 ) / (t_main_end - t_main_start) * 1e6);
--- a/examples/passkey/passkey.cpp
+++ b/examples/passkey/passkey.cpp
@@ -64,7 +64,7 @@ int main(int argc, char ** argv) {

    llama_model_params model_params = common_model_params_to_llama(params);

-    llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
+    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);

    if (model == NULL) {
        LOG_ERR("%s: unable to load model\n" , __func__);
--- a/examples/pydantic_models_to_grammar_examples.py
+++ b/examples/pydantic_models_to_grammar_examples.py
@@ -23,7 +23,7 @@ def create_completion(host, prompt, gbnf_grammar):
    """Calls the /completion API on llama-server.

    See
-    https://github.com/ggml-org/llama.cpp/tree/HEAD/examples/server#api-endpoints
+    https://github.com/ggml-org/llama.cpp/tree/HEAD/tools/server#api-endpoints
    """
    print(f"  Request:\n    Grammar:\n{textwrap.indent(gbnf_grammar, '      ')}\n    Prompt:\n{textwrap.indent(prompt.rstrip(), '      ')}")
    headers = {"Content-Type": "application/json"}
--- a/examples/quantize-stats/CMakeLists.txt
+++ b/examples/quantize-stats/CMakeLists.txt
@@ -1,6 +0,0 @@
-set(TARGET llama-quantize-stats)
-add_executable(${TARGET} quantize-stats.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
-target_include_directories(${TARGET} PRIVATE ../../common)
-target_compile_features(${TARGET} PRIVATE cxx_std_17)
--- a/examples/rpc/CMakeLists.txt
+++ b/examples/rpc/CMakeLists.txt
@@ -1,2 +0,0 @@
-add_executable(rpc-server rpc-server.cpp)
-target_link_libraries(rpc-server PRIVATE ggml llama)
--- a/examples/rpc/rpc-server.cpp
+++ b/examples/rpc/rpc-server.cpp
@@ -1,171 +0,0 @@
-#include "ggml-cpu.h"
-
-#ifdef GGML_USE_CUDA
-#include "ggml-cuda.h"
-#endif
-
-#ifdef GGML_USE_METAL
-#include "ggml-metal.h"
-#endif
-
-#ifdef GGML_USE_VULKAN
-#include "ggml-vulkan.h"
-#endif
-
-#ifdef GGML_USE_SYCL
-#include "ggml-sycl.h"
-#endif
-
-#include "ggml-rpc.h"
-#ifdef _WIN32
-#  include <windows.h>
-#else
-#  include <unistd.h>
-#endif
-#include <string>
-#include <stdio.h>
-
-struct rpc_server_params {
-    std::string host        = "127.0.0.1";
-    int         port        = 50052;
-    size_t      backend_mem = 0;
-};
-
-static void print_usage(int /*argc*/, char ** argv, rpc_server_params params) {
-    fprintf(stderr, "Usage: %s [options]\n\n", argv[0]);
-    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h, --help            show this help message and exit\n");
-    fprintf(stderr, "  -H HOST, --host HOST  host to bind to (default: %s)\n", params.host.c_str());
-    fprintf(stderr, "  -p PORT, --port PORT  port to bind to (default: %d)\n", params.port);
-    fprintf(stderr, "  -m MEM, --mem MEM     backend memory size (in MB)\n");
-    fprintf(stderr, "\n");
-}
-
-static bool rpc_server_params_parse(int argc, char ** argv, rpc_server_params & params) {
-    std::string arg;
-    for (int i = 1; i < argc; i++) {
-        arg = argv[i];
-        if (arg == "-H" || arg == "--host") {
-            if (++i >= argc) {
-                return false;
-            }
-            params.host = argv[i];
-        } else if (arg == "-p" || arg == "--port") {
-            if (++i >= argc) {
-                return false;
-            }
-            params.port = std::stoi(argv[i]);
-            if (params.port <= 0 || params.port > 65535) {
-                return false;
-            }
-        } else if (arg == "-m" || arg == "--mem") {
-            if (++i >= argc) {
-                return false;
-            }
-            params.backend_mem = std::stoul(argv[i]) * 1024 * 1024;
-        } else if (arg == "-h" || arg == "--help") {
-            print_usage(argc, argv, params);
-            exit(0);
-        } else {
-            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            print_usage(argc, argv, params);
-            exit(0);
-        }
-    }
-    return true;
-}
-
-static ggml_backend_t create_backend() {
-    ggml_backend_t backend = NULL;
-#ifdef GGML_USE_CUDA
-    fprintf(stderr, "%s: using CUDA backend\n", __func__);
-    backend = ggml_backend_cuda_init(0); // init device 0
-    if (!backend) {
-        fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
-    }
-#elif GGML_USE_METAL
-    fprintf(stderr, "%s: using Metal backend\n", __func__);
-    backend = ggml_backend_metal_init();
-    if (!backend) {
-        fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
-    }
-#elif GGML_USE_VULKAN
-    fprintf(stderr, "%s: using Vulkan backend\n", __func__);
-    backend = ggml_backend_vk_init(0); // init device 0
-    if (!backend) {
-        fprintf(stderr, "%s: ggml_backend_vulkan_init() failed\n", __func__);
-    }
-#elif GGML_USE_SYCL
-    fprintf(stderr, "%s: using SYCL backend\n", __func__);
-    backend = ggml_backend_sycl_init(0); // init device 0
-    if (!backend) {
-        fprintf(stderr, "%s: ggml_backend_sycl_init() failed\n", __func__);
-    }
-#endif
-
-    // if there aren't GPU Backends fallback to CPU backend
-    if (!backend) {
-        fprintf(stderr, "%s: using CPU backend\n", __func__);
-        backend = ggml_backend_cpu_init();
-    }
-    return backend;
-}
-
-static void get_backend_memory(size_t * free_mem, size_t * total_mem) {
-#ifdef GGML_USE_CUDA
-    ggml_backend_cuda_get_device_memory(0, free_mem, total_mem);
-#elif GGML_USE_VULKAN
-    ggml_backend_vk_get_device_memory(0, free_mem, total_mem);
-#elif GGML_USE_SYCL
-    ggml_backend_sycl_get_device_memory(0, free_mem, total_mem);
-#else
-    #ifdef _WIN32
-        MEMORYSTATUSEX status;
-        status.dwLength = sizeof(status);
-        GlobalMemoryStatusEx(&status);
-        *total_mem = status.ullTotalPhys;
-        *free_mem = status.ullAvailPhys;
-    #else
-        long pages = sysconf(_SC_PHYS_PAGES);
-        long page_size = sysconf(_SC_PAGE_SIZE);
-        *total_mem = pages * page_size;
-        *free_mem = *total_mem;
-    #endif
-#endif
-}
-
-int main(int argc, char * argv[]) {
-    rpc_server_params params;
-    if (!rpc_server_params_parse(argc, argv, params)) {
-        fprintf(stderr, "Invalid parameters\n");
-        return 1;
-    }
-
-    if (params.host != "127.0.0.1") {
-        fprintf(stderr, "\n");
-        fprintf(stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
-        fprintf(stderr, "WARNING: Host ('%s') is != '127.0.0.1'\n", params.host.c_str());
-        fprintf(stderr, "         Never expose the RPC server to an open network!\n");
-        fprintf(stderr, "         This is an experimental feature and is not secure!\n");
-        fprintf(stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
-        fprintf(stderr, "\n");
-    }
-
-    ggml_backend_t backend = create_backend();
-    if (!backend) {
-        fprintf(stderr, "Failed to create backend\n");
-        return 1;
-    }
-    std::string endpoint = params.host + ":" + std::to_string(params.port);
-    size_t free_mem, total_mem;
-    if (params.backend_mem > 0) {
-        free_mem = params.backend_mem;
-        total_mem = params.backend_mem;
-    } else {
-        get_backend_memory(&free_mem, &total_mem);
-    }
-    printf("Starting RPC server on %s, backend memory: %zu MB\n", endpoint.c_str(), free_mem / (1024 * 1024));
-    ggml_backend_rpc_start_server(backend, endpoint.c_str(), free_mem, total_mem);
-    ggml_backend_free(backend);
-    return 0;
-}
--- a/examples/run/CMakeLists.txt
+++ b/examples/run/CMakeLists.txt
@@ -1,5 +0,0 @@
-set(TARGET llama-run)
-add_executable(${TARGET} run.cpp linenoise.cpp/linenoise.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_17)
--- a/examples/server/public/index.html.gz
+++ b/examples/server/public/index.html.gz
--- a/examples/server/webui/postcss.config.js
+++ b/examples/server/webui/postcss.config.js
@@ -1,6 +0,0 @@
-export default {
-  plugins: {
-    tailwindcss: {},
-    autoprefixer: {},
-  },
-}
--- a/examples/server/webui/src/components/Header.tsx
+++ b/examples/server/webui/src/components/Header.tsx
@@ -1,179 +0,0 @@
-import { useEffect, useState } from 'react';
-import StorageUtils from '../utils/storage';
-import { useAppContext } from '../utils/app.context';
-import { classNames } from '../utils/misc';
-import daisyuiThemes from 'daisyui/src/theming/themes';
-import { THEMES } from '../Config';
-import { useNavigate } from 'react-router';
-
-export default function Header() {
-  const navigate = useNavigate();
-  const [selectedTheme, setSelectedTheme] = useState(StorageUtils.getTheme());
-  const { setShowSettings } = useAppContext();
-
-  const setTheme = (theme: string) => {
-    StorageUtils.setTheme(theme);
-    setSelectedTheme(theme);
-  };
-
-  useEffect(() => {
-    document.body.setAttribute('data-theme', selectedTheme);
-    document.body.setAttribute(
-      'data-color-scheme',
-      // @ts-expect-error daisyuiThemes complains about index type, but it should work
-      daisyuiThemes[selectedTheme]?.['color-scheme'] ?? 'auto'
-    );
-  }, [selectedTheme]);
-
-  const { isGenerating, viewingChat } = useAppContext();
-  const isCurrConvGenerating = isGenerating(viewingChat?.conv.id ?? '');
-
-  const removeConversation = () => {
-    if (isCurrConvGenerating || !viewingChat) return;
-    const convId = viewingChat?.conv.id;
-    if (window.confirm('Are you sure to delete this conversation?')) {
-      StorageUtils.remove(convId);
-      navigate('/');
-    }
-  };
-
-  const downloadConversation = () => {
-    if (isCurrConvGenerating || !viewingChat) return;
-    const convId = viewingChat?.conv.id;
-    const conversationJson = JSON.stringify(viewingChat, null, 2);
-    const blob = new Blob([conversationJson], { type: 'application/json' });
-    const url = URL.createObjectURL(blob);
-    const a = document.createElement('a');
-    a.href = url;
-    a.download = `conversation_${convId}.json`;
-    document.body.appendChild(a);
-    a.click();
-    document.body.removeChild(a);
-    URL.revokeObjectURL(url);
-  };
-
-  return (
-    <div className="flex flex-row items-center pt-6 pb-6 sticky top-0 z-10 bg-base-100">
-      {/* open sidebar button */}
-      <label htmlFor="toggle-drawer" className="btn btn-ghost lg:hidden">
-        <svg
-          xmlns="http://www.w3.org/2000/svg"
-          width="16"
-          height="16"
-          fill="currentColor"
-          className="bi bi-list"
-          viewBox="0 0 16 16"
-        >
-          <path
-            fillRule="evenodd"
-            d="M2.5 12a.5.5 0 0 1 .5-.5h10a.5.5 0 0 1 0 1H3a.5.5 0 0 1-.5-.5m0-4a.5.5 0 0 1 .5-.5h10a.5.5 0 0 1 0 1H3a.5.5 0 0 1-.5-.5m0-4a.5.5 0 0 1 .5-.5h10a.5.5 0 0 1 0 1H3a.5.5 0 0 1-.5-.5"
-          />
-        </svg>
-      </label>
-
-      <div className="grow text-2xl font-bold ml-2">llama.cpp</div>
-
-      {/* action buttons (top right) */}
-      <div className="flex items-center">
-        {viewingChat && (
-          <div className="dropdown dropdown-end">
-            {/* "..." button */}
-            <button
-              tabIndex={0}
-              role="button"
-              className="btn m-1"
-              disabled={isCurrConvGenerating}
-            >
-              <svg
-                xmlns="http://www.w3.org/2000/svg"
-                width="16"
-                height="16"
-                fill="currentColor"
-                className="bi bi-three-dots-vertical"
-                viewBox="0 0 16 16"
-              >
-                <path d="M9.5 13a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0m0-5a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0m0-5a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0" />
-              </svg>
-            </button>
-            {/* dropdown menu */}
-            <ul
-              tabIndex={0}
-              className="dropdown-content menu bg-base-100 rounded-box z-[1] w-52 p-2 shadow"
-            >
-              <li onClick={downloadConversation}>
-                <a>Download</a>
-              </li>
-              <li className="text-error" onClick={removeConversation}>
-                <a>Delete</a>
-              </li>
-            </ul>
-          </div>
-        )}
-
-        <div className="tooltip tooltip-bottom" data-tip="Settings">
-          <button className="btn" onClick={() => setShowSettings(true)}>
-            {/* settings button */}
-            <svg
-              xmlns="http://www.w3.org/2000/svg"
-              width="16"
-              height="16"
-              fill="currentColor"
-              className="bi bi-gear"
-              viewBox="0 0 16 16"
-            >
-              <path d="M8 4.754a3.246 3.246 0 1 0 0 6.492 3.246 3.246 0 0 0 0-6.492M5.754 8a2.246 2.246 0 1 1 4.492 0 2.246 2.246 0 0 1-4.492 0" />
-              <path d="M9.796 1.343c-.527-1.79-3.065-1.79-3.592 0l-.094.319a.873.873 0 0 1-1.255.52l-.292-.16c-1.64-.892-3.433.902-2.54 2.541l.159.292a.873.873 0 0 1-.52 1.255l-.319.094c-1.79.527-1.79 3.065 0 3.592l.319.094a.873.873 0 0 1 .52 1.255l-.16.292c-.892 1.64.901 3.434 2.541 2.54l.292-.159a.873.873 0 0 1 1.255.52l.094.319c.527 1.79 3.065 1.79 3.592 0l.094-.319a.873.873 0 0 1 1.255-.52l.292.16c1.64.893 3.434-.902 2.54-2.541l-.159-.292a.873.873 0 0 1 .52-1.255l.319-.094c1.79-.527 1.79-3.065 0-3.592l-.319-.094a.873.873 0 0 1-.52-1.255l.16-.292c.893-1.64-.902-3.433-2.541-2.54l-.292.159a.873.873 0 0 1-1.255-.52zm-2.633.283c.246-.835 1.428-.835 1.674 0l.094.319a1.873 1.873 0 0 0 2.693 1.115l.291-.16c.764-.415 1.6.42 1.184 1.185l-.159.292a1.873 1.873 0 0 0 1.116 2.692l.318.094c.835.246.835 1.428 0 1.674l-.319.094a1.873 1.873 0 0 0-1.115 2.693l.16.291c.415.764-.42 1.6-1.185 1.184l-.291-.159a1.873 1.873 0 0 0-2.693 1.116l-.094.318c-.246.835-1.428.835-1.674 0l-.094-.319a1.873 1.873 0 0 0-2.692-1.115l-.292.16c-.764.415-1.6-.42-1.184-1.185l.159-.291A1.873 1.873 0 0 0 1.945 8.93l-.319-.094c-.835-.246-.835-1.428 0-1.674l.319-.094A1.873 1.873 0 0 0 3.06 4.377l-.16-.292c-.415-.764.42-1.6 1.185-1.184l.292.159a1.873 1.873 0 0 0 2.692-1.115z" />
-            </svg>
-          </button>
-        </div>
-
-        {/* theme controller is copied from https://daisyui.com/components/theme-controller/ */}
-        <div className="tooltip tooltip-bottom" data-tip="Themes">
-          <div className="dropdown dropdown-end dropdown-bottom">
-            <div tabIndex={0} role="button" className="btn m-1">
-              <svg
-                xmlns="http://www.w3.org/2000/svg"
-                width="16"
-                height="16"
-                fill="currentColor"
-                className="bi bi-palette2"
-                viewBox="0 0 16 16"
-              >
-                <path d="M0 .5A.5.5 0 0 1 .5 0h5a.5.5 0 0 1 .5.5v5.277l4.147-4.131a.5.5 0 0 1 .707 0l3.535 3.536a.5.5 0 0 1 0 .708L10.261 10H15.5a.5.5 0 0 1 .5.5v5a.5.5 0 0 1-.5.5H3a3 3 0 0 1-2.121-.879A3 3 0 0 1 0 13.044m6-.21 7.328-7.3-2.829-2.828L6 7.188zM4.5 13a1.5 1.5 0 1 0-3 0 1.5 1.5 0 0 0 3 0M15 15v-4H9.258l-4.015 4zM0 .5v12.495zm0 12.495V13z" />
-              </svg>
-            </div>
-            <ul
-              tabIndex={0}
-              className="dropdown-content bg-base-300 rounded-box z-[1] w-52 p-2 shadow-2xl h-80 overflow-y-auto"
-            >
-              <li>
-                <button
-                  className={classNames({
-                    'btn btn-sm btn-block btn-ghost justify-start': true,
-                    'btn-active': selectedTheme === 'auto',
-                  })}
-                  onClick={() => setTheme('auto')}
-                >
-                  auto
-                </button>
-              </li>
-              {THEMES.map((theme) => (
-                <li key={theme}>
-                  <input
-                    type="radio"
-                    name="theme-dropdown"
-                    className="theme-controller btn btn-sm btn-block btn-ghost justify-start"
-                    aria-label={theme}
-                    value={theme}
-                    checked={selectedTheme === theme}
-                    onChange={(e) => e.target.checked && setTheme(theme)}
-                  />
-                </li>
-              ))}
-            </ul>
-          </div>
-        </div>
-      </div>
-    </div>
-  );
-}
--- a/examples/server/webui/src/components/Sidebar.tsx
+++ b/examples/server/webui/src/components/Sidebar.tsx
@@ -1,96 +0,0 @@
-import { useEffect, useState } from 'react';
-import { classNames } from '../utils/misc';
-import { Conversation } from '../utils/types';
-import StorageUtils from '../utils/storage';
-import { useNavigate, useParams } from 'react-router';
-
-export default function Sidebar() {
-  const params = useParams();
-  const navigate = useNavigate();
-
-  const [conversations, setConversations] = useState<Conversation[]>([]);
-  const [currConv, setCurrConv] = useState<Conversation | null>(null);
-
-  useEffect(() => {
-    StorageUtils.getOneConversation(params.convId ?? '').then(setCurrConv);
-  }, [params.convId]);
-
-  useEffect(() => {
-    const handleConversationChange = async () => {
-      setConversations(await StorageUtils.getAllConversations());
-    };
-    StorageUtils.onConversationChanged(handleConversationChange);
-    handleConversationChange();
-    return () => {
-      StorageUtils.offConversationChanged(handleConversationChange);
-    };
-  }, []);
-
-  return (
-    <>
-      <input
-        id="toggle-drawer"
-        type="checkbox"
-        className="drawer-toggle"
-        defaultChecked
-      />
-
-      <div className="drawer-side h-screen lg:h-screen z-50 lg:max-w-64">
-        <label
-          htmlFor="toggle-drawer"
-          aria-label="close sidebar"
-          className="drawer-overlay"
-        ></label>
-        <div className="flex flex-col bg-base-200 min-h-full max-w-64 py-4 px-4">
-          <div className="flex flex-row items-center justify-between mb-4 mt-4">
-            <h2 className="font-bold ml-4">Conversations</h2>
-
-            {/* close sidebar button */}
-            <label htmlFor="toggle-drawer" className="btn btn-ghost lg:hidden">
-              <svg
-                xmlns="http://www.w3.org/2000/svg"
-                width="16"
-                height="16"
-                fill="currentColor"
-                className="bi bi-arrow-bar-left"
-                viewBox="0 0 16 16"
-              >
-                <path
-                  fillRule="evenodd"
-                  d="M12.5 15a.5.5 0 0 1-.5-.5v-13a.5.5 0 0 1 1 0v13a.5.5 0 0 1-.5.5M10 8a.5.5 0 0 1-.5.5H3.707l2.147 2.146a.5.5 0 0 1-.708.708l-3-3a.5.5 0 0 1 0-.708l3-3a.5.5 0 1 1 .708.708L3.707 7.5H9.5a.5.5 0 0 1 .5.5"
-                />
-              </svg>
-            </label>
-          </div>
-
-          {/* list of conversations */}
-          <div
-            className={classNames({
-              'btn btn-ghost justify-start': true,
-              'btn-active': !currConv,
-            })}
-            onClick={() => navigate('/')}
-          >
-            + New conversation
-          </div>
-          {conversations.map((conv) => (
-            <div
-              key={conv.id}
-              className={classNames({
-                'btn btn-ghost justify-start font-normal': true,
-                'btn-active': conv.id === currConv?.id,
-              })}
-              onClick={() => navigate(`/chat/${conv.id}`)}
-              dir="auto"
-            >
-              <span className="truncate">{conv.name}</span>
-            </div>
-          ))}
-          <div className="text-center text-xs opacity-40 mt-auto mx-4">
-            Conversations are saved to browser's IndexedDB
-          </div>
-        </div>
-      </div>
-    </>
-  );
-}
--- a/examples/server_embd.py
+++ b/examples/server_embd.py
@@ -15,7 +15,7 @@ async def main():
    model_url = "http://127.0.0.1:6900"
    responses: list[requests.Response] = await asyncio.gather(*[requests_post_async(
        url= f"{model_url}/embedding",
-        json= {"content": str(0)*1024}
+        json= {"content": "a "*1022}
    ) for i in range(n)])

    for response in responses:
--- a/examples/speculative-simple/speculative-simple.cpp
+++ b/examples/speculative-simple/speculative-simple.cpp
@@ -24,7 +24,7 @@ int main(int argc, char ** argv) {

    common_init();

-    if (params.speculative.model.empty()) {
+    if (params.speculative.model.path.empty()) {
        LOG_ERR("%s: --model-draft is required\n", __func__);
        return 1;
    }
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -46,7 +46,7 @@ int main(int argc, char ** argv) {

    common_init();

-    if (params.speculative.model.empty()) {
+    if (params.speculative.model.path.empty()) {
        LOG_ERR("%s: --model-draft is required\n", __func__);
        return 1;
    }
--- a/examples/sycl/build.sh
+++ b/examples/sycl/build.sh
@@ -8,10 +8,10 @@ cd build
 source /opt/intel/oneapi/setvars.sh

 #for FP16
-#cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON # faster for long-prompt inference
+#cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DLLAMA_CURL=OFF # faster for long-prompt inference

 #for FP32
-cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=OFF

 #build example/main
 #cmake --build . --config Release --target main
--- a/examples/sycl/win-build-sycl.bat
+++ b/examples/sycl/win-build-sycl.bat
@@ -13,10 +13,10 @@ if %errorlevel% neq 0 goto ERROR

 ::  for FP16
 ::  faster for long-prompt inference
-::  cmake -G "MinGW Makefiles" ..  -DGGML_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
+::  cmake -G "MinGW Makefiles" .. -DLLAMA_CURL=OFF -DGGML_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON

 ::  for FP32
-cmake -G "Ninja" ..  -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
+cmake -G "Ninja" .. -DLLAMA_CURL=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
 if %errorlevel% neq 0 goto ERROR
 ::  build example/main only
 ::  make main
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -100,9 +100,14 @@ else()
    set(INS_ENB ON)
 endif()

+message(DEBUG "GGML_NATIVE         : ${GGML_NATIVE}")
+message(DEBUG "GGML_NATIVE_DEFAULT : ${GGML_NATIVE_DEFAULT}")
+message(DEBUG "INS_ENB             : ${INS_ENB}")
+
 option(GGML_CPU_HBM          "ggml: use memkind for CPU HBM" OFF)
 option(GGML_CPU_AARCH64      "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
 option(GGML_CPU_KLEIDIAI     "ggml: use KleidiAI optimized kernels if applicable" OFF)
+option(GGML_SSE42            "ggml: enable SSE 4.2"          ${INS_ENB})
 option(GGML_AVX              "ggml: enable AVX"              ${INS_ENB})
 option(GGML_AVX_VNNI         "ggml: enable AVX-VNNI"         OFF)
 option(GGML_AVX2             "ggml: enable AVX2"             ${INS_ENB})
@@ -123,10 +128,12 @@ endif()
 option(GGML_LASX             "ggml: enable lasx"             ON)
 option(GGML_LSX              "ggml: enable lsx"              ON)
 option(GGML_RVV              "ggml: enable rvv"              ON)
+option(GGML_RV_ZFH           "ggml: enable riscv zfh"        OFF)
 option(GGML_VXE              "ggml: enable vxe"              ON)

 option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
-set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
+set(GGML_CPU_ARM_ARCH        "" CACHE STRING "ggml: CPU architecture for ARM")
+set(GGML_CPU_POWERPC_CPUTYPE "" CACHE STRING "ggml: CPU type for PowerPC")


 if (WIN32)
@@ -164,7 +171,6 @@ option(GGML_HIP                             "ggml: use HIP"
 option(GGML_HIP_GRAPHS                      "ggml: use HIP graph, experimental, slow"         OFF)
 option(GGML_HIP_NO_VMM                      "ggml: do not try to use HIP VMM"                 ON)
 option(GGML_HIP_ROCWMMA_FATTN               "ggml: enable rocWMMA for FlashAttention"         OFF)
-option(GGML_HIP_UMA                         "ggml: use HIP unified memory architecture"       OFF)
 option(GGML_VULKAN                          "ggml: use Vulkan"                                OFF)
 option(GGML_VULKAN_CHECK_RESULTS            "ggml: run Vulkan op checks"                      OFF)
 option(GGML_VULKAN_DEBUG                    "ggml: enable Vulkan debug output"                OFF)
@@ -354,3 +360,29 @@ write_basic_package_version_file(
 install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml-config.cmake
              ${CMAKE_CURRENT_BINARY_DIR}/ggml-version.cmake
        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ggml)
+
+if (MSVC)
+    set(MSVC_WARNING_FLAGS
+        /wd4005  # Macro redefinition
+        /wd4244  # Conversion from one type to another type, possible loss of data
+        /wd4267  # Conversion from 'size_t' to a smaller type, possible loss of data
+        /wd4996  # Disable POSIX deprecation warnings
+        /wd4702  # Unreachable code warnings
+    )
+    function(disable_msvc_warnings target_name)
+        if(TARGET ${target_name})
+            target_compile_options(${target_name} PRIVATE ${MSVC_WARNING_FLAGS})
+        endif()
+    endfunction()
+
+    disable_msvc_warnings(ggml-base)
+    disable_msvc_warnings(ggml)
+    disable_msvc_warnings(ggml-cpu)
+    disable_msvc_warnings(ggml-cpu-x64)
+    disable_msvc_warnings(ggml-cpu-sse42)
+    disable_msvc_warnings(ggml-cpu-sandybridge)
+    disable_msvc_warnings(ggml-cpu-haswell)
+    disable_msvc_warnings(ggml-cpu-skylakex)
+    disable_msvc_warnings(ggml-cpu-icelake)
+    disable_msvc_warnings(ggml-cpu-alderlake)
+endif()
--- a/ggml/cmake/GitVars.cmake
+++ b/ggml/cmake/GitVars.cmake
@@ -0,0 +1,22 @@
+find_package(Git)
+
+# the commit's SHA1
+execute_process(COMMAND
+    "${GIT_EXECUTABLE}" describe --match=NeVeRmAtCh --always --abbrev=8
+    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+    OUTPUT_VARIABLE GIT_SHA1
+    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+# the date of the commit
+execute_process(COMMAND
+    "${GIT_EXECUTABLE}" log -1 --format=%ad --date=local
+    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+    OUTPUT_VARIABLE GIT_DATE
+    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+# the subject of the commit
+execute_process(COMMAND
+    "${GIT_EXECUTABLE}" log -1 --format=%s
+    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+    OUTPUT_VARIABLE GIT_COMMIT_SUBJECT
+    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
--- a/ggml/cmake/ggml-config.cmake.in
+++ b/ggml/cmake/ggml-config.cmake.in
@@ -5,7 +5,7 @@

 set_and_check(GGML_INCLUDE_DIR "@PACKAGE_GGML_INCLUDE_INSTALL_DIR@")
 set_and_check(GGML_LIB_DIR "@PACKAGE_GGML_LIB_INSTALL_DIR@")
-set_and_check(GGML_BIN_DIR "@PACKAGE_GGML_BIN_INSTALL_DIR@")
+#set_and_check(GGML_BIN_DIR "@PACKAGE_GGML_BIN_INSTALL_DIR@")

 find_package(Threads REQUIRED)

--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -38,7 +38,7 @@ extern "C" {
    GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer  (ggml_backend_buffer_type_t buft, size_t size);
    GGML_API size_t                ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
    GGML_API size_t                ggml_backend_buft_get_max_size  (ggml_backend_buffer_type_t buft);
-    GGML_API size_t                ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
+    GGML_API size_t                ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
    GGML_API bool                  ggml_backend_buft_is_host       (ggml_backend_buffer_type_t buft);
    GGML_API ggml_backend_dev_t    ggml_backend_buft_get_device    (ggml_backend_buffer_type_t buft);

@@ -59,7 +59,7 @@ extern "C" {
    GGML_API enum ggml_status               ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
    GGML_API size_t                         ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
    GGML_API size_t                         ggml_backend_buffer_get_max_size  (ggml_backend_buffer_t buffer);
-    GGML_API size_t                         ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API size_t                         ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor);
    GGML_API void                           ggml_backend_buffer_clear         (ggml_backend_buffer_t buffer, uint8_t value);
    GGML_API bool                           ggml_backend_buffer_is_host       (ggml_backend_buffer_t buffer);
    GGML_API void                           ggml_backend_buffer_set_usage     (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
--- a/ggml/include/ggml-cpp.h
+++ b/ggml/include/ggml-cpp.h
@@ -24,7 +24,7 @@ typedef std::unique_ptr<gguf_context, gguf_context_deleter> gguf_context_ptr;

 struct ggml_gallocr_deleter { void operator()(ggml_gallocr_t galloc) { ggml_gallocr_free(galloc); } };

-typedef std::unique_ptr<ggml_gallocr_t, ggml_gallocr_deleter> ggml_gallocr_ptr;
+typedef std::unique_ptr<ggml_gallocr, ggml_gallocr_deleter> ggml_gallocr_ptr;

 // ggml-backend

--- a/Show More
+++ b/Show More