Compare commits


1 Commit

Author: 0cc4m
SHA1: 9af90481d0
Message: Vulkan: Add renderdoc tracing support
Date: 2025-01-12 13:47:36 +00:00
234 changed files with 8875 additions and 24237 deletions

View File

@@ -2,10 +2,6 @@ ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION AS build
ARG TARGETARCH
ARG GGML_CPU_ARM_ARCH=armv8-a
RUN apt-get update && \
apt-get install -y build-essential git cmake libcurl4-openssl-dev
@@ -13,14 +9,7 @@ WORKDIR /app
COPY . .
RUN if [ "$TARGETARCH" = "amd64" ]; then \
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
elif [ "$TARGETARCH" = "arm64" ]; then \
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=${GGML_CPU_ARM_ARCH}; \
else \
echo "Unsupported architecture"; \
exit 1; \
fi && \
RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \
cmake --build build -j $(nproc)
RUN mkdir -p /app/lib && \

View File

@@ -13,13 +13,9 @@ elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
exec ./llama-quantize "$@"
elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
exec ./llama-cli "$@"
elif [[ "$arg1" == '--bench' || "$arg1" == '-b' ]]; then
exec ./llama-bench "$@"
elif [[ "$arg1" == '--perplexity' || "$arg1" == '-p' ]]; then
exec ./llama-perplexity "$@"
elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
echo "Converting PTH to GGML..."
for i in $(ls $1/$2/ggml-model-f16.bin*); do
for i in `ls $1/$2/ggml-model-f16.bin*`; do
if [ -f "${i/f16/q4_0}" ]; then
echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
else
@@ -34,10 +30,6 @@ else
echo "Available commands: "
echo " --run (-r): Run a model previously converted into ggml"
echo " ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
echo " --bench (-b): Benchmark the performance of the inference for various parameters."
echo " ex: -m model.gguf"
echo " --perplexity (-p): Measure the perplexity of a model over a given text."
echo " ex: -m model.gguf -f file.txt"
echo " --convert (-c): Convert a llama model into ggml"
echo " ex: --outtype f16 \"/models/7B/\" "
echo " --quantize (-q): Optimize with quantization process ggml"

View File

@@ -1,4 +1,4 @@
ARG UBUNTU_VERSION=24.04
ARG UBUNTU_VERSION=jammy
FROM ubuntu:$UBUNTU_VERSION AS build
@@ -7,7 +7,7 @@ RUN apt update && apt install -y git build-essential cmake wget
# Install Vulkan SDK and cURL
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-noble.list https://packages.lunarg.com/vulkan/lunarg-vulkan-noble.list && \
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
apt update -y && \
apt-get install -y vulkan-sdk libcurl4-openssl-dev curl
@@ -34,7 +34,7 @@ RUN mkdir -p /app/full \
FROM ubuntu:$UBUNTU_VERSION AS base
RUN apt-get update \
&& apt-get install -y libgomp1 curl libvulkan-dev \
&& apt-get install -y libgomp1 curl\
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
@@ -55,9 +55,8 @@ RUN apt-get update \
git \
python3 \
python3-pip \
python3-wheel \
&& pip install --break-system-packages --upgrade setuptools \
&& pip install --break-system-packages -r requirements.txt \
&& pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \

View File

@@ -40,11 +40,3 @@ indent_style = tab
[examples/cvector-generator/*.txt]
trim_trailing_whitespace = unset
insert_final_newline = unset
[models/templates/*.jinja]
indent_style = unset
indent_size = unset
end_of_line = unset
charset = unset
trim_trailing_whitespace = unset
insert_final_newline = unset

View File

@@ -43,12 +43,6 @@ jobs:
with:
fetch-depth: 0
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: macOS-latest-cmake-arm64
evict-old-files: 1d
- name: Dependencies
id: depends
continue-on-error: true
@@ -62,7 +56,6 @@ jobs:
mkdir build
cd build
cmake .. \
-DCMAKE_BUILD_RPATH="@loader_path" \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_CURL=ON \
-DGGML_METAL_USE_BF16=ON \
@@ -94,7 +87,6 @@ jobs:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
cp LICENSE ./build/bin/
cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*
- name: Upload artifacts
@@ -114,12 +106,6 @@ jobs:
with:
fetch-depth: 0
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: macOS-latest-cmake-x64
evict-old-files: 1d
- name: Dependencies
id: depends
continue-on-error: true
@@ -133,7 +119,6 @@ jobs:
# Metal is disabled due to intermittent failures with Github runners not having a GPU:
# https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
cmake -B build \
-DCMAKE_BUILD_RPATH="@loader_path" \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_CURL=ON \
-DGGML_METAL=OFF \
@@ -164,7 +149,6 @@ jobs:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
cp LICENSE ./build/bin/
cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
- name: Upload artifacts
@@ -174,8 +158,8 @@ jobs:
path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
name: llama-bin-macos-x64.zip
ubuntu-cpu-cmake:
runs-on: ubuntu-22.04
ubuntu-latest-cmake:
runs-on: ubuntu-latest
steps:
- name: Clone
@@ -184,12 +168,6 @@ jobs:
with:
fetch-depth: 0
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: ubuntu-cpu-cmake
evict-old-files: 1d
- name: Dependencies
id: depends
run: |
@@ -201,10 +179,7 @@ jobs:
run: |
mkdir build
cd build
cmake .. \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_CURL=ON \
-DGGML_RPC=ON
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_RPC=ON
cmake --build . --config Release -j $(nproc)
- name: Test
@@ -242,7 +217,6 @@ jobs:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
cp LICENSE ./build/bin/
cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/*
- name: Upload artifacts
@@ -260,19 +234,13 @@ jobs:
strategy:
matrix:
sanitizer: [ADDRESS, THREAD, UNDEFINED]
build_type: [Debug]
build_type: [Debug, Release]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: ubuntu-latest-cmake-sanitizer-${{ matrix.sanitizer }}
evict-old-files: 1d
- name: Dependencies
id: depends
run: |
@@ -285,10 +253,7 @@ jobs:
run: |
mkdir build
cd build
cmake .. \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
- name: Build (no OpenMP)
@@ -297,11 +262,7 @@ jobs:
run: |
mkdir build
cd build
cmake .. \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DGGML_OPENMP=OFF
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DGGML_OPENMP=OFF
cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
- name: Test
@@ -320,12 +281,6 @@ jobs:
id: checkout
uses: actions/checkout@v4
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: ubuntu-latest-cmake-rpc
evict-old-files: 1d
- name: Dependencies
id: depends
run: |
@@ -337,8 +292,7 @@ jobs:
run: |
mkdir build
cd build
cmake .. \
-DGGML_RPC=ON
cmake -DGGML_RPC=ON ..
cmake --build . --config Release -j $(nproc)
- name: Test
@@ -355,12 +309,6 @@ jobs:
id: checkout
uses: actions/checkout@v4
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: ubuntu-22-cmake-vulkan
evict-old-files: 1d
- name: Dependencies
id: depends
run: |
@@ -374,16 +322,14 @@ jobs:
run: |
mkdir build
cd build
cmake .. \
-DGGML_VULKAN=ON
cmake -DGGML_VULKAN=ON ..
cmake --build . --config Release -j $(nproc)
- name: Test
id: cmake_test
run: |
cd build
# This is using llvmpipe and runs slower than other backends
ctest -L main --verbose --timeout 1800
ctest -L main --verbose --timeout 900
ubuntu-22-cmake-hip:
runs-on: ubuntu-22.04
@@ -400,27 +346,16 @@ jobs:
sudo apt-get update
sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: ubuntu-22-cmake-hip
evict-old-files: 1d
- name: Build with native CMake HIP support
id: cmake_build
run: |
cmake -B build -S . \
-DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
-DGGML_HIP=ON
cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DGGML_HIP=ON
cmake --build build --config Release -j $(nproc)
- name: Build with legacy HIP support
id: cmake_build_legacy_hip
run: |
cmake -B build2 -S . \
-DCMAKE_C_COMPILER=hipcc \
-DCMAKE_CXX_COMPILER=hipcc \
-DGGML_HIP=ON
cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIP=ON
cmake --build build2 --config Release -j $(nproc)
ubuntu-22-cmake-musa:
@@ -438,17 +373,10 @@ jobs:
apt-get update
apt-get install -y build-essential git cmake libcurl4-openssl-dev
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: ubuntu-22-cmake-musa
evict-old-files: 1d
- name: Build with native CMake MUSA support
id: cmake_build
run: |
cmake -B build -S . \
-DGGML_MUSA=ON
cmake -B build -S . -DGGML_MUSA=ON
cmake --build build --config Release -j $(nproc)
ubuntu-22-cmake-sycl:
@@ -483,22 +411,13 @@ jobs:
id: checkout
uses: actions/checkout@v4
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: ubuntu-22-cmake-sycl
evict-old-files: 1d
- name: Build
id: cmake_build
run: |
source /opt/intel/oneapi/setvars.sh
mkdir build
cd build
cmake .. \
-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx
cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
cmake --build . --config Release -j $(nproc)
ubuntu-22-cmake-sycl-fp16:
@@ -533,25 +452,48 @@ jobs:
id: checkout
uses: actions/checkout@v4
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: ubuntu-22-cmake-sycl-fp16
evict-old-files: 1d
- name: Build
id: cmake_build
run: |
source /opt/intel/oneapi/setvars.sh
mkdir build
cd build
cmake .. \
-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
-DGGML_SYCL_F16=ON
cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON ..
cmake --build . --config Release -j $(nproc)
# TODO: build with GGML_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know
# how to debug it.
# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584
# would be great if we fix these
macOS-latest-cmake:
runs-on: macos-latest
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Dependencies
id: depends
continue-on-error: true
run: |
brew update
- name: Build
id: cmake_build
run: |
sysctl -a
mkdir build
cd build
cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF ..
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
- name: Test
id: cmake_test
run: |
cd build
ctest -L main --verbose --timeout 900
macOS-latest-cmake-ios:
runs-on: macos-latest
@@ -560,12 +502,6 @@ jobs:
id: checkout
uses: actions/checkout@v4
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: macOS-latest-cmake-ios
evict-old-files: 1d
- name: Dependencies
id: depends
continue-on-error: true
@@ -597,12 +533,6 @@ jobs:
id: checkout
uses: actions/checkout@v4
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: macOS-latest-cmake-tvos
evict-old-files: 1d
- name: Dependencies
id: depends
continue-on-error: true
@@ -638,12 +568,6 @@ jobs:
id: checkout
uses: actions/checkout@v4
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: macOS-latest-swift
evict-old-files: 1d
- name: Dependencies
id: depends
continue-on-error: true
@@ -685,12 +609,6 @@ jobs:
- name: Clone
uses: actions/checkout@v4
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: windows-msys2
evict-old-files: 1d
- name: Setup ${{ matrix.sys }}
uses: msys2/setup-msys2@v2
with:
@@ -698,7 +616,6 @@ jobs:
msystem: ${{matrix.sys}}
install: >-
base-devel
git
mingw-w64-${{matrix.env}}-toolchain
mingw-w64-${{matrix.env}}-cmake
mingw-w64-${{matrix.env}}-openblas
@@ -759,12 +676,6 @@ jobs:
with:
fetch-depth: 0
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: windows-latest-cmake-${{ matrix.build }}
evict-old-files: 1d
- name: Clone Kompute submodule
id: clone_kompute
if: ${{ matrix.build == 'kompute-x64' }}
@@ -885,7 +796,6 @@ jobs:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
Copy-Item .\examples\run\linenoise.cpp\LICENSE .\build\bin\Release\linenoise.cpp.txt
7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*
- name: Upload artifacts
@@ -903,8 +813,6 @@ jobs:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Install dependencies
env:
@@ -913,21 +821,9 @@ jobs:
apt update
apt install -y cmake build-essential ninja-build libgomp1 git
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: ubuntu-latest-cmake-cuda
evict-old-files: 1d
- name: Build with CMake
run: |
cmake -S . -B build -G Ninja \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_CUDA_ARCHITECTURES=89-real \
-DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined \
-DLLAMA_FATAL_WARNINGS=ON \
-DGGML_NATIVE=OFF \
-DGGML_CUDA=ON
cmake -S . -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=89-real -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined -DLLAMA_FATAL_WARNINGS=ON
cmake --build build
windows-2019-cmake-cuda:
@@ -945,12 +841,6 @@ jobs:
with:
fetch-depth: 0
- name: Install ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: ${{ github.job }}-${{ matrix.cuda }}-${{ matrix.build }}
evict-old-files: 1d
- name: Install Cuda Toolkit 11.7
if: ${{ matrix.cuda == '11.7' }}
run: |
@@ -1007,6 +897,11 @@ jobs:
echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
- name: Install ccache
uses: hendrikmuhs/ccache-action@v1.2
with:
key: ${{ github.job }}-${{ matrix.cuda }}-${{ matrix.build }}
- name: Install Ninja
id: install_ninja
run: |
@@ -1017,11 +912,7 @@ jobs:
shell: cmd
run: |
call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
cmake -S . -B build -G "Ninja Multi-Config" ^
-DLLAMA_BUILD_SERVER=ON ^
-DGGML_NATIVE=OFF ^
-DGGML_CUDA=ON ^
-DGGML_RPC=ON
cmake -S . -B build -G "Ninja Multi-Config" -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DGGML_RPC=ON
set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
cmake --build build --config Release -j %NINJA_JOBS% -t ggml
cmake --build build --config Release
@@ -1086,12 +977,6 @@ jobs:
with:
fetch-depth: 0
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: windows-latest-cmake-sycl
evict-old-files: 1d
- name: Install
run: |
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
@@ -1171,22 +1056,16 @@ jobs:
& 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
- name: Install ccache
uses: hendrikmuhs/ccache-action@v1.2.16
uses: hendrikmuhs/ccache-action@v1.2
with:
key: ${{ github.job }}
evict-old-files: 1d
- name: Build
id: cmake_build
run: |
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
cmake -G "Unix Makefiles" -B build -S . `
-DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
-DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-DCMAKE_BUILD_TYPE=Release `
-DGGML_HIP=ON `
-DGGML_RPC=ON
cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIP=ON -DCMAKE_BUILD_TYPE=Release -DGGML_RPC=ON
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
windows-latest-cmake-hip-release:
@@ -1204,12 +1083,6 @@ jobs:
with:
fetch-depth: 0
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: windows-latest-cmake-hip-release
evict-old-files: 1d
- name: Install
id: depends
run: |
@@ -1230,13 +1103,7 @@ jobs:
run: |
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
cmake -G "Unix Makefiles" -B build -S . `
-DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
-DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-DCMAKE_BUILD_TYPE=Release `
-DAMDGPU_TARGETS=${{ matrix.gpu_target }} `
-DGGML_HIP=ON `
-DGGML_RPC=ON
cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIP=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=${{ matrix.gpu_target }} -DGGML_RPC=ON
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
md "build\bin\rocblas\library\"
cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
@@ -1307,12 +1174,6 @@ jobs:
- name: Clone
uses: actions/checkout@v4
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: android-build
evict-old-files: 1d
- name: Set up JDK
uses: actions/setup-java@v3
with:
@@ -1336,7 +1197,8 @@ jobs:
runs-on: ubuntu-latest
needs:
- ubuntu-cpu-cmake
- ubuntu-latest-cmake
- macOS-latest-cmake
- windows-latest-cmake
- windows-2019-cmake-cuda
- windows-latest-cmake-hip-release
@@ -1350,12 +1212,6 @@ jobs:
with:
fetch-depth: 0
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: release
evict-old-files: 1d
- name: Determine tag name
id: tag
shell: bash
@@ -1601,37 +1457,3 @@ jobs:
# popd
# emcmake cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }}
# make
openEuler-latest-cmake-cann:
if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'Ascend NPU') }}
defaults:
run:
shell: bash -el {0}
runs-on: ubuntu-24.04-arm
strategy:
matrix:
cann:
- '8.0.rc3.beta1-910b-openeuler22.03-py3.10'
device:
- 'ascend910b3'
build:
- 'Release'
container: ascendai/cann:${{ matrix.cann }}
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Dependencies
run: |
yum update -y
yum install -y git gcc gcc-c++ make cmake
- name: Build
run: |
export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
cmake -S . -B build \
-DCMAKE_BUILD_TYPE=${{ matrix.build }} \
-DGGML_CANN=on \
-DSOC_TYPE=${{ matrix.device }}
cmake --build build -j $(nproc)

View File

@@ -28,11 +28,10 @@ jobs:
push_to_registry:
name: Push Docker image to Docker Hub
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
env:
COMMIT_SHA: ${{ github.sha }}
strategy:
fail-fast: false
matrix:
config:
# Multi-stage build

View File

@@ -112,9 +112,9 @@ jobs:
-DGGML_OPENMP=OFF ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Build (sanitizers)
id: cmake_build_sanitizers
if: ${{ matrix.sanitizer != '' && matrix.sanitizer != 'THREAD' }}
- name: Build
id: cmake_build
if: ${{ matrix.sanitizer != 'THREAD' }}
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
@@ -124,31 +124,12 @@ jobs:
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Build (sanitizers)
id: cmake_build
if: ${{ matrix.sanitizer == '' }}
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_BUILD_SERVER=ON \
-DLLAMA_CURL=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Tests
id: server_integration_tests
if: ${{ matrix.sanitizer == '' }}
run: |
cd examples/server/tests
./tests.sh
- name: Tests (sanitizers)
id: server_integration_tests_sanitizers
if: ${{ matrix.sanitizer != '' }}
run: |
cd examples/server/tests
LLAMA_SANITIZE=1 ./tests.sh
- name: Slow tests
id: server_integration_tests_slow
if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
@@ -205,7 +186,7 @@ jobs:
run: |
cd examples/server/tests
$env:PYTHONIOENCODING = ":replace"
pytest -v -x -m "not slow"
pytest -v -x
- name: Slow tests
id: server_integration_tests_slow

.gitignore
View File

@@ -18,7 +18,6 @@
*.metallib
*.o
*.so
*.swp
*.tmp
# IDE / OS

View File

@@ -16,7 +16,6 @@ endif()
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
set(LLAMA_STANDALONE ON)
@@ -50,8 +49,6 @@ endif()
if (MSVC)
add_compile_options("$<$<COMPILE_LANGUAGE:C>:/utf-8>")
add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/utf-8>")
add_compile_options("$<$<COMPILE_LANGUAGE:C>:/bigobj>")
add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/bigobj>")
endif()
#
@@ -86,8 +83,11 @@ include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake)
# override ggml options
set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})
set(GGML_SANITIZE_THREAD ${LLAMA_SANITIZE_THREAD})
set(GGML_SANITIZE_ADDRESS ${LLAMA_SANITIZE_ADDRESS})
set(GGML_SANITIZE_UNDEFINED ${LLAMA_SANITIZE_UNDEFINED})
set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})
# change the default for these ggml options
if (NOT DEFINED GGML_LLAMAFILE)
@@ -117,62 +117,16 @@ llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL)
llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16)
llama_option_depr(WARNING LLAMA_CANN GGML_CANN)
if (NOT MSVC)
if (LLAMA_SANITIZE_THREAD)
message(STATUS "Using -fsanitize=thread")
add_compile_options(-fsanitize=thread)
link_libraries (-fsanitize=thread)
endif()
if (LLAMA_SANITIZE_ADDRESS)
message(STATUS "Using -fsanitize=address")
add_compile_options(-fsanitize=address -fno-omit-frame-pointer)
link_libraries (-fsanitize=address)
endif()
if (LLAMA_SANITIZE_UNDEFINED)
message(STATUS "Using -fsanitize=undefined")
add_compile_options(-fsanitize=undefined)
link_libraries (-fsanitize=undefined)
endif()
endif()
#
# 3rd-party
# build the library
#
if (NOT TARGET ggml)
add_subdirectory(ggml)
# ... otherwise assume ggml is added by a parent CMakeLists.txt
endif()
#
# build the library
#
add_subdirectory(src)
#
# utils, programs, examples and tests
#
if (LLAMA_BUILD_COMMON)
add_subdirectory(common)
endif()
if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
include(CTest)
add_subdirectory(tests)
endif()
if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES)
add_subdirectory(examples)
add_subdirectory(pocs)
endif()
#
# install
#
@@ -188,14 +142,27 @@ set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location o
set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")
# At the moment some compile definitions are placed within the ggml/src
# directory but not exported on the `ggml` target. This could be improved by
# determining _precisely_ which defines are necessary for the llama-config
# package.
#
set(GGML_TRANSIENT_DEFINES)
get_target_property(GGML_DIRECTORY ggml SOURCE_DIR)
get_directory_property(GGML_DIR_DEFINES DIRECTORY ${GGML_DIRECTORY} COMPILE_DEFINITIONS)
if (GGML_DIR_DEFINES)
list(APPEND GGML_TRANSIENT_DEFINES ${GGML_DIR_DEFINES})
endif()
get_target_property(GGML_TARGET_DEFINES ggml COMPILE_DEFINITIONS)
if (GGML_TARGET_DEFINES)
list(APPEND GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES})
endif()
get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)
# all public headers
set(LLAMA_PUBLIC_HEADERS
${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h
${CMAKE_CURRENT_SOURCE_DIR}/include/llama-cpp.h)
set_target_properties(llama
PROPERTIES
PUBLIC_HEADER "${LLAMA_PUBLIC_HEADERS}")
set_target_properties(llama PROPERTIES PUBLIC_HEADER "${LLAMA_PUBLIC_HEADERS}")
install(TARGETS llama LIBRARY PUBLIC_HEADER)
configure_package_config_file(
@@ -233,3 +200,21 @@ configure_file(cmake/llama.pc.in
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
DESTINATION lib/pkgconfig)
#
# utils, programs, examples and tests
#
if (LLAMA_BUILD_COMMON)
add_subdirectory(common)
endif()
if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
include(CTest)
add_subdirectory(tests)
endif()
if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES)
add_subdirectory(examples)
add_subdirectory(pocs)
endif()

View File

@@ -1,10 +1,10 @@
# Pull requests (for contributors)
- Test your changes:
- Execute [the full CI locally on your machine](ci/README.md) before publishing
- Verify that the perplexity and the performance are not affected negatively by your changes (use `llama-perplexity` and `llama-bench`)
- If you modified the `ggml` source, run the `test-backend-ops` tool to check whether different backend implementations of the `ggml` operators produce consistent results (this requires access to at least two different `ggml` backends)
- If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops`
- Execute [the full CI locally on your machine](ci/README.md) before publishing
- Verify that the perplexity and the performance are not affected negatively by your changes (use `llama-perplexity` and `llama-bench`)
- If you modified the `ggml` source, run the `test-backend-ops` tool to check whether different backend implementations of the `ggml` operators produce consistent results (this requires access to at least two different `ggml` backends)
- If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops`
- Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
- If your PR becomes stale, don't hesitate to ping the maintainers in the comments
@@ -20,104 +20,14 @@
- Avoid adding third-party dependencies, extra files, extra headers, etc.
- Always consider cross-compatibility with other operating systems and architectures
- Avoid fancy-looking modern STL constructs, use basic `for` loops, avoid templates, keep it simple
- Vertical alignment makes things more readable and easier to batch edit
- There are no strict rules for the code style, but try to follow the patterns in the code (indentation, spaces, etc.). Vertical alignment makes things more readable and easier to batch edit
- Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
- Use sized integer types such as `int32_t` in the public API, e.g. `size_t` may also be appropriate for allocation sizes or byte offsets
- Declare structs with `struct foo {}` instead of `typedef struct foo {} foo`
- In C++ code omit optional `struct` and `enum` keyword whenever they are not necessary
```cpp
// OK
llama_context * ctx;
const llama_rope_type rope_type;
// not OK
struct llama_context * ctx;
const enum llama_rope_type rope_type;
```
_(NOTE: this guideline is yet to be applied to the `llama.cpp` codebase. New code should follow this guideline.)_
- Try to follow the existing patterns in the code (indentation, spaces, etc.). In case of doubt use `clang-format` to format the added code
- For anything not covered in the current guidelines, refer to the [C++ Core Guidelines](https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines)
- Naming usually optimizes for common prefix (see https://github.com/ggerganov/ggml/pull/302#discussion_r1243240963)
- Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$
![matmul](media/matmul.png)
# Naming guidelines
- Use `snake_case` for function, variable and type names
- Naming usually optimizes for longest common prefix (see https://github.com/ggerganov/ggml/pull/302#discussion_r1243240963)
```cpp
// not OK
int small_number;
int big_number;
// OK
int number_small;
int number_big;
```
- Enum values are always in upper case and prefixed with the enum name
```cpp
enum llama_vocab_type {
LLAMA_VOCAB_TYPE_NONE = 0,
LLAMA_VOCAB_TYPE_SPM = 1,
LLAMA_VOCAB_TYPE_BPE = 2,
LLAMA_VOCAB_TYPE_WPM = 3,
LLAMA_VOCAB_TYPE_UGM = 4,
LLAMA_VOCAB_TYPE_RWKV = 5,
};
```
- The general naming pattern is `<class>_<method>`, with `<method>` being `<action>_<noun>`
```cpp
llama_model_init(); // class: "llama_model", method: "init"
llama_sampler_chain_remove(); // class: "llama_sampler_chain", method: "remove"
llama_sampler_get_seed(); // class: "llama_sampler", method: "get_seed"
llama_set_embeddings(); // class: "llama_context", method: "set_embeddings"
llama_n_threads(); // class: "llama_context", method: "n_threads"
llama_adapter_lora_free(); // class: "llama_adapter_lora", method: "free"
```
- The `get` `<action>` can be omitted
- The `<noun>` can be omitted if not necessary
- The `_context` suffix of the `<class>` is optional. Use it to disambiguate symbols when needed
- Use `init`/`free` for constructor/destructor `<action>`
- Use the `_t` suffix when a type is supposed to be opaque to the user - it's not relevant to them if it is a struct or anything else
```cpp
typedef struct llama_context * llama_context_t;
enum llama_pooling_type llama_pooling_type(const llama_context_t ctx);
```
_(NOTE: this guideline is yet to be applied to the `llama.cpp` codebase. New code should follow this guideline)_
- C/C++ filenames are all lowercase with dashes. Headers use the `.h` extension. Source files use the `.c` or `.cpp` extension
- Python filenames are all lowercase with underscores
- _(TODO: abbreviations usage)_
# Preprocessor directives
- _(TODO: add guidelines with examples and apply them to the codebase)_
```cpp
#ifdef FOO
#endif // FOO
```
# Documentation
- Documentation is a community effort
- When you need to look into the source code to figure out how to use an API consider adding a short summary to the header file for future reference
- When you notice incorrect or outdated documentation, please update it
# Resources
The Github issues, PRs and discussions contain a lot of information that can be useful to get familiar with the codebase. For convenience, some of the more important information is referenced from Github projects:

View File

@@ -52,7 +52,6 @@ TEST_TARGETS = \
tests/test-arg-parser \
tests/test-autorelease \
tests/test-backend-ops \
tests/test-chat \
tests/test-chat-template \
tests/test-double-float \
tests/test-grammar-integration \
@@ -984,7 +983,6 @@ OBJ_COMMON = \
$(DIR_COMMON)/ngram-cache.o \
$(DIR_COMMON)/sampling.o \
$(DIR_COMMON)/speculative.o \
$(DIR_COMMON)/chat.o \
$(DIR_COMMON)/build-info.o \
$(DIR_COMMON)/json-schema-to-grammar.o
@@ -1363,11 +1361,7 @@ llama-server: \
examples/server/httplib.h \
examples/server/index.html.hpp \
examples/server/loading.html.hpp \
common/chat.cpp \
common/chat.hpp \
common/chat-template.hpp \
common/json.hpp \
common/minja.hpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
@@ -1475,11 +1469,6 @@ tests/test-json-schema-to-grammar: tests/test-json-schema-to-grammar.cpp \
$(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
tests/test-chat: tests/test-chat.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
tests/test-opt: tests/test-opt.cpp \
$(OBJ_GGML)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)

View File

@@ -16,11 +16,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
## Hot topics
- **How to use [MTLResidencySet](https://developer.apple.com/documentation/metal/mtlresidencyset?language=objc) to keep the GPU memory active?** https://github.com/ggerganov/llama.cpp/pull/11427
- **VS Code extension for FIM completions:** https://github.com/ggml-org/llama.vscode
- Universal tool call support in `llama-server`: https://github.com/ggerganov/llama.cpp/pull/9639
- Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
- Introducing GGUF-my-LoRA https://github.com/ggerganov/llama.cpp/discussions/10123
- **Introducing GGUF-my-LoRA** https://github.com/ggerganov/llama.cpp/discussions/10123
- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669
- Hugging Face GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
@@ -208,7 +204,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs
- [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly
- [llama-swap](https://github.com/mostlygeek/llama-swap) - transparent proxy that adds automatic model switching with llama-server
- [Kalavai](https://github.com/kalavai-net/kalavai-client) - Crowdsource end to end LLM deployment at any scale
</details>
@@ -250,8 +245,6 @@ The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](htt
- [Trending](https://huggingface.co/models?library=gguf&sort=trending)
- [LLaMA](https://huggingface.co/models?sort=trending&search=llama+gguf)
You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from Hugging Face by using this CLI argument: `-hf <user>/<model>[:quant]`
After downloading a model, use the CLI tools to run it locally - see below.
`llama.cpp` requires the model to be stored in the [GGUF](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) file format. Models in other data formats can be converted to GGUF using the `convert_*.py` Python scripts in this repo.
@@ -270,12 +263,21 @@ To learn more about model quantization, [read this documentation](examples/quant
#### A CLI tool for accessing and experimenting with most of `llama.cpp`'s functionality.
- <details open>
<summary>Run in conversation mode</summary>
Models with a built-in chat template will automatically activate conversation mode. If this doesn't occur, you can manually enable it by adding `-cnv` and specifying a suitable chat template with `--chat-template NAME`
<summary>Run simple text completion</summary>
```bash
llama-cli -m model.gguf
llama-cli -m model.gguf -p "I believe the meaning of life is" -n 128
# I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey.
```
</details>
- <details>
<summary>Run in conversation mode</summary>
```bash
llama-cli -m model.gguf -p "You are a helpful assistant" -cnv
# > hi, who are you?
# Hi there! I'm your helpful assistant! I'm an AI-powered chatbot designed to assist and provide information to users like you. I'm here to help answer your questions, provide guidance, and offer support on a wide range of topics. I'm a friendly and knowledgeable AI, and I'm always happy to help with anything you need. What's on your mind, and how can I assist you today?
@@ -287,28 +289,17 @@ To learn more about model quantization, [read this documentation](examples/quant
</details>
- <details>
<summary>Run in conversation mode with custom chat template</summary>
<summary>Run with custom chat template</summary>
```bash
# use the "chatml" template (use -h to see the list of supported templates)
llama-cli -m model.gguf -cnv --chat-template chatml
# use the "chatml" template
llama-cli -m model.gguf -p "You are a helpful assistant" -cnv --chat-template chatml
# use a custom template
llama-cli -m model.gguf -cnv --in-prefix 'User: ' --reverse-prompt 'User:'
llama-cli -m model.gguf -p "You are a helpful assistant" -cnv --in-prefix 'User: ' --reverse-prompt 'User:'
```
</details>
- <details>
<summary>Run simple text completion</summary>
To disable conversation mode explicitly, use `-no-cnv`
```bash
llama-cli -m model.gguf -p "I believe the meaning of life is" -n 128 -no-cnv
# I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey.
```
[Supported templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
</details>
@@ -423,7 +414,7 @@ To learn more about model quantization, [read this documentation](examples/quant
</details>
[^1]: [examples/perplexity/README.md](./examples/perplexity/README.md)
[^1]: [examples/perplexity/README.md](examples/perplexity/README.md)
[^2]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity)
## [`llama-bench`](examples/llama-bench)

View File

@@ -326,17 +326,17 @@ function gg_run_open_llama_7b_v2 {
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
(time ./bin/llama-cli -no-cnv --model ${model_f16} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-cli -no-cnv --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-cli -no-cnv --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
@@ -460,17 +460,17 @@ function gg_run_pythia_1_4b {
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
(time ./bin/llama-cli -no-cnv --model ${model_f16} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-cli -no-cnv --model ${model_q8_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-cli -no-cnv --model ${model_q2_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q3_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q6_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-cli --model ${model_f16} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-cli --model ${model_q8_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-cli --model ${model_q4_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-cli --model ${model_q4_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-cli --model ${model_q5_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-cli --model ${model_q5_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-cli --model ${model_q2_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-cli --model ${model_q3_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-cli --model ${model_q4_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-cli --model ${model_q5_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-cli --model ${model_q6_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
@@ -591,17 +591,17 @@ function gg_run_pythia_2_8b {
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
(time ./bin/llama-cli -no-cnv --model ${model_f16} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-cli -no-cnv --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-cli -no-cnv --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log

View File

@@ -44,7 +44,7 @@ if(MSVC)
set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME})
else()
execute_process(
COMMAND sh -c "\"$@\" --version | head -1" _ ${CMAKE_C_COMPILER}
COMMAND sh -c "$@ --version | head -1" _ ${CMAKE_C_COMPILER}
OUTPUT_VARIABLE OUT
OUTPUT_STRIP_TRAILING_WHITESPACE
)

View File

@@ -3,13 +3,159 @@ set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)
set(GGML_STATIC @GGML_STATIC@)
set(GGML_NATIVE @GGML_NATIVE@)
set(GGML_LTO @GGML_LTO@)
set(GGML_CCACHE @GGML_CCACHE@)
set(GGML_AVX @GGML_AVX@)
set(GGML_AVX2 @GGML_AVX2@)
set(GGML_AVX512 @GGML_AVX512@)
set(GGML_AVX512_VBMI @GGML_AVX512_VBMI@)
set(GGML_AVX512_VNNI @GGML_AVX512_VNNI@)
set(GGML_AVX512_BF16 @GGML_AVX512_BF16@)
set(GGML_AMX_TILE @GGML_AMX_TILE@)
set(GGML_AMX_INT8 @GGML_AMX_INT8@)
set(GGML_AMX_BF16 @GGML_AMX_BF16@)
set(GGML_FMA @GGML_FMA@)
set(GGML_LASX @GGML_LASX@)
set(GGML_LSX @GGML_LSX@)
set(GGML_RVV @GGML_RVV@)
set(GGML_SVE @GGML_SVE@)
set(GGML_ACCELERATE @GGML_ACCELERATE@)
set(GGML_OPENMP @GGML_OPENMP@)
set(GGML_CPU_HBM @GGML_CPU_HBM@)
set(GGML_BLAS_VENDOR @GGML_BLAS_VENDOR@)
set(GGML_CUDA_FORCE_MMQ @GGML_CUDA_FORCE_MMQ@)
set(GGML_CUDA_FORCE_CUBLAS @GGML_CUDA_FORCE_CUBLAS@)
set(GGML_CUDA_F16 @GGML_CUDA_F16@)
set(GGML_CUDA_PEER_MAX_BATCH_SIZE @GGML_CUDA_PEER_MAX_BATCH_SIZE@)
set(GGML_CUDA_NO_PEER_COPY @GGML_CUDA_NO_PEER_COPY@)
set(GGML_CUDA_NO_VMM @GGML_CUDA_NO_VMM@)
set(GGML_CUDA_FA_ALL_QUANTS @GGML_CUDA_FA_ALL_QUANTS@)
set(GGML_CUDA_GRAPHS @GGML_CUDA_GRAPHS@)
set(GGML_HIP_UMA @GGML_HIP_UMA@)
set(GGML_VULKAN_CHECK_RESULTS @GGML_VULKAN_CHECK_RESULTS@)
set(GGML_VULKAN_DEBUG @GGML_VULKAN_DEBUG@)
set(GGML_VULKAN_MEMORY_DEBUG @GGML_VULKAN_MEMORY_DEBUG@)
set(GGML_VULKAN_SHADER_DEBUG_INFO @GGML_VULKAN_SHADER_DEBUG_INFO@)
set(GGML_VULKAN_PERF @GGML_VULKAN_PERF@)
set(GGML_VULKAN_VALIDATE @GGML_VULKAN_VALIDATE@)
set(GGML_VULKAN_RUN_TESTS @GGML_VULKAN_RUN_TESTS@)
set(GGML_METAL_USE_BF16 @GGML_METAL_USE_BF16@)
set(GGML_METAL_NDEBUG @GGML_METAL_NDEBUG@)
set(GGML_METAL_SHADER_DEBUG @GGML_METAL_SHADER_DEBUG@)
set(GGML_METAL_EMBED_LIBRARY @GGML_METAL_EMBED_LIBRARY@)
set(GGML_METAL_MACOSX_VERSION_MIN @GGML_METAL_MACOSX_VERSION_MIN@)
set(GGML_METAL_STD @GGML_METAL_STD@)
set(GGML_SYCL_F16 @GGML_SYCL_F16@)
set(GGML_SYCL_TARGET @GGML_SYCL_TARGET@)
set(GGML_SYCL_DEVICE_ARCH @GGML_SYCL_DEVICE_ARCH@)
@PACKAGE_INIT@
set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")
find_package(ggml REQUIRED HINTS ${LLAMA_LIB_DIR}/cmake)
find_package(Threads REQUIRED)
set(_llama_transient_defines "@GGML_TRANSIENT_DEFINES@")
set(_llama_link_deps "")
set(_llama_link_opts "")
foreach(_ggml_lib ggml ggml-base)
string(REPLACE "-" "_" _ggml_lib_var "${_ggml_lib}_LIBRARY")
find_library(${_ggml_lib_var} ${_ggml_lib}
REQUIRED
HINTS ${LLAMA_LIB_DIR}
NO_CMAKE_FIND_ROOT_PATH
)
list(APPEND _llama_link_deps "${${_ggml_lib_var}}")
message(STATUS "Found ${${_ggml_lib_var}}")
endforeach()
foreach(backend amx blas cann cpu cuda hip kompute metal musa rpc sycl vulkan)
string(TOUPPER "GGML_${backend}" backend_id)
set(_ggml_lib "ggml-${backend}")
string(REPLACE "-" "_" _ggml_lib_var "${_ggml_lib}_LIBRARY")
find_library(${_ggml_lib_var} ${_ggml_lib}
HINTS ${LLAMA_LIB_DIR}
NO_CMAKE_FIND_ROOT_PATH
)
if(${_ggml_lib_var})
list(APPEND _llama_link_deps "${${_ggml_lib_var}}")
set(${backend_id} ON)
message(STATUS "Found backend ${${_ggml_lib_var}}")
else()
set(${backend_id} OFF)
endif()
endforeach()
if (NOT LLAMA_SHARED_LIB)
if (APPLE AND GGML_ACCELERATE)
find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED)
list(APPEND _llama_link_deps ${ACCELERATE_FRAMEWORK})
endif()
if (GGML_OPENMP)
find_package(OpenMP REQUIRED)
list(APPEND _llama_link_deps OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
endif()
if (GGML_CPU_HBM)
find_library(memkind memkind REQUIRED)
list(APPEND _llama_link_deps memkind)
endif()
if (GGML_BLAS)
find_package(BLAS REQUIRED)
list(APPEND _llama_link_deps ${BLAS_LIBRARIES})
list(APPEND _llama_link_opts ${BLAS_LINKER_FLAGS})
endif()
if (GGML_CUDA)
find_package(CUDAToolkit REQUIRED)
endif()
if (GGML_METAL)
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
find_library(METAL_FRAMEWORK Metal REQUIRED)
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
list(APPEND _llama_link_deps ${FOUNDATION_LIBRARY}
${METAL_FRAMEWORK} ${METALKIT_FRAMEWORK})
endif()
if (GGML_VULKAN)
find_package(Vulkan REQUIRED)
list(APPEND _llama_link_deps Vulkan::Vulkan)
endif()
if (GGML_HIP)
find_package(hip REQUIRED)
find_package(hipblas REQUIRED)
find_package(rocblas REQUIRED)
list(APPEND _llama_link_deps hip::host roc::rocblas roc::hipblas)
endif()
if (GGML_SYCL)
find_package(DNNL)
if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
list(APPEND _llama_link_deps DNNL::dnnl)
endif()
if (WIN32)
find_package(IntelSYCL REQUIRED)
find_package(MKL REQUIRED)
list(APPEND _llama_link_deps IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
endif()
endif()
endif()
find_library(llama_LIBRARY llama
REQUIRED
@@ -21,10 +167,12 @@ add_library(llama UNKNOWN IMPORTED)
set_target_properties(llama
PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
INTERFACE_LINK_LIBRARIES "ggml::ggml;ggml::ggml-base;"
INTERFACE_LINK_LIBRARIES "${_llama_link_deps}"
INTERFACE_LINK_OPTIONS "${_llama_link_opts}"
INTERFACE_COMPILE_DEFINITIONS "${_llama_transient_defines}"
IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
IMPORTED_LOCATION "${llama_LIBRARY}"
INTERFACE_COMPILE_FEATURES c_std_90
POSITION_INDEPENDENT_CODE ON)
INTERFACE_COMPILE_FEATURES cxx_std_11
POSITION_INDEPENDENT_CODE ON )
check_required_components(Llama)

View File

@@ -56,9 +56,6 @@ add_library(${TARGET} STATIC
arg.cpp
arg.h
base64.hpp
chat.cpp
chat.hpp
chat-template.hpp
common.cpp
common.h
console.cpp
@@ -67,7 +64,6 @@ add_library(${TARGET} STATIC
json.hpp
log.cpp
log.h
minja.hpp
ngram-cache.cpp
ngram-cache.h
sampling.cpp

View File

@@ -130,27 +130,17 @@ std::string common_arg::to_string() {
static void common_params_handle_model_default(
std::string & model,
const std::string & model_url,
std::string & model_url,
std::string & hf_repo,
std::string & hf_file,
const std::string & hf_token,
const std::string & model_default) {
std::string & hf_file) {
if (!hf_repo.empty()) {
// short-hand to avoid specifying --hf-file -> default it to --model
if (hf_file.empty()) {
if (model.empty()) {
auto auto_detected = common_get_hf_file(hf_repo, hf_token);
if (auto_detected.first.empty() || auto_detected.second.empty()) {
exit(1); // built without CURL, error message already printed
}
hf_repo = auto_detected.first;
hf_file = auto_detected.second;
} else {
hf_file = model;
throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n");
}
}
// make sure model path is present (for caching purposes)
if (model.empty()) {
hf_file = model;
} else if (model.empty()) {
// this is to avoid different repo having same file name, or same file name in different subdirs
std::string filename = hf_repo + "_" + hf_file;
// to make sure we don't have any slashes in the filename
@@ -164,7 +154,7 @@ static void common_params_handle_model_default(
model = fs_get_cache_file(string_split<std::string>(f, '/').back());
}
} else if (model.empty()) {
model = model_default;
model = DEFAULT_MODEL_PATH;
}
}
@@ -300,9 +290,8 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
}
// TODO: refactor model params in a common struct
common_params_handle_model_default(params.model, params.model_url, params.hf_repo, params.hf_file, params.hf_token, DEFAULT_MODEL_PATH);
common_params_handle_model_default(params.speculative.model, params.speculative.model_url, params.speculative.hf_repo, params.speculative.hf_file, params.hf_token, "");
common_params_handle_model_default(params.vocoder.model, params.vocoder.model_url, params.vocoder.hf_repo, params.vocoder.hf_file, params.hf_token, "");
common_params_handle_model_default(params.model, params.model_url, params.hf_repo, params.hf_file);
common_params_handle_model_default(params.vocoder.model, params.vocoder.model_url, params.vocoder.hf_repo, params.vocoder.hf_file);
if (params.escape) {
string_process_escapes(params.prompt);
@@ -325,14 +314,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
throw std::invalid_argument("error: either --embedding or --reranking can be specified, but not both");
}
if (!params.chat_template.empty() && !common_chat_verify_template(params.chat_template, params.use_jinja)) {
throw std::runtime_error(string_format(
"error: the supplied chat template is not supported: %s%s\n",
params.chat_template.c_str(),
params.use_jinja ? "" : "\nnote: llama.cpp was started without --jinja, we only support commonly used templates"
));
}
return true;
}
@@ -386,30 +367,6 @@ static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & val
return devices;
}
static void add_rpc_devices(std::string servers) {
auto rpc_servers = string_split<std::string>(servers, ',');
if (rpc_servers.empty()) {
throw std::invalid_argument("no RPC servers specified");
}
ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
if (!rpc_reg) {
throw std::invalid_argument("failed to find RPC backend");
}
typedef ggml_backend_dev_t (*ggml_backend_rpc_add_device_t)(const char * endpoint);
ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
if (!ggml_backend_rpc_add_device_fn) {
throw std::invalid_argument("failed to find RPC device add function");
}
for (const auto & server : rpc_servers) {
ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
if (dev) {
ggml_backend_device_register(dev);
} else {
throw std::invalid_argument("failed to register RPC device");
}
}
}
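// Illustrative usage of add_rpc_devices (the endpoints are hypothetical): one RPC backend
// device is registered per comma-separated endpoint, e.g.
//   add_rpc_devices("192.168.0.2:50052,192.168.0.3:50052");
// Each endpoint that the RPC backend accepts becomes a ggml device; a failing endpoint throws.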
bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
auto ctx_arg = common_params_parser_init(params, ex, print_usage);
const common_params params_org = ctx_arg.params; // the example can modify the default params
@@ -811,19 +768,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"-cnv", "--conversation"},
"run in conversation mode:\n"
"- does not print special tokens and suffix/prefix\n"
"- interactive mode is also enabled\n"
"(default: auto enabled if chat template is available)",
string_format(
"run in conversation mode:\n"
"- does not print special tokens and suffix/prefix\n"
"- interactive mode is also enabled\n"
"(default: %s)",
params.conversation ? "true" : "false"
),
[](common_params & params) {
params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
add_opt(common_arg(
{"-no-cnv", "--no-conversation"},
"force disable conversation mode (default: false)",
[](common_params & params) {
params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
params.conversation = true;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
add_opt(common_arg(
@@ -877,7 +830,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) {
params.warmup = false;
}
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING}));
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--spm-infill"},
string_format(
@@ -1419,8 +1372,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"--rpc"}, "SERVERS",
"comma separated list of RPC servers",
[](common_params & params, const std::string & value) {
add_rpc_devices(value);
GGML_UNUSED(params);
params.rpc_servers = value;
}
).set_env("LLAMA_ARG_RPC"));
}
@@ -1631,30 +1583,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_env("LLAMA_ARG_MODEL_URL"));
add_opt(common_arg(
{"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]",
"Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"
"example: unsloth/phi-4-GGUF:q4_k_m\n"
"(default: unused)",
{"-hfr", "--hf-repo"}, "REPO",
"Hugging Face model repository (default: unused)",
[](common_params & params, const std::string & value) {
params.hf_repo = value;
}
).set_env("LLAMA_ARG_HF_REPO"));
add_opt(common_arg(
{"-hfd", "-hfrd", "--hf-repo-draft"}, "<user>/<model>[:quant]",
"Same as --hf-repo, but for the draft model (default: unused)",
[](common_params & params, const std::string & value) {
params.speculative.hf_repo = value;
}
).set_env("LLAMA_ARG_HFD_REPO"));
add_opt(common_arg(
{"-hff", "--hf-file"}, "FILE",
"Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)",
"Hugging Face model file (default: unused)",
[](common_params & params, const std::string & value) {
params.hf_file = value;
}
).set_env("LLAMA_ARG_HF_FILE"));
add_opt(common_arg(
{"-hfv", "-hfrv", "--hf-repo-v"}, "<user>/<model>[:quant]",
{"-hfrv", "--hf-repo-v"}, "REPO",
"Hugging Face model repository for the vocoder model (default: unused)",
[](common_params & params, const std::string & value) {
params.vocoder.hf_repo = value;
@@ -1955,44 +1898,24 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--jinja"},
"use jinja template for chat (default: disabled)",
[](common_params & params) {
params.use_jinja = true;
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
add_opt(common_arg(
{"--chat-template"}, "JINJA_TEMPLATE",
string_format(
"set custom jinja chat template (default: template taken from model's metadata)\n"
"if suffix/prefix are specified, template will be disabled\n"
"only commonly used templates are accepted (unless --jinja is set before this flag):\n"
"list of built-in templates:\n%s", list_builtin_chat_templates().c_str()
),
[](common_params & params, const std::string & value) {
if (!common_chat_verify_template(value)) {
throw std::runtime_error(string_format(
"error: the supplied chat template is not supported: %s\n"
"note: llama.cpp does not use jinja parser, we only support commonly used templates\n",
value.c_str()
));
}
params.chat_template = value;
}
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
add_opt(common_arg(
{"--chat-template-file"}, "JINJA_TEMPLATE_FILE",
string_format(
"set custom jinja chat template file (default: template taken from model's metadata)\n"
"if suffix/prefix are specified, template will be disabled\n"
"only commonly used templates are accepted (unless --jinja is set before this flag):\n"
"list of built-in templates:\n%s", list_builtin_chat_templates().c_str()
),
[](common_params & params, const std::string & value) {
std::ifstream file(value);
if (!file) {
throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
}
std::copy(
std::istreambuf_iterator<char>(file),
std::istreambuf_iterator<char>(),
std::back_inserter(params.chat_template));
}
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
add_opt(common_arg(
{"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
@@ -2291,13 +2214,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.vocoder.model = value;
}
).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--tts-use-guide-tokens"},
"Use guide tokens to improve TTS word recall",
[](common_params & params) {
params.vocoder.use_guide_tokens = true;
}
).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
// model-specific
add_opt(common_arg(

View File

@@ -1,366 +0,0 @@
/*
Copyright 2024 Google LLC
Use of this source code is governed by an MIT-style
license that can be found in the LICENSE file or at
https://opensource.org/licenses/MIT.
*/
// SPDX-License-Identifier: MIT
#pragma once
#include "minja.hpp"
#include <json.hpp>
#include <string>
#include <vector>
using json = nlohmann::ordered_json;
namespace minja {
struct chat_template_caps {
bool supports_tools = false;
bool supports_tool_calls = false;
bool supports_tool_responses = false;
bool supports_system_role = false;
bool supports_parallel_tool_calls = false;
bool supports_tool_call_id = false;
// meta-llama/Llama-3.1-8B-Instruct expects arguments to be an object.
// Most other templates (and OpenAI's API) expect the arguments object to be stringified.
bool requires_object_arguments = false;
// CohereForAI/c4ai-command-r-plus simple variant
bool requires_non_null_content = false;
// MiniMaxAI/MiniMax-Text-01 special
bool requires_typed_content = false;
};
class chat_template {
private:
chat_template_caps caps_;
std::string source_;
std::string bos_token_;
std::string eos_token_;
std::shared_ptr<minja::TemplateNode> template_root_;
std::string try_raw_render(
const nlohmann::ordered_json & messages,
const nlohmann::ordered_json & tools,
bool add_generation_prompt,
const nlohmann::ordered_json & extra_context = nlohmann::ordered_json()) const
{
try {
auto prompt = apply(messages, tools, add_generation_prompt, extra_context, /* adjust_inputs= */ false);
// fprintf(stderr, "try_raw_render: %s\n", prompt.c_str());
return prompt;
} catch (const std::exception & e) {
// fprintf(stderr, "try_raw_render error: %s\n", e.what());
return "";
}
}
public:
chat_template(const std::string & source, const std::string & bos_token, const std::string & eos_token)
: source_(source), bos_token_(bos_token), eos_token_(eos_token)
{
template_root_ = minja::Parser::parse(source_, {
/* .trim_blocks = */ true,
/* .lstrip_blocks = */ true,
/* .keep_trailing_newline = */ false,
});
auto contains = [](const std::string & haystack, const std::string & needle) {
return haystack.find(needle) != std::string::npos;
};
const std::string user_needle = "<User Needle>";
const std::string sys_needle = "<System Needle>";
const json dummy_str_user_msg = {{"role", "user"}, {"content", user_needle}};
const json dummy_typed_user_msg = {{"role", "user"}, {"content", json::array({{{"type", "text"}, {"text", user_needle}}})}};
caps_.requires_typed_content =
!contains(try_raw_render(json::array({dummy_str_user_msg}), {}, false), user_needle)
&& contains(try_raw_render(json::array({dummy_typed_user_msg}), {}, false), user_needle);
const auto dummy_user_msg = caps_.requires_typed_content
? dummy_typed_user_msg
: dummy_str_user_msg;
const json needle_system_msg = {
{"role", "system"},
{"content", caps_.requires_typed_content ? json::array({{{"type", "text"}, {"text", sys_needle}}}) : json(sys_needle)},
};
caps_.supports_system_role = contains(try_raw_render({needle_system_msg, dummy_user_msg,}, {}, false), sys_needle);
auto out = try_raw_render(json::array({
dummy_user_msg
}), json::array({
{
{"name", "some_tool"},
{"type", "function"},
{"function", {
{"name", "some_tool"},
{"description", "Some tool."},
{"parameters", {
{"type", "object"},
{"properties", {
{"arg", {
{"type", "string"},
{"description", "Some argument."},
}},
}},
{"required", json::array({ "arg" })},
}},
}},
},
}), false);
caps_.supports_tools = contains(out, "some_tool");
auto make_tool_calls_msg = [&](const json & tool_calls) {
return json {
{"role", "assistant"},
{"content", nullptr},
{"tool_calls", tool_calls},
};
};
auto make_tool_call = [](const std::string & tool_name, const json & arguments) {
return json {
{"id", "call_1___"},
{"type", "function"},
{"function", {
{"arguments", arguments},
{"name", tool_name},
}},
};
};
const json dummy_args_obj {{"argument_needle", "print('Hello, World!')"}};
// Note: the arguments are rendered in both cases, but may be double-escaped, which we don't want.
out = try_raw_render(json::array({
dummy_user_msg,
make_tool_calls_msg(json::array({make_tool_call("ipython", dummy_args_obj.dump())})),
}), {}, false);
auto tool_call_renders_str_arguments = contains(out, "\"argument_needle\":") || contains(out, "'argument_needle':");
out = try_raw_render(json::array({
dummy_user_msg,
make_tool_calls_msg(json::array({make_tool_call("ipython", dummy_args_obj)})),
}), {}, false);
auto tool_call_renders_obj_arguments = contains(out, "\"argument_needle\":") || contains(out, "'argument_needle':");
caps_.supports_tool_calls = tool_call_renders_str_arguments || tool_call_renders_obj_arguments;
caps_.requires_object_arguments = !tool_call_renders_str_arguments && tool_call_renders_obj_arguments;
auto out_empty = try_raw_render(json::array({dummy_user_msg, {{"role", "assistant"}, {"content", ""}}}), {}, false);
auto out_null = try_raw_render(json::array({dummy_user_msg, {{"role", "assistant"}, {"content", nullptr}}}), {}, false);
caps_.requires_non_null_content = contains(out_empty, user_needle) && !contains(out_null, user_needle);
if (caps_.supports_tool_calls) {
auto dummy_args = caps_.requires_object_arguments ? dummy_args_obj : json(dummy_args_obj.dump());
auto tc1 = make_tool_call("test_tool1", dummy_args);
auto tc2 = make_tool_call("test_tool2", dummy_args);
auto out = try_raw_render(json::array({
dummy_user_msg,
make_tool_calls_msg(json::array({tc1, tc2})),
}), {}, false);
caps_.supports_parallel_tool_calls = contains(out, "test_tool1") && contains(out, "test_tool2");
out = try_raw_render(json::array({
dummy_user_msg,
make_tool_calls_msg(json::array({tc1})),
{
{"role", "tool"},
{"name", "test_tool1"},
{"content", "Some response!"},
{"tool_call_id", "call_911_"},
}
}), {}, false);
caps_.supports_tool_responses = contains(out, "Some response!");
caps_.supports_tool_call_id = contains(out, "call_911_");
}
}
const std::string & source() const { return source_; }
const std::string & bos_token() const { return bos_token_; }
const std::string & eos_token() const { return eos_token_; }
const chat_template_caps & original_caps() const { return caps_; }
std::string apply(
const nlohmann::ordered_json & messages,
const nlohmann::ordered_json & tools,
bool add_generation_prompt,
const nlohmann::ordered_json & extra_context = nlohmann::ordered_json(),
bool adjust_inputs = true) const
{
json actual_messages;
auto needs_adjustments = adjust_inputs && (false
|| !caps_.supports_system_role
|| !caps_.supports_tools
|| !caps_.supports_tool_responses
|| !caps_.supports_tool_calls
|| caps_.requires_object_arguments
|| caps_.requires_typed_content
);
if (needs_adjustments) {
actual_messages = json::array();
auto add_message = [&](const json & msg) {
if (caps_.requires_typed_content && msg.contains("content") && !msg.at("content").is_null() && msg.at("content").is_string()) {
actual_messages.push_back({
{"role", msg.at("role")},
{"content", {{
{"type", "text"},
{"text", msg.at("content")},
}}},
});
} else {
actual_messages.push_back(msg);
}
};
std::string pending_system;
auto flush_sys = [&]() {
if (!pending_system.empty()) {
add_message({
{"role", "user"},
{"content", pending_system},
});
pending_system.clear();
}
};
auto needs_tools_in_system = !tools.is_null() && tools.size() > 0 && !caps_.supports_tools;
for (const auto & message_ : needs_tools_in_system ? add_system(messages, "Available tools: " + tools.dump(2)) : messages) {
auto message = message_;
if (!message.contains("role") || !message.contains("content")) {
throw std::runtime_error("message must have 'role' and 'content' fields: " + message.dump());
}
std::string role = message.at("role");
if (message.contains("tool_calls")) {
if (caps_.requires_object_arguments || !caps_.supports_tool_calls) {
for (auto & tool_call : message.at("tool_calls")) {
if (tool_call["type"] == "function") {
auto & function = tool_call.at("function");
auto & arguments = function.at("arguments");
if (arguments.is_string()) {
try {
arguments = json::parse(arguments.get<std::string>());
} catch (const std::exception & ecvt) {
fprintf(stderr, "Failed to parse arguments: %s\n", ecvt.what());
}
}
}
}
}
if (!caps_.supports_tool_calls) {
auto content = message.at("content");
auto tool_calls = json::array();
for (const auto & tool_call : message.at("tool_calls")) {
if (tool_call.at("type") != "function") {
continue;
}
const auto & function = tool_call.at("function");
auto tc = json {
{"name", function.at("name")},
{"arguments", function.at("arguments")},
};
if (tool_call.contains("id")) {
tc["id"] = tool_call["id"];
}
tool_calls.push_back(tc);
}
auto obj = json {
{"tool_calls", tool_calls},
};
if (!content.is_null() && content != "") {
obj["content"] = content;
}
message["content"] = obj.dump(2);
message.erase("tool_calls");
}
}
if (!caps_.supports_tool_responses && role == "tool") {
message["role"] = "user";
auto obj = json {
{"tool_response", {
{"tool", message.at("name")},
{"content", message.at("content")},
}},
};
if (message.contains("tool_call_id")) {
obj["tool_response"]["tool_call_id"] = message.at("tool_call_id");
}
message["content"] = obj.dump(2);
message.erase("name");
}
if (!message["content"].is_null() && !caps_.supports_system_role) {
std::string content = message.at("content");
if (role == "system") {
if (!pending_system.empty()) pending_system += "\n";
pending_system += content;
continue;
} else {
if (role == "user") {
if (!pending_system.empty()) {
message["content"] = pending_system + (content.empty() ? "" : "\n" + content);
pending_system.clear();
}
} else {
flush_sys();
}
}
}
add_message(message);
}
if (!caps_.supports_system_role) {
flush_sys();
}
} else {
actual_messages = messages;
}
auto context = minja::Context::make(json({
{"messages", actual_messages},
{"add_generation_prompt", add_generation_prompt},
{"bos_token", bos_token_},
{"eos_token", eos_token_},
}));
if (!tools.is_null()) {
auto tools_val = minja::Value(tools);
context->set("tools", tools_val);
}
if (!extra_context.is_null()) {
for (auto & kv : extra_context.items()) {
minja::Value val(kv.value());
context->set(kv.key(), val);
}
}
auto ret = template_root_->render(context);
// fprintf(stderr, "actual_messages: %s\n", actual_messages.dump(2).c_str());
// fprintf(stderr, "apply: %s\n\n", ret.c_str());
return ret;
}
static nlohmann::ordered_json add_system(const nlohmann::ordered_json & messages, const std::string & system_prompt) {
json messages_with_system = messages;
if (messages_with_system.size() > 0 && messages_with_system[0].at("role") == "system") {
std::string existing_system = messages_with_system.at(0).at("content");
messages_with_system[0] = json {
{"role", "system"},
{"content", existing_system + "\n" + system_prompt},
};
} else {
messages_with_system.insert(messages_with_system.begin(), json {
{"role", "system"},
{"content", system_prompt},
});
}
return messages_with_system;
}
};
} // namespace minja
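// Minimal usage sketch for minja::chat_template (the template string below is an assumption —
// a ChatML-style example; any template taken from a model's metadata works the same way):
//   minja::chat_template tmpl(
//       "{% for message in messages %}<|im_start|>{{ message.role }}\n{{ message.content }}<|im_end|>\n{% endfor %}"
//       "{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
//       /* bos_token= */ "", /* eos_token= */ "");
//   auto prompt = tmpl.apply(
//       json::array({{{"role", "user"}, {"content", "Hi"}}}),
//       /* tools= */ json(), /* add_generation_prompt= */ true);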

View File

@@ -1,848 +0,0 @@
#include "chat.hpp"
#include "chat-template.hpp"
#include "json-schema-to-grammar.h"
#include "log.h"
#include "minja.hpp"
std::string common_chat_format_name(common_chat_format format) {
switch (format) {
case COMMON_CHAT_FORMAT_CONTENT_ONLY: return "Content-only";
case COMMON_CHAT_FORMAT_GENERIC: return "Generic";
case COMMON_CHAT_FORMAT_MISTRAL_NEMO: return "Mistral Nemo";
case COMMON_CHAT_FORMAT_LLAMA_3_X: return "Llama 3.x";
case COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS: return "Llama 3.x with builtin tools";
case COMMON_CHAT_FORMAT_DEEPSEEK_R1: return "DeepSeek R1";
case COMMON_CHAT_FORMAT_FIREFUNCTION_V2: return "FireFunction v2";
case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2: return "Functionary v3.2";
case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
default:
throw std::runtime_error("Unknown chat format");
}
}
const common_grammar_options grammar_options {
/* .dotall = */ false,
/* .compact_spaces = */ false,
// /* .compact_spaces = */ true,
};
static bool parse_json(std::string::const_iterator & it, const std::string::const_iterator & end, json & out) {
// // https://json.nlohmann.me/features/parsing/sax_interface/
struct json_error_locator : public nlohmann::json_sax<json> {
std::size_t position;
bool found_error;
json_error_locator() : position(0), found_error(false) {}
bool parse_error(std::size_t position, const std::string &, const json::exception &) override {
this->position = position - 1;
this->found_error = true;
return false;
}
bool null() override { return true; }
bool boolean(bool) override { return true; }
bool number_integer(number_integer_t) override { return true; }
bool number_unsigned(number_unsigned_t) override { return true; }
bool number_float(number_float_t, const string_t &) override { return true; }
bool string(string_t &) override { return true; }
bool binary(binary_t &) override { return true; }
bool start_object(std::size_t) override { return true; }
bool key(string_t &) override { return true; }
bool end_object() override { return true; }
bool start_array(std::size_t) override { return true; }
bool end_array() override { return true; }
};
json_error_locator err_loc;
json::sax_parse(it, end, &err_loc);
std::string::const_iterator temptative_end;
if (err_loc.found_error) {
temptative_end = it + err_loc.position;
} else {
temptative_end = end;
}
std::string json_sub {it, temptative_end};
try {
out = json::parse(json_sub);
it = temptative_end;
return true;
} catch (const std::exception &) {
return false;
}
}
/**
* Takes a prefix regex that must have 1 group to capture the function name, a closing suffix, and expects json parameters in between.
* Aggregates the prefix, suffix and in-between text into the content.
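 * Illustrative example (tool name and arguments are hypothetical): with a function regex that
 * captures the name from `{"name": "get_weather", "parameters": ` and a close regex of `\}`,
 * the input
 *   Checking. {"name": "get_weather", "parameters": {"location": "Paris"}}
 * yields content "Checking. " plus one tool call whose arguments are the serialized object
 * {"location": "Paris"}.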
*/
static common_chat_msg parse_json_tool_calls(
const std::string& input,
const std::optional<std::regex> & trigger_opt,
const std::regex & function_regex,
const std::regex & close_regex) {
std::smatch match;
common_chat_msg result;
result.role = "assistant";
auto end = input.end();
auto it = input.begin();
if (trigger_opt) {
if (!std::regex_search(it, end, match, *trigger_opt)) {
result.content = input;
return result;
}
result.content = match.prefix().str();
it = match.suffix().first;
}
while (it != end) {
std::sregex_iterator rend;
std::sregex_iterator rit(it, end, function_regex);
if (rit == rend) {
fprintf(stderr, "No more tool calls found\n");
result.content += std::string(it, end);
break;
}
auto name = rit->str(1);
result.content += std::string(it, rit->prefix().second);
it = rit->suffix().first;
json arguments;
if (!parse_json(it, end, arguments)) {
throw std::runtime_error("Failed to parse json tool call arguments");
}
if (!std::regex_search(it, end, match, close_regex)) {
throw std::runtime_error("Malformed input, missing closing pattern");
}
it = match.suffix().first;
result.tool_calls.push_back({name, arguments.is_string() ? arguments.get<std::string>() : arguments.dump(), /* id= */ ""});
}
return result;
}
static common_chat_msg parse_prefixed_json_tool_call_array(const std::string& input, const std::string & prefix, size_t rstrip_prefix = 0) {
auto content_end = input.find(prefix);
size_t tc_start = std::string::npos;
common_chat_msg result;
result.role = "assistant";
const auto process_tool_calls = [&](const json & tool_calls) {
for (const auto & tool_call : tool_calls) {
const auto & arguments = tool_call["arguments"];
result.tool_calls.push_back({
tool_call["name"],
arguments.is_string() ? arguments.get<std::string>() : arguments.dump(),
tool_call.contains("id") ? tool_call["id"] : "",
});
}
};
if (content_end == std::string::npos) {
result.content = input;
} else {
tc_start = content_end + prefix.size() - rstrip_prefix;
result.content = input.substr(0, content_end);
auto tool_calls = json::parse(input.substr(tc_start));
process_tool_calls(tool_calls);
}
return result;
}
static void foreach_function(const json & tools, const std::function<void(const json &)> & fn) {
for (const auto & tool : tools) {
if (!tool.contains("type") || tool["type"] != "function" || !tool.contains("function")) {
LOG_INF("Skipping tool without function: %s", tool.dump(2).c_str());
continue;
}
fn(tool);
}
}
static common_chat_params common_chat_params_init_generic(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
common_chat_params data;
auto tool_call_schemas = json::array();
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool["function"];
auto tool_schema = json {
{"type", "object"},
{"properties", {
{"name", {
{"type", "string"},
{"const", function["name"]},
}},
{"arguments", function["parameters"]},
}},
{"required", json::array({"name", "arguments"})},
};
if (function.contains("description")) {
tool_schema["description"] = function["description"];
}
if (inputs.parallel_tool_calls) {
tool_schema["properties"]["id"] = {
{"type", "string"},
{"minLength", 4},
};
tool_schema["required"].push_back("id");
}
tool_call_schemas.emplace_back(tool_schema);
});
const auto tool_call =
inputs.parallel_tool_calls
? json {
{"type", "object"},
{"properties", {
{"tool_calls", {
{"type", "array"},
{"items", tool_call_schemas.size() == 1 ? tool_call_schemas[0] : json {
{"anyOf", tool_call_schemas},
}},
{"minItems", 1},
}},
}},
{"required", json::array({"tool_calls"})},
}
: json {
{"type", "object"},
{"properties", {
{"tool_call", tool_call_schemas.size() == 1 ? tool_call_schemas[0] : json {
{"anyOf", tool_call_schemas},
}},
}},
{"required", json::array({"tool_call"})},
};
const auto schema =
inputs.tool_choice != "required"
? json {
{"anyOf", json::array({
tool_call,
{
{"type", "object"},
{"properties", {
{"response", inputs.json_schema.is_null()
? json {{"type", "string"}}
: inputs.json_schema
},
}},
{"required", json::array({"response"})},
},
})}
}
: tool_call;
data.grammar_lazy = false;
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
builder.add_schema("root", schema);
}, grammar_options);
auto tweaked_messages = common_chat_template::add_system(
inputs.messages,
"Respond in JSON format, either with `tool_call` (a request to call tools) or with `response` reply to the user's request");
data.prompt = tmpl.apply(tweaked_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
data.format = COMMON_CHAT_FORMAT_GENERIC;
return data;
}
static common_chat_msg common_chat_parse_generic(const std::string & input) {
json data = json::parse(input);
common_chat_msg result;
result.role = "assistant";
if (data.contains("tool_calls")) {
for (const auto & tool_call : data["tool_calls"]) {
result.tool_calls.push_back({
tool_call["name"],
tool_call["arguments"].dump(),
tool_call.contains("id") ? tool_call["id"] : "",
});
}
} else if (data.contains("tool_call")) {
result.tool_calls.push_back({
data["tool_call"]["name"],
data["tool_call"]["arguments"].dump(),
/* id= */ "",
});
} else if (data.contains("response")) {
const auto & response = data["response"];
result.content = response.is_string() ? response.get<std::string>() : response.dump(2);
}
return result;
}
static common_chat_params common_chat_params_init_mistral_nemo(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
common_chat_params data;
data.grammar_lazy = inputs.tool_choice != "required";
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
auto schemas = json::array();
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool["function"];
schemas.push_back({
{"type", "object"},
{"properties", {
// Important note: the model is probably trained to take a JSON stringified arguments value.
// It's hard to constrain that for now (while reusing the JSON schema conversion), so we're just expecting a plain object.
{"name", {
{"type", "string"},
{"const", function["name"]},
}},
{"arguments", function["parameters"]},
{"id", {
{"type", "string"},
// Nemo's template expects a 9-character alphanumeric ID.
{"pattern", "^[a-zA-Z0-9]{9}$"},
}},
}},
{"required", json::array({"name", "arguments", "id"})},
});
});
auto schema = json {
{"type", "array"},
{"items", schemas.size() == 1 ? schemas[0] : json {{"anyOf", schemas}}},
{"minItems", 1},
};
if (!inputs.parallel_tool_calls) {
schema["maxItems"] = 1;
}
builder.add_rule("root", "\"[TOOL_CALLS]\" " + builder.add_schema("tool_calls", schema));
}, grammar_options);
data.grammar_triggers.push_back({"[TOOL_CALLS]", /* .at_start = */ true});
data.prompt = tmpl.apply(inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
data.format = COMMON_CHAT_FORMAT_MISTRAL_NEMO;
return data;
}
static common_chat_msg common_chat_parse_mistral_nemo(const std::string & input) {
return parse_prefixed_json_tool_call_array(input, "[TOOL_CALLS]");
}
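// Illustrative input for the Mistral Nemo parser (tool name/arguments are hypothetical):
//   [TOOL_CALLS][{"name": "get_weather", "arguments": {"location": "Paris"}, "id": "abc123def"}]
// Everything before "[TOOL_CALLS]" becomes content; the JSON array after it becomes tool_calls,
// with each "arguments" object re-serialized to a string.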
static void expect_tool_parameters(const std::string & name, const json & parameters, const std::vector<std::string> & expected_properties) {
if (!parameters.is_object() || !parameters.contains("type") || parameters["type"] != "object" || !parameters.contains("properties") || !parameters.contains("required")) {
throw std::runtime_error("Parameters of tool " + name + " must be an object w/ required properties");
}
const auto & parameters_properties = parameters.at("properties");
const auto & parameters_required = parameters.at("required");
for (const auto & prop : expected_properties) {
if (!parameters_properties.contains(prop)) {
throw std::runtime_error("Parameters of tool " + name + " is missing property: " + prop);
}
if (std::find(parameters_required.begin(), parameters_required.end(), json(prop)) == parameters_required.end()) {
throw std::runtime_error("Parameters of tool " + name + " must have property marked as required: " + prop);
}
}
if (parameters_properties.size() != expected_properties.size()) {
throw std::runtime_error("Parameters of tool " + name + " must only have these properties:" + string_join(expected_properties, ", "));
}
}
static common_chat_params common_chat_params_init_llama_3_1_tool_calls(const common_chat_template & tmpl, const struct common_chat_inputs & inputs, bool allow_python_tag_builtin_tools) {
auto builtin_tools = json::array();
common_chat_params data;
data.grammar_lazy = inputs.tool_choice != "required";
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
std::vector<std::string> tool_rules;
auto handle_builtin_tool = [&](const std::string & name, const json & parameters) {
if (name == "wolfram_alpha") {
// https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/tool_runtime/wolfram_alpha/wolfram_alpha.py
expect_tool_parameters(name, parameters, {"query"});
} else if (name == "web_search" || name == "brave_search") {
// https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/tool_runtime/brave_search/brave_search.py
expect_tool_parameters(name, parameters, {"query"});
} else if (name == "python" || name == "code_interpreter") {
// https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/inline/tool_runtime/code_interpreter/code_interpreter.py
expect_tool_parameters(name, parameters, {"code"});
} else {
return false;
}
std::vector<std::string> kvs;
for (const auto & [key, value] : parameters.at("properties").items()) {
kvs.push_back("\"" + key + "=\" " + builder.add_schema(name + "-args-" + key, value));
}
tool_rules.push_back(
builder.add_rule(
name + "-call",
"\"<|python_tag|>" + name + ".call(\" " + string_join(kvs, " \", \" ") + " \")\""));
builtin_tools.push_back(name);
return true;
};
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool["function"];
std::string name = function["name"];
auto parameters = function["parameters"];
builder.resolve_refs(parameters);
// https://github.com/meta-llama/llama-stack/tree/main/llama_stack/providers/remote/tool_runtime
if (allow_python_tag_builtin_tools) {
handle_builtin_tool(name, parameters);
}
tool_rules.push_back(
builder.add_rule(
name + "-call",
"\"{\" ( \"\\\"type\\\": \\\"function\\\", \" | space ) "
"\"\\\"name\\\": \\\"" + name + "\\\", \\\"parameters\\\": \" " +
builder.add_schema(name + "-args", parameters) +
" \"}\""));
data.grammar_triggers.push_back({"{\"name\": \"" + name + "\"", /* .at_start = */ true});
});
data.grammar_triggers.push_back({"{\"name\":", /* .at_start = */ true});
data.grammar_triggers.push_back({"{\"type\": \"function\"", /* .at_start = */ true});
if (!builtin_tools.empty()) {
data.grammar_triggers.push_back({"<|python_tag|>", /* .at_start = */ false});
}
builder.add_rule("root", string_join(tool_rules, " | "));
}, grammar_options);
data.additional_stops.push_back("<|eom_id|>");
data.prompt = tmpl.apply(inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {
{"tools_in_user_message", false},
{"builtin_tools", builtin_tools.empty() ? json() : builtin_tools},
});
data.format = allow_python_tag_builtin_tools && !builtin_tools.empty()
? COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS
: COMMON_CHAT_FORMAT_LLAMA_3_X;
return data;
}
static common_chat_msg common_chat_parse_llama_3_1(const std::string & input, bool with_builtin_tools = false) {
// TODO: tighten & simplify the parser, don't accept leading text context.
static std::regex function_regex("\\{[\\s\\n\\r]*(?:\"type\"[\\s\\n\\r]*:[\\s\\n\\r]*\"function\"[\\s\\n\\r]*,[\\s\\n\\r]*|[\\s\\n\\r]*)\"name\"[\\s\\n\\r]*:[\\s\\n\\r]*\"([^\"]+)\"[\\s\\n\\r]*,[\\s\\n\\r]*\"parameters\": ");
static std::regex close_regex("\\}");
static std::regex builtin_call_regex("<\\|python_tag\\|>([^.(]+)\\.call\\((.*)\\)");
if (with_builtin_tools) {
std::smatch match;
if (std::regex_match(input, match, builtin_call_regex)) {
auto name = match[1].str();
auto raw_args = match[2].str();
// TODO: if/when builtin tools start accepting more than 1 argument, use parse_json for real parsing.
auto it_eq = raw_args.find('=');
auto arg_name = raw_args.substr(0, it_eq);
auto arg_value_str = raw_args.substr(it_eq + 1);
auto arg_value = json::parse(arg_value_str);
return {
/* .role = */ "assistant",
/* .content = */ match.prefix().str(),
/* .tool_calls = */ {
{
/* .name = */ match[1],
/* .arguments = */ (json {
{arg_name, arg_value},
}).dump(),
/* .id = */ "",
},
},
};
}
}
return parse_json_tool_calls(input, std::nullopt, function_regex, close_regex);
}
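// Illustrative builtin-tool input (the query value is hypothetical), handled by builtin_call_regex
// when builtin tools are enabled:
//   <|python_tag|>brave_search.call(query="latest llama.cpp release")
// parses into a single "brave_search" tool call with arguments {"query": "latest llama.cpp release"};
// plain {"name": ..., "parameters": {...}} outputs go through parse_json_tool_calls instead.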
static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
common_chat_params data;
data.grammar_lazy = inputs.tool_choice != "required";
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
std::vector<std::string> tool_rules;
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool["function"];
std::string name = function["name"];
auto parameters = function["parameters"];
auto args_rule = builder.add_schema(name + "-args", parameters);
tool_rules.push_back(builder.add_rule(name + "-call",
"\"<tool▁call▁begin>function<tool▁sep>" + name + "\\n```json\\n\" " + args_rule + " \"```<tool▁call▁end>\""));
});
data.grammar_triggers.push_back({"<tool▁calls▁begin>", /* .at_start = */ false});
builder.add_rule("root", "\"<tool▁calls▁begin>\" (" + string_join(tool_rules, " | ") + ")" + (inputs.parallel_tool_calls ? "*" : "") + " space");
}, grammar_options);
data.prompt = tmpl.apply(inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
data.format = COMMON_CHAT_FORMAT_DEEPSEEK_R1;
return data;
}
static common_chat_msg common_chat_parse_deepseek_r1(const std::string & input) {
static std::regex trigger_regex("<tool▁calls▁begin>");
static std::regex function_regex("<tool▁call▁begin>function<tool▁sep>([^\n]+)\n```json\n");
static std::regex close_regex("```<tool▁call▁end>");
return parse_json_tool_calls(input, trigger_regex, function_regex, close_regex);
}
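// Illustrative DeepSeek R1 tool-call output (tool name/arguments hypothetical), matching the
// trigger/function/close regexes above:
//   <tool▁calls▁begin><tool▁call▁begin>function<tool▁sep>get_weather
//   ```json
//   {"location": "Paris"}```<tool▁call▁end>
// Text before the trigger becomes content; each call yields {name, arguments-as-string}.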
static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
fprintf(stderr, "%s\n", __func__);
common_chat_params data;
data.prompt = tmpl.apply(inputs.messages, /* tools= */ nullptr, inputs.add_generation_prompt, {
{"datetime", "Jan 29 2025 13:00:00 GMT"},
{"functions", json(inputs.tools.empty() ? "" : inputs.tools.dump(2))},
}, /* adjust_inputs= */ false);
if (!inputs.tools.is_null() && !inputs.tools.empty()) {
data.grammar_lazy = inputs.tool_choice != "required";
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
auto schemas = json::array();
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool["function"];
schemas.push_back({
{"type", "object"},
{"properties", {
{"name", {
{"type", "string"},
{"const", function["name"]},
}},
{"arguments", function["parameters"]},
}},
{"required", json::array({"name", "arguments", "id"})},
});
});
auto schema = json {
{"type", "array"},
{"items", schemas.size() == 1 ? schemas[0] : json {{"anyOf", schemas}}},
{"minItems", 1},
};
if (!inputs.parallel_tool_calls) {
schema["maxItems"] = 1;
}
builder.add_rule("root", "\" functools\"? " + builder.add_schema("tool_calls", schema));
}, grammar_options);
data.grammar_triggers.push_back({" functools[", /* .at_start = */ false});
data.format = COMMON_CHAT_FORMAT_FIREFUNCTION_V2;
} else {
data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
}
return data;
}
static common_chat_msg common_chat_parse_firefunction_v2(const std::string & input) {
return parse_prefixed_json_tool_call_array(input, " functools[", /* rstrip_prefix= */ 1);
}
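// Illustrative FireFunction v2 output (tool name/arguments hypothetical):
//   Sure. functools[{"name": "get_weather", "arguments": {"location": "Paris"}}]
// "Sure." becomes content; rstrip_prefix=1 keeps the '[' so the remainder parses as a JSON array.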
static common_chat_params common_chat_params_init_functionary_v3_2(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
// >>>all\nlet's call functions>>>fn1\n{"arg1": 1...}\n>>>fn2\n{"arg1": 1...}...
// Using ">>>f1\n", ">>>f2\n"... as trigger words for the grammar
common_chat_params data;
data.prompt = tmpl.apply(inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
data.format = COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2;
if (!inputs.tools.is_null() && !inputs.tools.empty()) {
data.grammar_lazy = inputs.tool_choice != "required";
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
std::vector<std::string> first_tool_rules;
std::vector<std::string> subsequent_tool_rules;
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool["function"];
std::string name = function["name"];
auto parameters = function["parameters"];
auto args_rule = builder.add_schema(name + "-args", parameters);
first_tool_rules.push_back(builder.add_rule(name + "-call", "\"" + name + "\\n\" " + args_rule));
subsequent_tool_rules.push_back(builder.add_rule(name + "-call2", "\">>>" + name + "\\n\" " + args_rule));
data.grammar_triggers.push_back({name, /* .at_start = */ true});
data.grammar_triggers.push_back({">>>" + name, /* .at_start = */ false});
});
auto first_rule = first_tool_rules.empty() ? "" : builder.add_rule("first_tool_call", string_join(first_tool_rules, " | ")) + " space";
if (inputs.parallel_tool_calls) {
auto subsequent_rule = builder.add_rule("subsequent_tool_call", string_join(subsequent_tool_rules, " | ")) + " space";
builder.add_rule("root", first_rule + " (" + subsequent_rule + ")*");
} else {
builder.add_rule("root", first_rule);
}
}, grammar_options);
}
return data;
}
static bool consume(std::string::const_iterator & it, const std::string::const_iterator & end, const std::string & expected) {
auto expected_it = expected.begin();
auto tmp_it = it;
while (tmp_it != end && expected_it != expected.end() && *tmp_it == *expected_it) {
++tmp_it;
++expected_it;
}
if (expected_it == expected.end()) {
it = tmp_it;
return true;
}
return false;
}
static common_chat_msg common_chat_parse_functionary_v3_2(const std::string & input) {
static std::regex function_regex(R"((?:>>>)?(\w+)\n)");
static std::regex close_regex(R"($|(?=>>>))");
std::string content;
auto it = input.begin();
const auto end = input.end();
if (consume(it, end, "all\n")) {
std::smatch match;
if (std::regex_search(it, end, match, function_regex)) {
auto fun_it = match.prefix().second;
content = std::string(it, fun_it);
it = fun_it;
} else {
common_chat_msg res;
res.role = "assistant";
res.content = std::string(it, end);
return res;
}
}
// TODO: tighten & simplify.
auto res = parse_json_tool_calls(std::string(it, end), std::nullopt, function_regex, close_regex);
res.content = content;
return res;
}
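// Illustrative Functionary v3.2 output (names/arguments hypothetical):
//   all
//   Let me check the weather.>>>get_weather
//   {"location": "Paris"}
// The text after "all\n" up to the first ">>>" becomes content; each ">>>name\n{json}" segment
// becomes a tool call.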
static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
// https://github.com/MeetKai/functionary/blob/main/tests/prompt_test_v3-llama3.1.txt
common_chat_params data;
json tools = inputs.tools.is_null() ? inputs.tools : json::array();
std::string python_code_argument_name;
auto has_raw_python = false;
data.grammar_lazy = inputs.tool_choice != "required";
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
std::vector<std::string> tool_rules;
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool["function"];
const auto & parameters = function["parameters"];
std::string name = function["name"];
if (name == "python" || name == "ipython") {
if (!parameters.contains("type")) {
throw std::runtime_error("Missing type in python tool");
}
has_raw_python = true;
auto type = parameters.at("type");
if (type == "object") {
auto properties = parameters.at("properties");
for (auto it = properties.begin(); it != properties.end(); ++it) {
if (it.value().at("type") == "string") {
if (!python_code_argument_name.empty()) {
throw std::runtime_error("Multiple string arguments found in python tool");
}
python_code_argument_name = it.key();
}
}
if (python_code_argument_name.empty()) {
throw std::runtime_error("No string argument found in python tool");
}
} else if (type != "string") {
throw std::runtime_error("Invalid type in python tool: " + type.dump());
}
}
tool_rules.push_back(builder.add_rule(name + "-call", "\"<function=" + name + ">\" " + builder.add_schema(name + "-args", parameters) + " \"</function>\" space"));
});
if (has_raw_python) {
tool_rules.push_back(builder.add_rule("python-call", "\"<|python_tag|>\" .*"));
data.grammar_triggers.push_back({"<|python_tag|>", /* .at_start = */ false});
}
auto tool_call = builder.add_rule("tool_call", string_join(tool_rules, " | ")) + " space";
builder.add_rule("root", inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call);
data.grammar_triggers.push_back({"<function=", /* .at_start = */ false});
}, grammar_options);
data.prompt = tmpl.apply(inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
// TODO: if (has_raw_python)
data.format = COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1;
return data;
}
static common_chat_msg common_chat_parse_functionary_v3_1_llama_3_1(const std::string & input) {
// This version of Functionary still supports the llama 3.1 tool call format for the python tool.
static std::regex python_tag_regex(R"(<\|python_tag\|>([\s\S\n]*)$)");
std::smatch match;
if (std::regex_search(input, match, python_tag_regex)) {
auto code = match[1].str();
return {
/* .role = */ "assistant",
/* .content = */ match.prefix().str(),
/* .tool_calls = */ {
{
/* .name = */ "python",
/* .arguments = */ (json {{"code", code}}).dump(),
/* .id = */ "",
},
}
};
}
static std::regex function_regex(R"(<function=(\w+)>)");
static std::regex close_regex(R"(</function>)");
// TODO: tighten & simplify.
return parse_json_tool_calls(input, std::nullopt, function_regex, close_regex);
}
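// Illustrative inputs for this parser (names/arguments hypothetical):
//   <function=get_weather>{"location": "Paris"}</function>   -> one "get_weather" tool call
//   <|python_tag|>print("hello")                              -> a "python" call with {"code": "print(\"hello\")"}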
static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
common_chat_params data;
// (content)?(<tool_call>{"name": "foo", "arguments": {"a": 1}}</tool_call>)*
data.grammar_lazy = inputs.tool_choice != "required";
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
std::vector<std::string> tool_rules;
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool["function"];
std::string name = function["name"];
auto parameters = function["parameters"];
builder.resolve_refs(parameters);
tool_rules.push_back(builder.add_schema(name + "-call", {
{"type", "object"},
{"properties", json {
{"name", json {{"const", name}}},
{"arguments", parameters},
}},
{"required", json::array({"name", "arguments"})},
}));
});
auto tool_call = "\"<tool_call>\" space " + builder.add_rule("tool_call", string_join(tool_rules, " | ")) + " \"</tool_call>\" space";
builder.add_rule("root", inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call);
data.grammar_triggers.push_back({"<tool_call>", /* .at_start = */ false});
// Not really a trigger but need to print this special token to get a successful parse.
data.grammar_triggers.push_back({"</tool_call>", /* .at_start = */ false});
}, grammar_options);
data.prompt = tmpl.apply(inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
data.format = COMMON_CHAT_FORMAT_HERMES_2_PRO;
return data;
}
static common_chat_msg common_chat_parse_hermes_2_pro(const std::string & input) {
try {
std::regex start_pattern(R"([\n\s]*<tool_call>)");
std::regex middle_pattern(R"([\n\s]*</tool_call>[\n\s]*<tool_call>)");
std::regex end_pattern(R"([\n\s]*</tool_call>[\n\s]*$)");
auto end = input.end();
std::sregex_iterator rend;
std::sregex_iterator rit(input.begin(), end, start_pattern);
if (rit == rend) {
return {
/* .role = */ "assistant",
/* .content = */ input,
/* .tool_calls = */ {},
};
}
common_chat_msg result;
result.role = "assistant";
result.content = rit->prefix();
auto it = rit->suffix().first;
while (it != end) {
json call;
if (!parse_json(it, end, call)) {
throw std::runtime_error("Failed to parse json tool call");
}
const auto & arguments = call["arguments"];
result.tool_calls.push_back({
call["name"],
arguments.dump(),
// arguments.is_string() ? arguments.get<std::string>() : arguments.dump(),
/* id= */ "",
});
rit = {it, end, middle_pattern};
if (rit != rend) {
it = rit->suffix().first;
} else {
rit = {it, end, end_pattern};
if (rit == rend) {
throw std::runtime_error("Malformed input, missing </tool_call>");
}
break;
}
}
return result;
} catch (const std::exception & e) {
return {
/* .role = */ "assistant",
/* .content = */ input,
/* .tool_calls = */ {},
};
}
}
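// Illustrative Hermes 2 Pro output (tool name/arguments hypothetical):
//   <tool_call>{"name": "get_weather", "arguments": {"location": "Paris"}}</tool_call>
// Text before the first <tool_call> becomes content; each JSON object inside the tags becomes
// a tool call with its arguments re-serialized to a string.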
static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
common_chat_params data;
data.prompt = tmpl.apply(inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
data.grammar_lazy = false;
if (!inputs.json_schema.is_null()) {
if (!inputs.grammar.empty()) {
throw std::runtime_error("Either \"json_schema\" or \"grammar\" can be specified, but not both");
}
data.grammar = json_schema_to_grammar(inputs.json_schema);
} else {
data.grammar = inputs.grammar;
}
return data;
}
common_chat_params common_chat_params_init(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
auto has_tools = !inputs.tools.is_null() && inputs.tool_choice != "none";
LOG_DBG("[%s] has_tools=%s\n", __func__, has_tools ? "true" : "false");
if (has_tools && !inputs.grammar.empty()) {
throw std::runtime_error("Cannot specify grammar with tools");
}
const auto & src = tmpl.source();
if (src.find(">>>all") != std::string::npos) {
// Functionary prepends "all\n" to plain content outputs, so we use its parser even when no tools are involved
return common_chat_params_init_functionary_v3_2(tmpl, inputs);
}
if (src.find(" functools[") != std::string::npos) {
// Firefunction v2 requires datetime and functions in the context, even w/o tools.
return common_chat_params_init_firefunction_v2(tmpl, inputs);
}
if (!has_tools) {
return common_chat_params_init_without_tools(tmpl, inputs);
}
if (src.find("<tool_call>") != std::string::npos) {
return common_chat_params_init_hermes_2_pro(tmpl, inputs);
}
if (src.find("<|start_header_id|>") != std::string::npos
&& src.find("<function=") != std::string::npos) {
return common_chat_params_init_functionary_v3_1_llama_3_1(tmpl, inputs);
}
if (src.find("<|start_header_id|>ipython<|end_header_id|>") != std::string::npos) {
auto allow_python_tag_builtin_tools = src.find("<|python_tag|>") != std::string::npos;
return common_chat_params_init_llama_3_1_tool_calls(tmpl, inputs, allow_python_tag_builtin_tools);
}
if (src.find("<tool▁calls▁begin>") != std::string::npos) {
return common_chat_params_init_deepseek_r1(tmpl, inputs);
}
if (src.find("[TOOL_CALLS]") != std::string::npos) {
return common_chat_params_init_mistral_nemo(tmpl, inputs);
}
return common_chat_params_init_generic(tmpl, inputs);
}
static common_chat_msg common_chat_parse_content_only(const std::string & input) {
return {
/* .role = */ "assistant",
/* .content = */ input,
/* .tool_calls = */ {},
};
}
common_chat_msg common_chat_parse(const std::string & input, common_chat_format format) {
switch (format) {
case COMMON_CHAT_FORMAT_CONTENT_ONLY:
return common_chat_parse_content_only(input);
case COMMON_CHAT_FORMAT_GENERIC:
return common_chat_parse_generic(input);
case COMMON_CHAT_FORMAT_MISTRAL_NEMO:
return common_chat_parse_mistral_nemo(input);
case COMMON_CHAT_FORMAT_LLAMA_3_X:
return common_chat_parse_llama_3_1(input);
case COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS:
return common_chat_parse_llama_3_1(input, /* with_builtin_tools= */ true);
case COMMON_CHAT_FORMAT_DEEPSEEK_R1:
return common_chat_parse_deepseek_r1(input);
case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2:
return common_chat_parse_functionary_v3_2(input);
case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1:
return common_chat_parse_functionary_v3_1_llama_3_1(input);
case COMMON_CHAT_FORMAT_HERMES_2_PRO:
return common_chat_parse_hermes_2_pro(input);
case COMMON_CHAT_FORMAT_FIREFUNCTION_V2:
return common_chat_parse_firefunction_v2(input);
default:
throw std::runtime_error("Unsupported format: " + common_chat_format_name(format));
}
}
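// Minimal end-to-end sketch (illustrative; the message, tools and model output are assumptions):
//   common_chat_inputs inputs;
//   inputs.messages = json::array({{{"role", "user"}, {"content", "What's the weather in Paris?"}}});
//   inputs.tools    = json::array({ /* OpenAI-style function declarations */ });
//   inputs.tool_choice = "auto";
//   inputs.parallel_tool_calls = false;
//   common_chat_params chat = common_chat_params_init(tmpl, inputs);  // tmpl: a common_chat_template
//   // chat.prompt / chat.grammar / chat.grammar_triggers drive generation; the raw model output
//   // is then parsed back according to the detected format:
//   common_chat_msg msg = common_chat_parse(model_output, chat.format);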

View File

@@ -1,50 +0,0 @@
// Chat support (incl. tool call grammar constraining & output parsing) w/ generic & custom template handlers.
#pragma once
#include "common.h"
#include <json.hpp>
#include <optional>
#include <string>
#include <vector>
using json = nlohmann::ordered_json;
struct common_chat_inputs {
json messages;
json tools;
json tool_choice;
json json_schema;
bool parallel_tool_calls;
bool stream;
std::string grammar;
bool add_generation_prompt = true;
};
enum common_chat_format {
COMMON_CHAT_FORMAT_CONTENT_ONLY,
COMMON_CHAT_FORMAT_GENERIC,
COMMON_CHAT_FORMAT_MISTRAL_NEMO,
COMMON_CHAT_FORMAT_LLAMA_3_X,
COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
COMMON_CHAT_FORMAT_DEEPSEEK_R1,
COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
COMMON_CHAT_FORMAT_HERMES_2_PRO,
COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
};
struct common_chat_params {
common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
json prompt;
std::string grammar;
bool grammar_lazy = false;
std::vector<common_grammar_trigger> grammar_triggers;
std::vector<std::string> additional_stops;
};
struct common_chat_params common_chat_params_init(const common_chat_template & tmpl, const struct common_chat_inputs & params);
std::string common_chat_format_name(common_chat_format format);
common_chat_msg common_chat_parse(const std::string & input, common_chat_format format);

View File

@@ -12,8 +12,6 @@
#include "json.hpp"
#include "json-schema-to-grammar.h"
#include "llama.h"
#include "chat.hpp"
#include "chat-template.hpp"
#include <algorithm>
#include <cinttypes>
@@ -75,22 +73,6 @@
#include <sys/syslimits.h>
#endif
#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
//
// CURL utils
//
using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one
struct curl_slist_ptr {
struct curl_slist * ptr = nullptr;
~curl_slist_ptr() {
if (ptr) {
curl_slist_free_all(ptr);
}
}
};
#endif // LLAMA_USE_CURL
using json = nlohmann::ordered_json;
@@ -485,48 +467,6 @@ void string_replace_all(std::string & s, const std::string & search, const std::
s = std::move(builder);
}
std::string string_join(const std::vector<std::string> & values, const std::string & separator) {
std::ostringstream result;
for (size_t i = 0; i < values.size(); ++i) {
if (i > 0) {
result << separator;
}
result << values[i];
}
return result.str();
}
std::vector<std::string> string_split(const std::string & str, const std::string & delimiter) {
std::vector<std::string> parts;
size_t start = 0;
size_t end = str.find(delimiter);
while (end != std::string::npos) {
parts.push_back(str.substr(start, end - start));
start = end + delimiter.length();
end = str.find(delimiter, start);
}
parts.push_back(str.substr(start));
return parts;
}
std::string string_repeat(const std::string & str, size_t n) {
if (n == 0) {
return "";
}
std::string result;
result.reserve(str.length() * n);
for (size_t i = 0; i < n; ++i) {
result += str;
}
return result;
}
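// Quick illustration of the string helpers above (values are arbitrary):
//   string_join({"a", "b", "c"}, ", ")  -> "a, b, c"
//   string_split("a::b::c", "::")       -> {"a", "b", "c"}
//   string_repeat("ab", 3)              -> "ababab"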
std::string string_from(bool value) {
return value ? "true" : "false";
}
@@ -917,23 +857,21 @@ struct common_init_result common_init_from_params(common_params & params) {
return iparams;
}
const llama_vocab * vocab = llama_model_get_vocab(model);
if (params.reranking) {
bool ok = true;
if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
if (llama_token_bos(model) == LLAMA_TOKEN_NULL) {
LOG_WRN("%s: warning: model does not have a BOS token, reranking will not work\n", __func__);
ok = false;
}
if (llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
LOG_WRN("%s: warning: vocab does not have an EOS token, reranking will not work\n", __func__);
if (llama_token_eos(model) == LLAMA_TOKEN_NULL) {
LOG_WRN("%s: warning: model does not have an EOS token, reranking will not work\n", __func__);
ok = false;
}
if (llama_vocab_sep(vocab) == LLAMA_TOKEN_NULL) {
LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
if (llama_token_sep(model) == LLAMA_TOKEN_NULL) {
LOG_WRN("%s: warning: model does not have a SEP token, reranking will not work\n", __func__);
ok = false;
}
@@ -946,7 +884,7 @@ struct common_init_result common_init_from_params(common_params & params) {
auto cparams = common_context_params_to_llama(params);
llama_context * lctx = llama_init_from_model(model, cparams);
llama_context * lctx = llama_new_context_with_model(model, cparams);
if (lctx == NULL) {
LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
llama_model_free(model);
@@ -960,7 +898,7 @@ struct common_init_result common_init_from_params(common_params & params) {
if (!params.control_vectors.empty()) {
if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_model_n_layer(model);
if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
const auto cvec = common_control_vector_load(params.control_vectors);
if (cvec.n_embd == -1) {
@@ -970,13 +908,12 @@ struct common_init_result common_init_from_params(common_params & params) {
return iparams;
}
int err = llama_apply_adapter_cvec(
lctx,
cvec.data.data(),
cvec.data.size(),
cvec.n_embd,
params.control_vector_layer_start,
params.control_vector_layer_end);
int err = llama_control_vector_apply(lctx,
cvec.data.data(),
cvec.data.size(),
cvec.n_embd,
params.control_vector_layer_start,
params.control_vector_layer_end);
if (err) {
llama_free(lctx);
llama_model_free(model);
@@ -987,8 +924,8 @@ struct common_init_result common_init_from_params(common_params & params) {
// load and optionally apply lora adapters
for (auto & la : params.lora_adapters) {
llama_adapter_lora_ptr lora;
lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
llama_lora_adapter_ptr lora;
lora.reset(llama_lora_adapter_init(model, la.path.c_str()));
if (lora == nullptr) {
LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
llama_free(lctx);
@@ -1001,17 +938,17 @@ struct common_init_result common_init_from_params(common_params & params) {
}
if (!params.lora_init_without_apply) {
common_set_adapter_lora(lctx, params.lora_adapters);
common_lora_adapters_apply(lctx, params.lora_adapters);
}
if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
params.sampling.ignore_eos = false;
}
if (params.sampling.ignore_eos) {
for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
if (llama_vocab_is_eog(vocab, i)) {
for (llama_token i = 0; i < llama_n_vocab(model); i++) {
if (llama_token_is_eog(model, i)) {
LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
params.sampling.logit_bias.push_back({i, -INFINITY});
}
@@ -1032,9 +969,8 @@ struct common_init_result common_init_from_params(common_params & params) {
LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
std::vector<llama_token> tmp;
llama_token bos = llama_vocab_bos(vocab);
llama_token eos = llama_vocab_eos(vocab);
llama_token bos = llama_token_bos(model);
llama_token eos = llama_token_eos(model);
// some models (e.g. T5) don't have a BOS token
if (bos != LLAMA_TOKEN_NULL) {
tmp.push_back(bos);
@@ -1069,11 +1005,11 @@ struct common_init_result common_init_from_params(common_params & params) {
return iparams;
}
void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
llama_clear_adapter_lora(ctx);
void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora) {
llama_lora_adapter_clear(ctx);
for (auto & la : lora) {
if (la.scale != 0.0f) {
llama_set_adapter_lora(ctx, la.ptr, la.scale);
llama_lora_adapter_set(ctx, la.ptr, la.scale);
}
}
}
@@ -1087,6 +1023,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
if (params.n_gpu_layers != -1) {
mparams.n_gpu_layers = params.n_gpu_layers;
}
mparams.rpc_servers = params.rpc_servers.c_str();
mparams.main_gpu = params.main_gpu;
mparams.split_mode = params.split_mode;
mparams.tensor_split = params.tensor_split;
@@ -1189,8 +1126,7 @@ static bool curl_perform_with_retry(const std::string & url, CURL * curl, int ma
static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
// Initialize libcurl
curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
curl_slist_ptr http_headers;
std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
if (!curl) {
LOG_ERR("%s: error initializing libcurl\n", __func__);
return false;
@@ -1204,9 +1140,11 @@ static bool common_download_file(const std::string & url, const std::string & pa
// Check if hf-token or bearer-token was specified
if (!hf_token.empty()) {
std::string auth_header = "Authorization: Bearer " + hf_token;
http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
std::string auth_header = "Authorization: Bearer ";
auth_header += hf_token.c_str();
struct curl_slist *http_headers = NULL;
http_headers = curl_slist_append(http_headers, auth_header.c_str());
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers);
}
#if defined(_WIN32)
@@ -1502,80 +1440,6 @@ struct llama_model * common_load_model_from_hf(
return common_load_model_from_url(model_url, local_path, hf_token, params);
}
/**
* Allow getting the HF file from the HF repo with tag (like ollama), for example:
* - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
* - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
* - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
* Tag is optional, defaults to "latest" (meaning it checks for Q4_K_M first, then Q4, and if neither is found, returns the first GGUF file in the repo)
*
* Return pair of <repo, file> (with "repo" already having tag removed)
*
* Note: we use the Ollama-compatible HF API, but we do not use the blobId. Instead, we use the special "ggufFile" field, which returns the value for "hf_file". This is done to stay backward-compatible with existing cache files.
*/
std::pair<std::string, std::string> common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & hf_token) {
auto parts = string_split<std::string>(hf_repo_with_tag, ':');
std::string tag = parts.size() > 1 ? parts.back() : "latest";
std::string hf_repo = parts[0];
if (string_split<std::string>(hf_repo, '/').size() != 2) {
throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
}
// fetch model info from Hugging Face Hub API
json model_info;
curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
curl_slist_ptr http_headers;
std::string res_str;
std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag;
curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
static_cast<std::string *>(data)->append((char * ) ptr, size * nmemb);
return size * nmemb;
};
curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str);
#if defined(_WIN32)
curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
#endif
if (!hf_token.empty()) {
std::string auth_header = "Authorization: Bearer " + hf_token;
http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
}
// Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json");
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
CURLcode res = curl_easy_perform(curl.get());
if (res != CURLE_OK) {
throw std::runtime_error("error: cannot make GET request to HF API");
}
long res_code;
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
if (res_code == 200) {
model_info = json::parse(res_str);
} else if (res_code == 401) {
throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
} else {
throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
}
// check response
if (!model_info.contains("ggufFile")) {
throw std::runtime_error("error: model does not have ggufFile");
}
json & gguf_file = model_info.at("ggufFile");
if (!gguf_file.contains("rfilename")) {
throw std::runtime_error("error: ggufFile does not have rfilename");
}
return std::make_pair(hf_repo, gguf_file.at("rfilename"));
}
#else
struct llama_model * common_load_model_from_url(
@@ -1597,11 +1461,6 @@ struct llama_model * common_load_model_from_hf(
return nullptr;
}
std::pair<std::string, std::string> common_get_hf_file(const std::string &, const std::string &) {
LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
return std::make_pair("", "");
}
#endif // LLAMA_USE_CURL
//
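To make the `<repo>[:<tag>]` convention described in the comment above concrete, here is a hedged standalone sketch (editor's illustration, not the code from the diff; `split_repo_tag` is a hypothetical helper name):

```cpp
// Editor's illustration: the <repo>[:<tag>] splitting described above.
#include <string>
#include <utility>

static std::pair<std::string, std::string> split_repo_tag(const std::string & s) {
    const size_t pos = s.find(':');
    if (pos == std::string::npos) {
        return {s, "latest"};               // tag is optional, defaults to "latest"
    }
    return {s.substr(0, pos), s.substr(pos + 1)};
}

// split_repo_tag("bartowski/Llama-3.2-3B-Instruct-GGUF:q4")
//   -> {"bartowski/Llama-3.2-3B-Instruct-GGUF", "q4"}
// split_repo_tag("bartowski/Llama-3.2-3B-Instruct-GGUF")
//   -> {"bartowski/Llama-3.2-3B-Instruct-GGUF", "latest"}
```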
@@ -1700,23 +1559,21 @@ std::vector<llama_token> common_tokenize(
const std::string & text,
bool add_special,
bool parse_special) {
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
return common_tokenize(vocab, text, add_special, parse_special);
return common_tokenize(llama_get_model(ctx), text, add_special, parse_special);
}
std::vector<llama_token> common_tokenize(
const struct llama_vocab * vocab,
const struct llama_model * model,
const std::string & text,
bool add_special,
bool parse_special) {
// upper limit for the number of tokens
int n_tokens = text.length() + 2 * add_special;
std::vector<llama_token> result(n_tokens);
n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
if (n_tokens < 0) {
result.resize(-n_tokens);
int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
GGML_ASSERT(check == -n_tokens);
} else {
result.resize(n_tokens);
@@ -1725,18 +1582,12 @@ std::vector<llama_token> common_tokenize(
}
std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
return common_token_to_piece(vocab, token, special);
}
std::string common_token_to_piece(const struct llama_vocab * vocab, llama_token token, bool special) {
std::string piece;
piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\0'
const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
if (n_chars < 0) {
piece.resize(-n_chars);
int check = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
int check = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
GGML_ASSERT(check == -n_chars);
}
else {
@@ -1746,19 +1597,13 @@ std::string common_token_to_piece(const struct llama_vocab * vocab, llama_token
return piece;
}
std::string common_detokenize(const struct llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
return common_detokenize(vocab, tokens, special);
}
std::string common_detokenize(const struct llama_vocab * vocab, const std::vector<llama_token> & tokens, bool special) {
std::string common_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
std::string text;
text.resize(std::max(text.capacity(), tokens.size()));
int32_t n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
if (n_chars < 0) {
text.resize(-n_chars);
n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
}
@@ -1772,80 +1617,75 @@ std::string common_detokenize(const struct llama_vocab * vocab, const std::vecto
// Chat template utils
//
bool common_chat_verify_template(const std::string & tmpl, bool use_jinja) {
if (use_jinja) {
try {
auto chat_template = common_chat_template(tmpl, "<s>", "</s>");
common_chat_inputs inputs;
inputs.messages = json::array({{
{"role", "user"},
{"content", "test"},
}});
common_chat_params_init(chat_template, inputs);
return true;
} catch (const std::exception & e) {
LOG_ERR("%s: failed to apply template: %s\n", __func__, e.what());
return false;
}
std::string common_get_builtin_chat_template(const struct llama_model * model) {
static const char * template_key = "tokenizer.chat_template";
// call with NULL buffer to get the total size of the string
int32_t res = llama_model_meta_val_str(model, template_key, NULL, 0);
if (res > 0) {
std::vector<char> model_template(res + 1, 0);
llama_model_meta_val_str(model, template_key, model_template.data(), model_template.size());
return std::string(model_template.data(), model_template.size() - 1);
}
return "";
}
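The "call with a NULL buffer to get the total size" comment above describes a common two-pass pattern; here is a hedged standalone sketch of the same idea using snprintf (editor's illustration, not code from the diff; `format_int` is a hypothetical helper):

```cpp
// Editor's illustration: query the required size first, then fill the buffer.
#include <cstdio>
#include <string>
#include <vector>

static std::string format_int(int value) {
    // first call with a NULL buffer returns the number of characters needed
    const int needed = snprintf(nullptr, 0, "value = %d", value);
    std::vector<char> buf(needed + 1, 0);            // +1 for the terminating '\0'
    snprintf(buf.data(), buf.size(), "value = %d", value);
    return std::string(buf.data(), needed);
}
```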
bool common_chat_verify_template(const std::string & tmpl) {
llama_chat_message chat[] = {{"user", "test"}};
const int res = llama_chat_apply_template(tmpl.c_str(), chat, 1, true, nullptr, 0);
int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
return res >= 0;
}
std::string common_chat_apply_template(
const common_chat_template & tmpl,
std::string common_chat_apply_template(const struct llama_model * model,
const std::string & tmpl,
const std::vector<common_chat_msg> & msgs,
bool add_ass,
bool use_jinja) {
if (use_jinja) {
auto messages = json::array();
for (const auto & msg : msgs) {
messages.push_back({{"role", msg.role}, {"content", msg.content}});
}
common_chat_inputs inputs;
inputs.messages = messages;
inputs.add_generation_prompt = add_ass;
return common_chat_params_init(tmpl, inputs).prompt;
}
bool add_ass) {
int alloc_size = 0;
bool fallback = false; // indicate if we must fallback to default chatml
std::vector<llama_chat_message> chat;
for (const auto & msg : msgs) {
for (auto & msg : msgs) {
chat.push_back({msg.role.c_str(), msg.content.c_str()});
alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
}
const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
std::vector<char> buf(alloc_size);
// run the first time to get the total output length
int32_t res = llama_chat_apply_template(tmpl.source().c_str(), chat.data(), chat.size(), add_ass, buf.data(), buf.size());
int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
// error: chat template is not supported
if (res < 0) {
// if the custom "tmpl" is not supported, we throw an error
// this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
throw std::runtime_error("this custom template is not supported");
if (ptr_tmpl != nullptr) {
// if the custom "tmpl" is not supported, we throw an error
// this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
throw std::runtime_error("this custom template is not supported");
} else {
// If the built-in template is not supported, we default to chatml
res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
fallback = true;
}
}
// if it turns out that our buffer is too small, we resize it
if ((size_t) res > buf.size()) {
buf.resize(res);
res = llama_chat_apply_template(tmpl.source().c_str(), chat.data(), chat.size(), add_ass, buf.data(), buf.size());
res = llama_chat_apply_template(
fallback ? nullptr : model,
fallback ? "chatml" : ptr_tmpl,
chat.data(), chat.size(), add_ass, buf.data(), buf.size());
}
std::string formatted_chat(buf.data(), res);
return formatted_chat;
}
std::string common_chat_format_single(
const common_chat_template & tmpl,
std::string common_chat_format_single(const struct llama_model * model,
const std::string & tmpl,
const std::vector<common_chat_msg> & past_msg,
const common_chat_msg & new_msg,
bool add_ass,
bool use_jinja) {
bool add_ass) {
std::ostringstream ss;
auto fmt_past_msg = past_msg.empty() ? "" : common_chat_apply_template(tmpl, past_msg, false, use_jinja);
auto fmt_past_msg = past_msg.empty() ? "" : common_chat_apply_template(model, tmpl, past_msg, false);
std::vector<common_chat_msg> chat_new(past_msg);
// if the past_msg ends with a newline, we must preserve it in the formatted version
if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
@@ -1853,74 +1693,21 @@ std::string common_chat_format_single(
};
// format chat with new_msg
chat_new.push_back(new_msg);
auto fmt_new_msg = common_chat_apply_template(tmpl, chat_new, add_ass, use_jinja);
auto fmt_new_msg = common_chat_apply_template(model, tmpl, chat_new, add_ass);
// get the diff part
ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
return ss.str();
}
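The "get the diff part" step above formats the chat twice and returns only the suffix that was not present before; a hedged standalone sketch of that idea (editor's illustration with a placeholder formatter, not the real chat template call):

```cpp
// Editor's illustration: format twice and take the suffix, as done by
// common_chat_format_single above.
#include <string>
#include <vector>

static std::string fake_format(const std::vector<std::string> & msgs) {
    std::string out;
    for (const auto & m : msgs) { out += "<|msg|>" + m + "\n"; }
    return out;
}

static std::string format_single(std::vector<std::string> history, const std::string & new_msg) {
    const std::string fmt_past = fake_format(history);   // everything already emitted
    history.push_back(new_msg);
    const std::string fmt_new  = fake_format(history);   // history plus the new message
    return fmt_new.substr(fmt_past.size());               // only the newly added part
}
```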
std::string common_chat_format_example(const common_chat_template & tmpl, bool use_jinja) {
std::string common_chat_format_example(const struct llama_model * model,
const std::string & tmpl) {
std::vector<common_chat_msg> msgs = {
{"system", "You are a helpful assistant", {}},
{"user", "Hello", {}},
{"assistant", "Hi there", {}},
{"user", "How are you?", {}},
};
return common_chat_apply_template(tmpl, msgs, true, use_jinja);
}
common_chat_templates common_chat_templates_from_model(const struct llama_model * model, const std::string & chat_template_override)
{
auto vocab = llama_model_get_vocab(model);
std::string default_template_src = chat_template_override;
std::string template_tool_use_src = chat_template_override;
bool has_explicit_template = !chat_template_override.empty();
if (chat_template_override.empty()) {
auto str = llama_model_chat_template(model, /* name */ nullptr);
if (str) {
default_template_src = str;
has_explicit_template = true;
}
str = llama_model_chat_template(model, /* name */ "tool_use");
if (str) {
template_tool_use_src = str;
has_explicit_template = true;
}
}
if (default_template_src.empty() || default_template_src == "chatml") {
if (!template_tool_use_src.empty()) {
default_template_src = template_tool_use_src;
} else {
default_template_src = R"(
{%- for message in messages -%}
{{- "<|im_start|>" + message.role + "\n" + message.content + "<|im_end|>\n" -}}
{%- endfor -%}
{%- if add_generation_prompt -%}
{{- "<|im_start|>assistant\n" -}}
{%- endif -%}
)";
}
}
const auto get_token = [&](llama_token token, const char * name, const char * jinja_variable_name) {
if (token == LLAMA_TOKEN_NULL) {
if (default_template_src.find(jinja_variable_name) != std::string::npos
|| template_tool_use_src.find(jinja_variable_name) != std::string::npos) {
LOG_WRN("%s: warning: vocab does not have a %s token, jinja template won't work as intended.\n", __func__, name);
}
return std::string();
} else {
return common_token_to_piece(vocab, token, true);
}
};
auto token_bos = get_token(llama_vocab_bos(vocab), "BOS", "bos_token");
auto token_eos = get_token(llama_vocab_eos(vocab), "EOS", "eos_token");
return {
has_explicit_template,
std::make_unique<minja::chat_template>(default_template_src, token_bos, token_eos),
template_tool_use_src.empty()
? nullptr
: std::make_unique<minja::chat_template>(template_tool_use_src, token_bos, token_eos)
{"system", "You are a helpful assistant"},
{"user", "Hello"},
{"assistant", "Hi there"},
{"user", "How are you?"},
};
return common_chat_apply_template(model, tmpl, msgs, true);
}
//

View File

@@ -24,11 +24,11 @@
#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
struct common_adapter_lora_info {
struct common_lora_adapter_info {
std::string path;
float scale;
struct llama_adapter_lora * ptr;
struct llama_lora_adapter * ptr;
};
using llama_tokens = std::vector<llama_token>;
@@ -103,17 +103,6 @@ enum dimre_method {
DIMRE_METHOD_MEAN,
};
enum common_conversation_mode {
COMMON_CONVERSATION_MODE_DISABLED = 0,
COMMON_CONVERSATION_MODE_ENABLED = 1,
COMMON_CONVERSATION_MODE_AUTO = 2,
};
struct common_grammar_trigger {
std::string word;
bool at_start;
};
// sampling parameters
struct common_params_sampling {
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
@@ -159,10 +148,7 @@ struct common_params_sampling {
COMMON_SAMPLER_TYPE_TEMPERATURE,
};
std::string grammar; // optional BNF-like grammar to constrain sampling
bool grammar_lazy = false;
std::vector<common_grammar_trigger> grammar_trigger_words; // optional trigger words to trigger lazy grammar
std::vector<llama_token> grammar_trigger_tokens; // optional trigger tokens to trigger lazy grammar and print trigger special tokens.
std::string grammar; // optional BNF-like grammar to constrain sampling
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
@@ -183,11 +169,7 @@ struct common_params_speculative {
struct cpu_params cpuparams;
struct cpu_params cpuparams_batch;
std::string hf_repo = ""; // HF repo // NOLINT
std::string hf_file = ""; // HF file // NOLINT
std::string model = ""; // draft model for speculative decoding // NOLINT
std::string model_url = ""; // model url to download // NOLINT
std::string model = ""; // draft model for speculative decoding // NOLINT
};
struct common_params_vocoder {
@@ -196,8 +178,6 @@ struct common_params_vocoder {
std::string model = ""; // model path // NOLINT
std::string model_url = ""; // model url to download // NOLINT
bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
};
struct common_params {
@@ -260,13 +240,14 @@ struct common_params {
std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
std::string logits_file = ""; // file for saving *all* logits // NOLINT
std::string rpc_servers = ""; // comma separated list of RPC servers // NOLINT
std::vector<std::string> in_files; // all input files
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
std::vector<llama_model_kv_override> kv_overrides;
bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
std::vector<common_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
@@ -294,6 +275,7 @@ struct common_params {
bool special = false; // enable special token output
bool interactive = false; // interactive mode
bool interactive_first = false; // wait for user input immediately
bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
bool prompt_cache_all = false; // save user input and generations to prompt cache
bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
@@ -319,8 +301,6 @@ struct common_params {
ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
// multimodal models (see examples/llava)
std::string mmproj = ""; // path to multimodal projector // NOLINT
std::vector<std::string> image; // path to image file(s)
@@ -342,7 +322,6 @@ struct common_params {
std::string hostname = "127.0.0.1";
std::string public_path = ""; // NOLINT
std::string chat_template = ""; // NOLINT
bool use_jinja = false; // NOLINT
bool enable_chat_template = true;
std::vector<std::string> api_keys;
@@ -437,10 +416,6 @@ std::string string_format(const char * fmt, ...);
std::string string_strip(const std::string & str);
std::string string_get_sortable_timestamp();
std::string string_join(const std::vector<std::string> & values, const std::string & separator);
std::vector<std::string> string_split(const std::string & str, const std::string & delimiter);
std::string string_repeat(const std::string & str, size_t n);
void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
template<class T>
@@ -479,11 +454,6 @@ static bool string_starts_with(const std::string & str,
return str.rfind(prefix, 0) == 0;
}
static bool string_ends_with(const std::string & str,
const std::string & suffix) { // While we wait for C++20's std::string::ends_with...
return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
}
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
void string_process_escapes(std::string & input);
@@ -511,7 +481,7 @@ struct common_init_result {
llama_model_ptr model;
llama_context_ptr context;
std::vector<llama_adapter_lora_ptr> lora;
std::vector<llama_lora_adapter_ptr> lora;
};
struct common_init_result common_init_from_params(common_params & params);
@@ -525,7 +495,6 @@ struct llama_model * common_load_model_from_url(
const std::string & local_path,
const std::string & hf_token,
const struct llama_model_params & params);
struct llama_model * common_load_model_from_hf(
const std::string & repo,
const std::string & remote_path,
@@ -533,12 +502,8 @@ struct llama_model * common_load_model_from_hf(
const std::string & hf_token,
const struct llama_model_params & params);
std::pair<std::string, std::string> common_get_hf_file(
const std::string & hf_repo_with_tag,
const std::string & hf_token);
// clear LoRA adapters from context, then apply new list of adapters
void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora);
//
// Batch utils
@@ -576,7 +541,7 @@ std::vector<llama_token> common_tokenize(
bool parse_special = false);
std::vector<llama_token> common_tokenize(
const struct llama_vocab * vocab,
const struct llama_model * model,
const std::string & text,
bool add_special,
bool parse_special = false);
@@ -588,21 +553,11 @@ std::string common_token_to_piece(
llama_token token,
bool special = true);
std::string common_token_to_piece(
const struct llama_vocab * vocab,
llama_token token,
bool special = true);
// detokenizes a vector of tokens into a string
// should work similar to Python's `tokenizer.decode`
// optionally renders special/control tokens
std::string common_detokenize(
const struct llama_context * ctx,
const std::vector<llama_token> & tokens,
bool special = true);
std::string common_detokenize(
const struct llama_vocab * vocab,
llama_context * ctx,
const std::vector<llama_token> & tokens,
bool special = true);
@@ -610,56 +565,36 @@ std::string common_detokenize(
// Chat template utils
//
struct common_tool_call {
std::string name;
std::string arguments;
std::string id;
};
// same as llama_chat_message, but uses std::string
struct common_chat_msg {
std::string role;
std::string content;
std::vector<common_tool_call> tool_calls;
};
// Get the built-in chat template for the model. Return empty string if not present.
std::string common_get_builtin_chat_template(const struct llama_model * model);
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
bool common_chat_verify_template(const std::string & tmpl, bool use_jinja);
namespace minja {
class chat_template;
}
typedef minja::chat_template common_chat_template;
struct common_chat_templates {
bool has_explicit_template; // Model had a built-in template, or a template override was specified.
std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
std::unique_ptr<common_chat_template> template_tool_use;
};
bool common_chat_verify_template(const std::string & tmpl);
// CPP wrapper for llama_chat_apply_template
// If the built-in template is not supported, we default to chatml
// If the custom "tmpl" is not supported, we throw an error
std::string common_chat_apply_template(
const common_chat_template & tmpl,
std::string common_chat_apply_template(const struct llama_model * model,
const std::string & tmpl,
const std::vector<common_chat_msg> & chat,
bool add_ass,
bool use_jinja);
bool add_ass);
// Format single message, while taking into account the position of that message in chat history
std::string common_chat_format_single(
const common_chat_template & tmpl,
std::string common_chat_format_single(const struct llama_model * model,
const std::string & tmpl,
const std::vector<common_chat_msg> & past_msg,
const common_chat_msg & new_msg,
bool add_ass,
bool use_jinja);
bool add_ass);
// Returns an example of formatted chat
std::string common_chat_format_example(
const common_chat_template & tmpl, bool use_jinja);
common_chat_templates common_chat_templates_from_model(const struct llama_model * model, const std::string & chat_template_override);
std::string common_chat_format_example(const struct llama_model * model,
const std::string & tmpl);
//
// KV cache utils

View File

@@ -1,6 +1,4 @@
#include "json-schema-to-grammar.h"
#include "common.h"
#include <algorithm>
#include <fstream>
#include <map>
@@ -13,6 +11,11 @@
using json = nlohmann::ordered_json;
template <typename Iterator>
static std::string join(Iterator begin, Iterator end, const std::string & separator);
static std::string repeat(const std::string & str, size_t n);
static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") {
auto has_max = max_items != std::numeric_limits<int>::max();
@@ -125,8 +128,8 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
if (sub_len > 0) {
auto from_sub = from.substr(i + 1);
auto to_sub = to.substr(i + 1);
auto sub_zeros = string_repeat("0", sub_len);
auto sub_nines = string_repeat("9", sub_len);
auto sub_zeros = repeat("0", sub_len);
auto sub_nines = repeat("9", sub_len);
auto to_reached = false;
out << "(";
@@ -185,8 +188,8 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
auto max_digits = max_s.length();
for (auto digits = min_digits; digits < max_digits; digits++) {
uniform_range(min_s, string_repeat("9", digits));
min_s = "1" + string_repeat("0", digits);
uniform_range(min_s, repeat("9", digits));
min_s = "1" + repeat("0", digits);
out << " | ";
}
uniform_range(min_s, max_s);
@@ -315,6 +318,49 @@ std::unordered_map<char, std::string> GRAMMAR_LITERAL_ESCAPES = {
std::unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'^', '$', '.', '[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};
template <typename Iterator>
std::string join(Iterator begin, Iterator end, const std::string & separator) {
std::ostringstream result;
if (begin != end) {
result << *begin;
for (Iterator it = begin + 1; it != end; ++it) {
result << separator << *it;
}
}
return result.str();
}
static std::vector<std::string> split(const std::string & str, const std::string & delimiter) {
std::vector<std::string> tokens;
size_t start = 0;
size_t end = str.find(delimiter);
while (end != std::string::npos) {
tokens.push_back(str.substr(start, end - start));
start = end + delimiter.length();
end = str.find(delimiter, start);
}
tokens.push_back(str.substr(start));
return tokens;
}
static std::string repeat(const std::string & str, size_t n) {
if (n == 0) {
return "";
}
std::string result;
result.reserve(str.length() * n);
for (size_t i = 0; i < n; ++i) {
result += str;
}
return result;
}
static std::string replacePattern(const std::string & input, const std::regex & regex, const std::function<std::string(const std::smatch &)> & replacement) {
std::smatch match;
std::string result;
@@ -343,7 +389,6 @@ static std::string format_literal(const std::string & literal) {
class SchemaConverter {
private:
friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
std::function<json(const std::string &)> _fetch_json;
bool _dotall;
std::map<std::string, std::string> _rules;
@@ -373,7 +418,7 @@ private:
for (size_t i = 0; i < alt_schemas.size(); i++) {
rules.push_back(visit(alt_schemas[i], name + (name.empty() ? "alternative-" : "-") + std::to_string(i)));
}
return string_join(rules, " | ");
return join(rules.begin(), rules.end(), " | ");
}
std::string _visit_pattern(const std::string & pattern, const std::string & name) {
@@ -436,7 +481,7 @@ private:
for (const auto & item : ret) {
results.push_back(to_rule(item));
}
return std::make_pair(string_join(results, " "), false);
return std::make_pair(join(results.begin(), results.end(), " "), false);
};
while (i < length) {
@@ -494,7 +539,7 @@ private:
}
curly_brackets += '}';
i++;
auto nums = string_split(curly_brackets.substr(1, curly_brackets.length() - 2), ",");
auto nums = split(curly_brackets.substr(1, curly_brackets.length() - 2), ",");
int min_times = 0;
int max_times = std::numeric_limits<int>::max();
try {
@@ -764,11 +809,10 @@ private:
public:
SchemaConverter(
const std::function<json(const std::string &)> & fetch_json,
bool dotall,
bool compact_spaces)
bool dotall)
: _fetch_json(fetch_json), _dotall(dotall)
{
_rules["space"] = compact_spaces ? "\" \"?" : SPACE_RULE;
_rules["space"] = SPACE_RULE;
}
void resolve_refs(json & schema, const std::string & url) {
@@ -810,7 +854,7 @@ public:
return;
}
std::string pointer = ref.substr(ref.find('#') + 1);
std::vector<std::string> tokens = string_split(pointer, "/");
std::vector<std::string> tokens = split(pointer, "/");
for (size_t i = 1; i < tokens.size(); ++i) {
std::string sel = tokens[i];
if (target.is_null() || !target.contains(sel)) {
@@ -861,7 +905,7 @@ public:
for (const auto & v : schema["enum"]) {
enum_values.push_back(_generate_constant_rule(v));
}
return _add_rule(rule_name, "(" + string_join(enum_values, " | ") + ") space");
return _add_rule(rule_name, "(" + join(enum_values.begin(), enum_values.end(), " | ") + ") space");
} else if ((schema_type.is_null() || schema_type == "object")
&& (schema.contains("properties") ||
(schema.contains("additionalProperties") && schema["additionalProperties"] != true))) {
@@ -975,10 +1019,10 @@ public:
void check_errors() {
if (!_errors.empty()) {
throw std::runtime_error("JSON schema conversion failed:\n" + string_join(_errors, "\n"));
throw std::runtime_error("JSON schema conversion failed:\n" + join(_errors.begin(), _errors.end(), "\n"));
}
if (!_warnings.empty()) {
fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n", string_join(_warnings, "; ").c_str());
fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n", join(_warnings.begin(), _warnings.end(), "; ").c_str());
}
}
@@ -992,27 +1036,10 @@ public:
};
std::string json_schema_to_grammar(const json & schema) {
return build_grammar([&](const common_grammar_builder & callbacks) {
auto copy = schema;
callbacks.resolve_refs(copy);
callbacks.add_schema("", copy);
});
}
std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options) {
SchemaConverter converter([&](const std::string &) { return json(); }, options.dotall, options.compact_spaces);
common_grammar_builder builder {
/* .add_rule = */ [&](const std::string & name, const std::string & rule) {
return converter._add_rule(name, rule);
},
/* .add_schema = */ [&](const std::string & name, const nlohmann::ordered_json & schema) {
return converter.visit(schema, name == "root" ? "" : name);
},
/* .resolve_refs = */ [&](nlohmann::ordered_json & schema) {
converter.resolve_refs(schema, "");
}
};
cb(builder);
SchemaConverter converter([](const std::string &) { return json::object(); }, /* dotall= */ false);
auto copy = schema;
converter.resolve_refs(copy, "input");
converter.visit(copy, "");
converter.check_errors();
return converter.format_grammar();
}

View File

@@ -5,17 +5,4 @@
#define JSON_ASSERT GGML_ASSERT
#include "json.hpp"
std::string json_schema_to_grammar(const nlohmann::ordered_json & schema);
struct common_grammar_builder {
std::function<std::string(const std::string &, const std::string &)> add_rule;
std::function<std::string(const std::string &, const nlohmann::ordered_json &)> add_schema;
std::function<void(nlohmann::ordered_json &)> resolve_refs;
};
struct common_grammar_options {
bool dotall = false;
bool compact_spaces = false;
};
std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options = {});
std::string json_schema_to_grammar(const nlohmann::ordered_json& schema);

View File

@@ -206,7 +206,6 @@ public:
vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args_copy);
}
#endif
va_end(args_copy);
}
entry.level = level;

File diff suppressed because it is too large

View File

@@ -113,10 +113,7 @@ struct common_sampler {
void set_logits(struct llama_context * ctx, int idx) {
const auto * logits = llama_get_logits_ith(ctx, idx);
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
const int n_vocab = llama_vocab_n_tokens(vocab);
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
cur.resize(n_vocab);
@@ -145,24 +142,13 @@ std::string common_params_sampling::print() const {
}
struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params) {
const llama_vocab * vocab = llama_model_get_vocab(model);
llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
lparams.no_perf = params.no_perf;
std::vector<const char *> trigger_words;
trigger_words.reserve(params.grammar_trigger_words.size());
for (const auto & str : params.grammar_trigger_words) {
trigger_words.push_back(str.word.c_str());
}
auto * result = new common_sampler {
/* .params = */ params,
/* .grmr = */ params.grammar_lazy
? llama_sampler_init_grammar_lazy(vocab, params.grammar.c_str(), "root",
trigger_words.data(), trigger_words.size(),
params.grammar_trigger_tokens.data(), params.grammar_trigger_tokens.size())
: llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root"),
/* .grmr = */ llama_sampler_init_grammar(model, params.grammar.c_str(), "root"),
/* .chain = */ llama_sampler_chain_init(lparams),
/* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
/* .cur = */ {},
@@ -171,7 +157,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
llama_sampler_chain_add(result->chain,
llama_sampler_init_logit_bias(
llama_vocab_n_tokens(vocab),
llama_n_vocab(model),
params.logit_bias.size(),
params.logit_bias.data()));
@@ -186,7 +172,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
c_breakers.push_back(str.c_str());
}
llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
llama_sampler_chain_add(result->chain, llama_sampler_init_dry (model, params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
}
break;
case COMMON_SAMPLER_TYPE_TOP_K:
@@ -208,7 +194,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
break;
case COMMON_SAMPLER_TYPE_INFILL:
llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab));
llama_sampler_chain_add(result->chain, llama_sampler_init_infill (model));
break;
case COMMON_SAMPLER_TYPE_PENALTIES:
llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
@@ -220,7 +206,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
} else if (params.mirostat == 1) {
llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
} else if (params.mirostat == 2) {
llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));

View File

@@ -79,13 +79,10 @@ bool common_speculative_are_compatible(
const struct llama_model * model_tgt = llama_get_model(ctx_tgt);
const struct llama_model * model_dft = llama_get_model(ctx_dft);
const struct llama_vocab * vocab_tgt = llama_model_get_vocab(model_tgt);
const struct llama_vocab * vocab_dft = llama_model_get_vocab(model_dft);
const bool vocab_type_tgt = llama_vocab_type(vocab_tgt);
const bool vocab_type_tgt = llama_vocab_type(model_tgt);
LOG_DBG("%s: vocab_type tgt: %d\n", __func__, vocab_type_tgt);
const bool vocab_type_dft = llama_vocab_type(vocab_dft);
const bool vocab_type_dft = llama_vocab_type(model_dft);
LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft);
if (vocab_type_tgt != vocab_type_dft) {
@@ -94,34 +91,34 @@ bool common_speculative_are_compatible(
return false;
}
if (llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) ||
llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft) ||
llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft)) {
LOG_ERR("%s: draft vocab special tokens must match target vocab to use speculation\n", __func__);
LOG_ERR("%s: tgt: bos = %d (%d), eos = %d (%d)\n", __func__, llama_vocab_bos(vocab_tgt), llama_vocab_get_add_bos(vocab_tgt), llama_vocab_eos(vocab_tgt), llama_vocab_get_add_eos(vocab_tgt));
LOG_ERR("%s: dft: bos = %d (%d), eos = %d (%d)\n", __func__, llama_vocab_bos(vocab_dft), llama_vocab_get_add_bos(vocab_dft), llama_vocab_eos(vocab_dft), llama_vocab_get_add_eos(vocab_dft));
if (llama_add_bos_token(model_tgt) != llama_add_bos_token(model_dft) ||
llama_add_eos_token(model_tgt) != llama_add_eos_token(model_dft) ||
llama_token_bos(model_tgt) != llama_token_bos(model_dft) ||
llama_token_eos(model_tgt) != llama_token_eos(model_dft)) {
LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__);
LOG_ERR("%s: tgt: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(model_tgt), llama_add_bos_token(model_tgt), llama_token_eos(model_tgt), llama_add_eos_token(model_tgt));
LOG_ERR("%s: dft: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(model_dft), llama_add_bos_token(model_dft), llama_token_eos(model_dft), llama_add_eos_token(model_dft));
return false;
}
{
const int n_vocab_tgt = llama_vocab_n_tokens(vocab_tgt);
const int n_vocab_dft = llama_vocab_n_tokens(vocab_dft);
const int n_vocab_tgt = llama_n_vocab(model_tgt);
const int n_vocab_dft = llama_n_vocab(model_dft);
const int vocab_diff = std::abs(n_vocab_tgt - n_vocab_dft);
if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
LOG_ERR("%s: draft model vocab must closely match target model to use speculation but "
"target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
__func__, n_vocab_tgt, llama_vocab_n_tokens(vocab_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
__func__, n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
return false;
}
for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) {
const char * token_text_tgt = llama_vocab_get_text(vocab_tgt, i);
const char * token_text_dft = llama_vocab_get_text(vocab_dft, i);
const char * token_text_tgt = llama_token_get_text(model_tgt, i);
const char * token_text_dft = llama_token_get_text(model_dft, i);
if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
LOG_ERR("%s: draft vocab vocab must match target vocab to use speculation but "
LOG_ERR("%s: draft model vocab must match target model to use speculation but "
"token %d content differs - target '%s', draft '%s'\n", __func__, i,
common_token_to_piece(ctx_tgt, i).c_str(),
common_token_to_piece(ctx_dft, i).c_str());

View File

@@ -696,9 +696,6 @@ class Model:
if chkhsh == "877081d19cf6996e2c4ff0e1236341e9b7bde288f5311a56a937f0afbbb3aeb5":
# ref: https://huggingface.co/deepseek-ai/DeepSeek-V3
res = "deepseek-v3"
if chkhsh == "b3f499bb4255f8ca19fccd664443283318f2fd2414d5e0b040fbdd0cc195d6c5":
# ref: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
res = "deepseek-r1-qwen"
if res is None:
logger.warning("\n")
@@ -2885,66 +2882,6 @@ class InternLM2Model(Model):
return [(self.map_tensor_name(name), data_torch)]
@Model.register("InternLM3ForCausalLM")
class InternLM3Model(Model):
model_arch = gguf.MODEL_ARCH.LLAMA
def set_vocab(self):
tokens, scores, toktypes = self._create_vocab_sentencepiece()
self.gguf_writer.add_tokenizer_model("llama")
self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
if tokenizer_config_file.is_file():
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
tokenizer_config_json = json.load(f)
if "add_prefix_space" in tokenizer_config_json:
self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
if "added_tokens_decoder" in tokenizer_config_json:
for token_id, token_data in tokenizer_config_json["added_tokens_decoder"].items():
if token_data.get("special"):
token_id = int(token_id)
token = token_data["content"]
special_vocab._set_special_token(token, token_id)
# update eos token
if token == '<|im_end|>' and "eos" in special_vocab.special_token_ids:
special_vocab.special_token_ids["eos"] = token_id
special_vocab.add_to_gguf(self.gguf_writer)
def set_gguf_parameters(self):
super().set_gguf_parameters()
hparams = self.hparams
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
if "head_dim" in hparams:
rope_dim = hparams["head_dim"]
else:
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
self.gguf_writer.add_rope_dimension_count(rope_dim)
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
if self.hparams["rope_scaling"].get("type") == "linear" or self.hparams["rope_scaling"].get("rope_type") == "linear":
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
n_head = self.hparams["num_attention_heads"]
n_kv_head = self.hparams.get("num_key_value_heads")
if name.endswith(("q_proj.weight", "q_proj.bias")):
data_torch = LlamaModel.permute(data_torch, n_head, n_head)
if name.endswith(("k_proj.weight", "k_proj.bias")):
data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
return [(self.map_tensor_name(name), data_torch)]
@Model.register("BertModel", "BertForMaskedLM", "CamembertModel")
class BertModel(Model):
model_arch = gguf.MODEL_ARCH.BERT

View File

@@ -65,50 +65,49 @@ else:
# TODO: add models here, base models preferred
models = [
{"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
{"name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
{"name": "phi-3", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
{"name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
{"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
{"name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
{"name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
{"name": "falcon3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon3-7B-Base", },
{"name": "bert-bge-large", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/BAAI/bge-large-zh-v1.5", },
{"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
{"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
{"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
{"name": "stablelm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", },
{"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
{"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
{"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
{"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
{"name": "jina-v1-en", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-reranker-v1-tiny-en", },
{"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
{"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
{"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
{"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B
{"name": "gemma", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2b", },
{"name": "gemma-2", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", },
{"name": "jais", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", },
{"name": "t5", "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", },
{"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", },
{"name": "tekken", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },
{"name": "smollm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", },
{'name': "bloom", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom", },
{'name': "gpt3-finnish", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", },
{"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
{"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
{"name": "chameleon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", },
{"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", },
{"name": "roberta-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sentence-transformers/stsb-roberta-base"},
{"name": "gigachat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct"},
{"name": "megrez", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Infinigence/Megrez-3B-Instruct"},
{"name": "deepseek-v3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-V3"},
{"name": "deepseek-r1-qwen", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"},
{"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
{"name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
{"name": "phi-3", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
{"name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
{"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
{"name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
{"name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
{"name": "falcon3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon3-7B-Base", },
{"name": "bert-bge-large", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/BAAI/bge-large-zh-v1.5", },
{"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
{"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
{"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
{"name": "stablelm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", },
{"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
{"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
{"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
{"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
{"name": "jina-v1-en", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-reranker-v1-tiny-en", },
{"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
{"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
{"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
{"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B
{"name": "gemma", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2b", },
{"name": "gemma-2", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", },
{"name": "jais", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", },
{"name": "t5", "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", },
{"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", },
{"name": "tekken", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },
{"name": "smollm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", },
{'name': "bloom", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom", },
{'name': "gpt3-finnish", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", },
{"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
{"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
{"name": "chameleon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", },
{"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", },
{"name": "roberta-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sentence-transformers/stsb-roberta-base"},
{"name": "gigachat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct"},
{"name": "megrez", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Infinigence/Megrez-3B-Instruct"},
{"name": "deepseek-v3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-V3"},
]

View File

@@ -133,7 +133,7 @@ The docker build option is currently limited to *intel GPU* targets.
### Build image
```sh
# Using FP16
docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" --target light -f .devops/intel.Dockerfile .
docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" -f .devops/llama-cli-intel.Dockerfile .
```
*Notes*:

View File

@@ -286,7 +286,7 @@ You don't need to install Vulkan SDK. It will be installed inside the container.
```sh
# Build the image
docker build -t llama-cpp-vulkan --target light -f .devops/vulkan.Dockerfile .
docker build -t llama-cpp-vulkan -f .devops/llama-cli-vulkan.Dockerfile .
# Then, use it:
docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33

View File

@@ -60,9 +60,9 @@ Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia
## Building Docker locally
```bash
docker build -t local/llama.cpp:full-cuda --target full -f .devops/cuda.Dockerfile .
docker build -t local/llama.cpp:light-cuda --target light -f .devops/cuda.Dockerfile .
docker build -t local/llama.cpp:server-cuda --target server -f .devops/cuda.Dockerfile .
docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile .
docker build -t local/llama.cpp:light-cuda -f .devops/llama-cli-cuda.Dockerfile .
docker build -t local/llama.cpp:server-cuda -f .devops/llama-server-cuda.Dockerfile .
```
You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture.
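For example, a hedged sketch of overriding the CUDA base image and target GPU architecture when building the light image (the `CUDA_VERSION` and `CUDA_DOCKER_ARCH` ARG names are assumptions; verify them against the ARGs declared at the top of `.devops/cuda.Dockerfile`):
```bash
# Hypothetical example: the --build-arg names below are assumptions, check .devops/cuda.Dockerfile.
docker build -t local/llama.cpp:light-cuda \
  --build-arg CUDA_VERSION=12.4.0 \
  --build-arg CUDA_DOCKER_ARCH=compute_86 \
  --target light -f .devops/cuda.Dockerfile .
```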
@@ -95,9 +95,9 @@ Assuming one has the [mt-container-toolkit](https://developer.mthreads.com/musa/
## Building Docker locally
```bash
docker build -t local/llama.cpp:full-musa --target full -f .devops/musa.Dockerfile .
docker build -t local/llama.cpp:light-musa --target light -f .devops/musa.Dockerfile .
docker build -t local/llama.cpp:server-musa --target server -f .devops/musa.Dockerfile .
docker build -t local/llama.cpp:full-musa -f .devops/full-musa.Dockerfile .
docker build -t local/llama.cpp:light-musa -f .devops/llama-cli-musa.Dockerfile .
docker build -t local/llama.cpp:server-musa -f .devops/llama-server-musa.Dockerfile .
```
You may want to pass in some different `ARGS`, depending on the MUSA environment supported by your container host, as well as the GPU architecture.
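The MUSA images follow the same pattern; a minimal sketch, assuming the Dockerfile exposes a `MUSA_VERSION` ARG (check `.devops/musa.Dockerfile` for the actual names):
```bash
# Hypothetical example: MUSA_VERSION is an assumed ARG name, check .devops/musa.Dockerfile.
docker build -t local/llama.cpp:light-musa \
  --build-arg MUSA_VERSION=rc3.1.0 \
  --target light -f .devops/musa.Dockerfile .
```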

View File

@@ -50,7 +50,7 @@ int main(int argc, char ** argv) {
// ensure enough sequences are available
ctx_params.n_seq_max = n_pl.empty() ? 1 : *std::max_element(n_pl.begin(), n_pl.end());
llama_context * ctx = llama_init_from_model(model, ctx_params);
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
if (ctx == NULL) {
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);

View File

@@ -23,12 +23,12 @@ defer {
}
let model_params = llama_model_default_params()
guard let model = llama_model_load_from_file(modelPath.cString(using: .utf8), model_params) else {
guard let model = llama_load_model_from_file(modelPath.cString(using: .utf8), model_params) else {
print("Failed to load model")
exit(1)
}
defer {
llama_model_free(model)
llama_free_model(model)
}
var tokens = tokenize(text: prompt, add_bos: true)
@@ -141,7 +141,7 @@ while n_cur <= n_len {
let new_token_id = llama_sampler_sample(smpl, context, i_batch[i])
// is it an end of stream? -> mark the stream as finished
if llama_vocab_is_eog(model, new_token_id) || n_cur == n_len {
if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
i_batch[i] = -1
// print("")
if n_parallel > 1 {

View File

@@ -48,12 +48,10 @@ int main(int argc, char ** argv) {
return 1;
}
const llama_vocab * vocab = llama_model_get_vocab(model);
// tokenize the prompt
std::vector<llama_token> tokens_list;
tokens_list = common_tokenize(vocab, params.prompt, true);
tokens_list = common_tokenize(model, params.prompt, true);
const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size())*n_parallel;
@@ -64,7 +62,7 @@ int main(int argc, char ** argv) {
ctx_params.n_ctx = n_kv_req;
ctx_params.n_batch = std::max(n_predict, n_parallel);
llama_context * ctx = llama_init_from_model(model, ctx_params);
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
auto sparams = llama_sampler_chain_default_params();
sparams.no_perf = false;
@@ -123,7 +121,7 @@ int main(int argc, char ** argv) {
llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
decoder_start_token_id = llama_vocab_bos(vocab);
decoder_start_token_id = llama_token_bos(model);
}
common_batch_clear(batch);
@@ -176,7 +174,7 @@ int main(int argc, char ** argv) {
const llama_token new_token_id = llama_sampler_sample(smpl, ctx, i_batch[i]);
// is it an end of generation? -> mark the stream as finished
if (llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_predict) {
if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
i_batch[i] = -1;
LOG("\n");
if (n_parallel > 1) {

View File

@@ -911,7 +911,7 @@ int main(int argc, char ** argv) {
load_vocab(params.fn_vocab_model, &config, &vocab);
struct my_llama_model model;
model.hparams.n_vocab = config.vocab_size; //llama_vocab_n_vocab(lctx);
model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx);
model.hparams.n_ctx = params.n_ctx;
model.hparams.n_embd = config.dim; //params.n_embd;
model.hparams.n_ff = config.hidden_dim;

View File

@@ -273,9 +273,7 @@ struct tokenized_prompt {
size_t max_seq_len;
tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
const bool add_bos = llama_vocab_get_add_bos(vocab);
const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
tokens_pos = common_tokenize(ctx, pos, add_bos, true);
tokens_neg = common_tokenize(ctx, neg, add_bos, true);
max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
@@ -423,8 +421,8 @@ int main(int argc, char ** argv) {
llama_context * ctx = llama_init.context.get();
// int n_ctx = llama_n_ctx(ctx);
int n_layers = llama_model_n_layer(model);
int n_embd = llama_model_n_embd(model);
int n_layers = llama_n_layer(model);
int n_embd = llama_n_embd(model);
// get model hint param (a.k.a model arch name)
char model_hint[128];

View File

@@ -105,9 +105,7 @@ int main(int argc, char ** argv) {
return 1;
}
const llama_vocab * vocab = llama_model_get_vocab(model);
const int n_ctx_train = llama_model_n_ctx_train(model);
const int n_ctx_train = llama_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx);
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
@@ -150,7 +148,7 @@ int main(int argc, char ** argv) {
// check if the last token is SEP
// it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true'
for (auto & inp : inputs) {
if (inp.empty() || inp.back() != llama_vocab_sep(vocab)) {
if (inp.empty() || inp.back() != llama_token_sep(model)) {
LOG_WRN("%s: last token in the prompt is not SEP\n", __func__);
LOG_WRN("%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
}
@@ -183,7 +181,7 @@ int main(int argc, char ** argv) {
}
// allocate output
const int n_embd = llama_model_n_embd(model);
const int n_embd = llama_n_embd(model);
std::vector<float> embeddings(n_embd_count * n_embd, 0);
float * emb = embeddings.data();

View File

@@ -127,10 +127,7 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
}
static bool run(llama_context * ctx, const common_params & params) {
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
const bool add_bos = llama_vocab_get_add_bos(vocab);
const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos);

View File

@@ -8,6 +8,7 @@
#include <map>
#include <vector>
#include <string>
#include <thread>
#include <fstream>
static bool g_verbose = false;
@@ -129,7 +130,7 @@ struct lora_merge_ctx {
lora_merge_ctx(
std::string & base_fname,
std::vector<common_adapter_lora_info> & lora_files,
std::vector<common_lora_adapter_info> & lora_files,
std::string & outfile,
int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) {
fout.exceptions(std::ofstream::failbit); // fail fast on write errors
@@ -345,18 +346,8 @@ struct lora_merge_ctx {
gf = ggml_new_graph(ctx0);
struct ggml_tensor * cur = inp_base;
for (size_t i = 0; i < adapters.size(); ++i) {
struct ggml_tensor * delta;
bool is_tok_embd = string_starts_with(name_base, "token_embd");
if (is_tok_embd) {
printf("%s : detected token embeddings tensor\n", __func__);
delta = ggml_mul_mat(ctx0,
ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32),
ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32));
} else {
delta = ggml_mul_mat(ctx0,
ggml_cont(ctx0, ggml_transpose(ctx0, ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32))),
ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32));
}
struct ggml_tensor * a_T = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32)));
struct ggml_tensor * delta = ggml_mul_mat(ctx0, a_T, ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32));
// scale
const float alpha = adapters[i]->alpha;
const float rank = (float) inp_b[i]->ne[0];

View File

@@ -76,7 +76,7 @@ int main(int argc, char** argv) {
grammar_str = buffer.str();
}
llama_grammar * grammar = llama_grammar_init_impl(nullptr, grammar_str.c_str(), "root", false, nullptr, 0, nullptr, 0);
llama_grammar * grammar = llama_grammar_init_impl(nullptr, grammar_str.c_str(), "root");
if (grammar == nullptr) {
fprintf(stdout, "Failed to initialize llama_grammar\n");
return 1;

View File

@@ -41,7 +41,7 @@ echo PASS
echo
# 2b. Test the sharded model is loading properly
$MAIN -no-cnv --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --n-predict 32
$MAIN --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --n-predict 32
echo PASS
echo
@@ -51,7 +51,7 @@ echo PASS
echo
# 3b. Test the merged model is loading properly
$MAIN -no-cnv --model $WORK_PATH/ggml-model-merge.gguf --n-predict 32
$MAIN --model $WORK_PATH/ggml-model-merge.gguf --n-predict 32
echo PASS
echo
@@ -61,7 +61,7 @@ echo PASS
echo
# 4b. Test the sharded model is loading properly
$MAIN -no-cnv --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --n-predict 32
$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --n-predict 32
echo PASS
echo
@@ -71,7 +71,7 @@ echo
#echo
# 5b. Test the merged model is loading properly
#$MAIN -no-cnv --model $WORK_PATH/ggml-model-merge-2.gguf --n-predict 32
#$MAIN --model $WORK_PATH/ggml-model-merge-2.gguf --n-predict 32
#echo PASS
#echo
@@ -81,7 +81,7 @@ echo PASS
echo
# 6b. Test the sharded model is loading properly
$MAIN -no-cnv --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --n-predict 32
$MAIN --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --n-predict 32
echo PASS
echo

View File

@@ -11,7 +11,6 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
std::vector<std::vector<float>> result;
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1);
@@ -20,16 +19,16 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
const std::string input_string = instruction + sentences[i];
std::vector<llama_token> inputs = common_tokenize(vocab, input_string, true, false);
std::vector<llama_token> inputs = common_tokenize(model, input_string, true, false);
const int32_t n_toks = inputs.size();
// GritLM seems to have EOS = ""
// https://github.com/ContextualAI/gritlm/blob/92025b16534712b31b3c4aaaf069350e222bd5f8/gritlm/gritlm.py#L18
// inputs.push_back(llama_vocab_eos(vocab));
// inputs.push_back(llama_token_eos(model));
// we want to ignore instruction tokens for mean pooling
const int32_t n_inst = common_tokenize(vocab, instruction, true, false).size();
const int32_t n_inst = common_tokenize(model, instruction, true, false).size();
#ifdef GRIT_DEBUG
// debug tokens - should be matching as referenced in the GritLM sample
@@ -53,7 +52,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
llama_decode(ctx, batch);
// get embedding dimensions
uint64_t n_embd = llama_model_n_embd(model);
uint64_t n_embd = llama_n_embd(model);
// allocate embedding output
std::vector<float> emb_unorm(n_embd, 0.0f);
@@ -98,9 +97,7 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std
std::string result;
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
llama_token eos_token = llama_vocab_eos(vocab);
llama_token eos_token = llama_token_eos(model);
llama_kv_cache_clear(ctx);
llama_set_embeddings(ctx, false);
@@ -108,7 +105,7 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std
llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);
std::vector<llama_token> inputs = common_tokenize(vocab, prompt, false, true);
std::vector<llama_token> inputs = common_tokenize(model, prompt, false, true);
int32_t i_current_token = 0;
while (true) {
@@ -171,7 +168,7 @@ int main(int argc, char * argv[]) {
llama_model * model = llama_model_load_from_file(params.model.c_str(), mparams);
// create generation context
llama_context * ctx = llama_init_from_model(model, cparams);
llama_context * ctx = llama_new_context_with_model(model, cparams);
auto sparams = llama_sampler_chain_default_params();
@@ -200,7 +197,7 @@ int main(int argc, char * argv[]) {
const std::vector<std::vector<float>> d_rep = encode(ctx, documents, gritlm_instruction(""));
const std::vector<std::vector<float>> q_rep = encode(ctx, queries, gritlm_instruction(instruction));
const int n_embd = llama_model_n_embd(model);
const int n_embd = llama_n_embd(model);
const float cosine_sim_q0_d0 = common_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd);
const float cosine_sim_q0_d1 = common_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd);

View File

@@ -7,6 +7,7 @@
#include <cstdio>
#include <cstring>
#include <ctime>
#include <sstream>
#include <thread>
#include <mutex>
#include <vector>
@@ -39,7 +40,7 @@ public:
void set_params(common_params params) { m_params = std::move(params); }
bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
void save_imatrix(int ncall = -1) const;
bool load_imatrix(const char * fname);
bool load_imatrix(const char * file_name);
private:
std::unordered_map<std::string, Stats> m_stats;
common_params m_params;
@@ -428,13 +429,10 @@ static void process_logits(
}
static bool compute_imatrix(llama_context * ctx, const common_params & params) {
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
const bool add_bos = llama_vocab_get_add_bos(vocab);
const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
const int n_ctx = llama_n_ctx(ctx);
GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
auto tim1 = std::chrono::high_resolution_clock::now();
LOG_INF("%s: tokenizing the input ..\n", __func__);
@@ -470,7 +468,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
const int n_chunk_max = tokens.size() / n_ctx;
const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
const int n_vocab = llama_vocab_n_tokens(vocab);
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const int n_batch = params.n_batch;
int count = 0;
@@ -510,7 +508,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
// add BOS token for the first batch of each chunk
if (add_bos && j == 0) {
tokens[batch_start] = llama_vocab_bos(vocab);
tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
}
common_batch_clear(batch);
@@ -629,7 +627,7 @@ int main(int argc, char ** argv) {
return 1;
}
const int n_ctx_train = llama_model_n_ctx_train(model);
const int n_ctx_train = llama_n_ctx_train(model);
if (params.n_ctx > n_ctx_train) {
LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, params.n_ctx);

View File

@@ -139,9 +139,7 @@ int main(int argc, char ** argv) {
return 1;
}
const llama_vocab * vocab = llama_model_get_vocab(model);
const int n_ctx_train = llama_model_n_ctx_train(model);
const int n_ctx_train = llama_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx);
LOG_DBG("n_ctx: %d\n", n_ctx);
@@ -154,28 +152,28 @@ int main(int argc, char ** argv) {
LOG_INF("\n");
LOG_INF("%s\n", common_params_get_system_info(params).c_str());
}
const bool add_bos = llama_vocab_get_add_bos(vocab);
GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
const bool add_bos = llama_add_bos_token(model);
GGML_ASSERT(!llama_add_eos_token(model));
std::vector<llama_token> embd_inp;
std::vector<llama_token> embd_end;
std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);
GGML_ASSERT(llama_vocab_fim_pre(vocab) >= 0);
GGML_ASSERT(llama_vocab_fim_suf(vocab) >= 0);
GGML_ASSERT(llama_token_fim_pre(model) >= 0);
GGML_ASSERT(llama_token_fim_suf(model) >= 0);
inp_pfx.insert(inp_pfx.begin(), llama_vocab_fim_pre(vocab));
inp_sfx.insert(inp_sfx.begin(), llama_vocab_fim_suf(vocab));
inp_pfx.insert(inp_pfx.begin(), llama_token_fim_pre(model));
inp_sfx.insert(inp_sfx.begin(), llama_token_fim_suf(model));
embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
embd_end = params.spm_infill ? inp_pfx : inp_sfx;
if (add_bos) {
embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab));
embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
}
embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
const llama_token middle_token = llama_vocab_fim_mid(vocab);
const llama_token middle_token = llama_token_fim_mid(model);
if (middle_token >= 0) {
embd_inp.push_back(middle_token);
}
@@ -187,7 +185,7 @@ int main(int argc, char ** argv) {
// Should not run without any tokens
if (embd_inp.empty()) {
embd_inp.push_back(llama_vocab_bos(vocab));
embd_inp.push_back(llama_token_bos(model));
LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
}
@@ -422,10 +420,10 @@ int main(int argc, char ** argv) {
// if not currently processing queued inputs;
if ((int) embd_inp.size() <= n_consumed) {
// deal with eot token in infill mode
if ((common_sampler_last(smpl) == llama_vocab_eot(vocab) || is_interacting) && params.interactive){
if ((common_sampler_last(smpl) == llama_token_eot(model) || is_interacting) && params.interactive){
if (is_interacting && !params.interactive_first) {
// print an eot token
LOG("%s", common_token_to_piece(ctx, llama_vocab_eot(vocab)).c_str());
LOG("%s", common_token_to_piece(ctx, llama_token_eot(model)).c_str());
}
LOG("\n");
console::set_display(console::user_input);
@@ -465,13 +463,13 @@ int main(int argc, char ** argv) {
std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);
inp_pfx.insert(inp_pfx.begin(), llama_vocab_fim_pre(vocab));
inp_sfx.insert(inp_sfx.begin(), llama_vocab_fim_suf(vocab));
inp_pfx.insert(inp_pfx.begin(), llama_token_fim_pre(model));
inp_sfx.insert(inp_sfx.begin(), llama_token_fim_suf(model));
embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
embd_end = params.spm_infill ? inp_pfx : inp_sfx;
if (add_bos) {
embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab));
embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
}
embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
@@ -486,7 +484,7 @@ int main(int argc, char ** argv) {
is_interacting = false;
}
// deal with end of generation tokens in interactive mode
else if (llama_vocab_is_eog(vocab, common_sampler_last(smpl))) {
else if (llama_token_is_eog(model, common_sampler_last(smpl))) {
LOG_DBG("found EOS token\n");
if (params.interactive) {
@@ -502,7 +500,7 @@ int main(int argc, char ** argv) {
if (params.input_prefix_bos) {
LOG_DBG("adding input prefix BOS token\n");
embd_inp.push_back(llama_vocab_bos(vocab));
embd_inp.push_back(llama_token_bos(model));
}
std::string buffer;
@@ -565,7 +563,7 @@ int main(int argc, char ** argv) {
}
// end of generation
if (!embd.empty() && llama_vocab_is_eog(vocab, embd.back()) && !params.interactive) {
if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !params.interactive) {
break;
}
@@ -577,7 +575,7 @@ int main(int argc, char ** argv) {
}
}
if (!params.interactive && n_remain <= 0) {
LOG("%s", common_token_to_piece(ctx, llama_vocab_eot(vocab)).c_str());
LOG("%s", common_token_to_piece(ctx, llama_token_eot(model)).c_str());
}
LOG("\n");

View File

@@ -683,7 +683,7 @@ struct cmd_params_instance {
bool cpu_strict;
int poll;
int n_gpu_layers;
std::string rpc_servers_str;
std::string rpc_servers;
llama_split_mode split_mode;
int main_gpu;
bool no_kv_offload;
@@ -696,37 +696,8 @@ struct cmd_params_instance {
llama_model_params mparams = llama_model_default_params();
mparams.n_gpu_layers = n_gpu_layers;
if (!rpc_servers_str.empty()) {
auto rpc_servers = string_split<std::string>(rpc_servers_str, ',');
// add RPC devices
if (!rpc_servers.empty()) {
ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
if (!rpc_reg) {
fprintf(stderr, "%s: failed to find RPC backend\n", __func__);
exit(1);
}
typedef ggml_backend_dev_t (*ggml_backend_rpc_add_device_t)(const char * endpoint);
ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
if (!ggml_backend_rpc_add_device_fn) {
fprintf(stderr, "%s: failed to find RPC device add function\n", __func__);
exit(1);
}
static std::vector<ggml_backend_dev_t> devices;
devices.clear();
for (const std::string & server : rpc_servers) {
ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
if (dev) {
devices.push_back(dev);
} else {
fprintf(stderr, "%s: failed to add RPC device for server '%s'\n", __func__, server.c_str());
exit(1);
}
}
devices.push_back(nullptr);
mparams.devices = devices.data();
}
if (!rpc_servers.empty()) {
mparams.rpc_servers = rpc_servers.c_str();
}
mparams.split_mode = split_mode;
mparams.main_gpu = main_gpu;
@@ -737,7 +708,7 @@ struct cmd_params_instance {
}
bool equal_mparams(const cmd_params_instance & other) const {
return model == other.model && n_gpu_layers == other.n_gpu_layers && rpc_servers_str == other.rpc_servers_str &&
return model == other.model && n_gpu_layers == other.n_gpu_layers && rpc_servers == other.rpc_servers &&
split_mode == other.split_mode && main_gpu == other.main_gpu && use_mmap == other.use_mmap &&
tensor_split == other.tensor_split;
}
@@ -1430,8 +1401,7 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_th
llama_set_n_threads(ctx, n_threads, n_threads);
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
const int32_t n_vocab = llama_vocab_n_tokens(vocab);
const int32_t n_vocab = llama_n_vocab(model);
std::vector<llama_token> tokens(n_batch);
@@ -1439,7 +1409,7 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_th
while (n_processed < n_prompt) {
int n_tokens = std::min(n_prompt - n_processed, n_batch);
tokens[0] = n_processed == 0 && llama_vocab_get_add_bos(vocab) ? llama_vocab_bos(vocab) : std::rand() % n_vocab;
tokens[0] = n_processed == 0 && llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab;
for (int i = 1; i < n_tokens; i++) {
tokens[i] = std::rand() % n_vocab;
}
@@ -1454,10 +1424,9 @@ static void test_gen(llama_context * ctx, int n_gen, int n_threads) {
llama_set_n_threads(ctx, n_threads, n_threads);
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
const int32_t n_vocab = llama_vocab_n_tokens(vocab);
const int32_t n_vocab = llama_n_vocab(model);
llama_token token = llama_vocab_get_add_bos(vocab) ? llama_vocab_bos(vocab) : std::rand() % n_vocab;
llama_token token = llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab;
for (int i = 0; i < n_gen; i++) {
llama_decode(ctx, llama_batch_get_one(&token, 1));
@@ -1568,7 +1537,7 @@ int main(int argc, char ** argv) {
prev_inst = &inst;
}
llama_context * ctx = llama_init_from_model(lmodel, inst.to_llama_cparams());
llama_context * ctx = llama_new_context_with_model(lmodel, inst.to_llama_cparams());
if (ctx == NULL) {
fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, inst.model.c_str());
llama_model_free(lmodel);

View File

@@ -87,7 +87,7 @@ Java_android_llama_cpp_LLamaAndroid_load_1model(JNIEnv *env, jobject, jstring fi
auto path_to_model = env->GetStringUTFChars(filename, 0);
LOGi("Loading model from %s", path_to_model);
auto model = llama_model_load_from_file(path_to_model, model_params);
auto model = llama_load_model_from_file(path_to_model, model_params);
env->ReleaseStringUTFChars(filename, path_to_model);
if (!model) {
@@ -102,7 +102,7 @@ Java_android_llama_cpp_LLamaAndroid_load_1model(JNIEnv *env, jobject, jstring fi
extern "C"
JNIEXPORT void JNICALL
Java_android_llama_cpp_LLamaAndroid_free_1model(JNIEnv *, jobject, jlong model) {
llama_model_free(reinterpret_cast<llama_model *>(model));
llama_free_model(reinterpret_cast<llama_model *>(model));
}
extern "C"
@@ -347,7 +347,6 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init(
jlong context_pointer,
jlong batch_pointer,
jstring jtext,
jboolean format_chat,
jint n_len
) {
@@ -357,8 +356,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init(
const auto context = reinterpret_cast<llama_context *>(context_pointer);
const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
bool parse_special = (format_chat == JNI_TRUE);
const auto tokens_list = common_tokenize(context, text, true, parse_special);
const auto tokens_list = common_tokenize(context, text, 1);
auto n_ctx = llama_n_ctx(context);
auto n_kv_req = tokens_list.size() + (n_len - tokens_list.size());
@@ -370,7 +368,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init(
}
for (auto id : tokens_list) {
LOGi("token: `%s`-> %d ", common_token_to_piece(context, id).c_str(), id);
LOGi("%s", common_token_to_piece(context, id).c_str());
}
common_batch_clear(*batch);
@@ -407,7 +405,6 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
const auto sampler = reinterpret_cast<llama_sampler *>(sampler_pointer);
const auto model = llama_get_model(context);
const auto vocab = llama_model_get_vocab(model);
if (!la_int_var) la_int_var = env->GetObjectClass(intvar_ncur);
if (!la_int_var_value) la_int_var_value = env->GetMethodID(la_int_var, "getValue", "()I");
@@ -417,7 +414,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
const auto new_token_id = llama_sampler_sample(sampler, context, -1);
const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
if (llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_len) {
if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
return nullptr;
}

View File

@@ -65,7 +65,6 @@ class LLamaAndroid {
context: Long,
batch: Long,
text: String,
formatChat: Boolean,
nLen: Int
): Int
@@ -116,10 +115,10 @@ class LLamaAndroid {
}
}
fun send(message: String, formatChat: Boolean = false): Flow<String> = flow {
fun send(message: String): Flow<String> = flow {
when (val state = threadLocalState.get()) {
is State.Loaded -> {
val ncur = IntVar(completion_init(state.context, state.batch, message, formatChat, nlen))
val ncur = IntVar(completion_init(state.context, state.batch, message, nlen))
while (ncur.value <= nlen) {
val str = completion_loop(state.context, state.batch, state.sampler, nlen, ncur)
if (str == null) {

View File

@@ -52,8 +52,8 @@ actor LlamaContext {
deinit {
llama_sampler_free(sampling)
llama_batch_free(batch)
llama_model_free(model)
llama_free(context)
llama_free_model(model)
llama_backend_free()
}
@@ -65,7 +65,7 @@ actor LlamaContext {
model_params.n_gpu_layers = 0
print("Running on simulator, force use n_gpu_layers = 0")
#endif
let model = llama_model_load_from_file(path, model_params)
let model = llama_load_model_from_file(path, model_params)
guard let model else {
print("Could not load model at \(path)")
throw LlamaError.couldNotInitializeContext
@@ -151,7 +151,7 @@ actor LlamaContext {
new_token_id = llama_sampler_sample(sampling, context, batch.n_tokens - 1)
if llama_vocab_is_eog(model, new_token_id) || n_cur == n_len {
if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
print("\n")
is_done = true
let new_token_str = String(cString: temporary_invalid_cchars + [0])

View File

@@ -1,46 +0,0 @@
## MiniCPM-o 2.6
Currently, this readme covers only minicpm-omni's image capabilities; full omni-mode support will be added as soon as possible.
### Prepare models and code
Download the [MiniCPM-o-2_6](https://huggingface.co/openbmb/MiniCPM-o-2_6) PyTorch model from Hugging Face into a "MiniCPM-o-2_6" folder.
Clone llama.cpp:
```bash
git clone git@github.com:OpenBMB/llama.cpp.git
cd llama.cpp
git checkout minicpm-omni
```
### Usage of MiniCPM-o 2.6
Convert the PyTorch model to gguf files (you can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf) files provided by us)
```bash
python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-o-2_6
python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-o-2_6 --minicpmv-projector ../MiniCPM-o-2_6/minicpmv.projector --output-dir ../MiniCPM-o-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 4
python ./convert_hf_to_gguf.py ../MiniCPM-o-2_6/model
# quantize int4 version
./llama-quantize ../MiniCPM-o-2_6/model/ggml-model-f16.gguf ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M
```
Build llama.cpp using `CMake`:
https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md
```bash
cmake -B build
cmake --build build --config Release
```
Inference on Linux or Mac
```
# run f16 version
./llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
# run quantized int4 version
./llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
# or run in interactive mode
./llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i
```

View File

@@ -718,9 +718,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
else if (ctx->minicpmv_version == 3) {
pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
}
else if (ctx->minicpmv_version == 4) {
pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
}
ggml_set_name(pos_embed, "pos_embed");
ggml_set_input(pos_embed);
}
@@ -1056,11 +1053,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
n_head = hidden_size/d_head;
num_query = 64;
}
else if (ctx->minicpmv_version == 4) {
hidden_size = 3584;
n_head = hidden_size/d_head;
num_query = 64;
}
struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b);
Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
@@ -2049,7 +2041,6 @@ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_imag
images[images.size()-1].push_back(patch);
}
}
clip_image_u8_free(refine_image);
}
return images;
}
@@ -2088,13 +2079,6 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
clip_image_f32_free(res);
}
}
for (size_t i = 0; i < imgs.size(); ++i) {
for (size_t j = 0; j < imgs[i].size(); ++j) {
if (imgs[i][j] != nullptr) {
clip_image_u8_free(imgs[i][j]);
}
}
}
return true;
}
else if (ctx->has_qwen2vl_merger) {
@@ -2351,9 +2335,6 @@ int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * i
else if (ctx->minicpmv_version == 3) {
n_patches = 64;
}
else if (ctx->minicpmv_version == 4) {
n_patches = 64;
}
} else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
int patch_size = params.patch_size * 2;
int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);
@@ -2533,8 +2514,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
// -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
int* positions_data = (int*)malloc(ggml_nbytes(positions));
int bucket_coords_h[1024];
int bucket_coords_w[1024];
int bucket_coords_h[70];
int bucket_coords_w[70];
for (int i = 0; i < pos_h; i++){
bucket_coords_h[i] = std::floor(70.0*i/pos_h);
}
@@ -2562,9 +2543,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
else if (ctx->minicpmv_version == 3) {
embed_dim = 3584;
}
else if (ctx->minicpmv_version == 4) {
embed_dim = 3584;
}
auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed));
@@ -2808,9 +2786,6 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
else if (ctx->minicpmv_version == 3) {
return 3584;
}
else if (ctx->minicpmv_version == 4) {
return 3584;
}
}
if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
return ctx->vision_model.mm_1_b->ne[0];

View File

@@ -47,12 +47,8 @@ static const char * sample(struct common_sampler * smpl,
int * n_past) {
const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
common_sampler_accept(smpl, id, true);
const llama_model * model = llama_get_model(ctx_llama);
const llama_vocab * vocab = llama_model_get_vocab(model);
static std::string ret;
if (llama_vocab_is_eog(vocab, id)) {
if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
ret = "</s>";
} else {
ret = common_token_to_piece(ctx_llama, id);
@@ -243,10 +239,11 @@ static struct llava_context * llava_init_context(common_params * params, llama_m
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
llama_context_params ctx_params = common_context_params_to_llama(*params);
ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
llama_context * ctx_llama = llama_init_from_model(model, ctx_params);
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
if (ctx_llama == NULL) {
LOG_ERR("%s: failed to create the llama_context\n" , __func__);

View File

@@ -216,7 +216,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
return true;
}
static clip_image_f32 * reshape_by_patch(clip_image_f32 * image, int patch_size) {
static clip_image_f32 * only_v2_5_reshape_by_patch(clip_image_f32 * image, int patch_size) {
int width = image->nx;
int height = image->ny;
int num_patches = (height / patch_size) * (width / patch_size);
@@ -277,7 +277,13 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
}
else {
encoded = clip_image_encode(ctx_clip, n_threads, reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
if (has_minicpmv_projector == 2) {
encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
}
else if (has_minicpmv_projector == 3) {
encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
}
}
if (!encoded) {
@@ -307,9 +313,6 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
load_image_size->height = img->ny;
clip_add_load_image_size(ctx_clip, load_image_size);
LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
delete[] img_res_v.data;
img_res_v.size = 0;
img_res_v.data = nullptr;
}
else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
// flat / default llava-1.5 type embedding
@@ -381,7 +384,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip) {
// make sure that the correct mmproj was used, i.e., compare apples to apples
int n_llama_embd = llama_model_n_embd(llama_get_model(ctx_llama));
int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
auto n_image_embd = clip_n_mmproj_embd(ctx_clip);
if (n_image_embd != n_llama_embd) {
LOG_ERR("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
@@ -453,7 +456,7 @@ struct llava_embd_batch {
};
bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed, int n_batch, int * n_past) {
int n_embd = llama_model_n_embd(llama_get_model(ctx_llama));
int n_embd = llama_n_embd(llama_get_model(ctx_llama));
for (int i = 0; i < image_embed->n_image_pos; i += n_batch) {
int n_eval = image_embed->n_image_pos - i;

View File

@@ -54,7 +54,7 @@ static struct llava_context * llava_init_context(common_params * params, llama_m
ctx_params.n_ctx = params->n_ctx;
}
llama_context * ctx_llama = llama_init_from_model(model, ctx_params);
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
if (ctx_llama == NULL) {
LOG_ERR("%s: failed to create the llama_context\n" , __func__);
@@ -140,9 +140,6 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e
else if (has_minicpmv_projector == 3) {
system_prompt = "<|im_start|>user\n";
}
else if (has_minicpmv_projector == 4) {
system_prompt = "<|im_start|>user\n";
}
LOG_INF("%s: image token past: %d\n", __func__, n_past);
eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, false);
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
@@ -170,12 +167,8 @@ static const char * sample(struct common_sampler * smpl,
int * n_past) {
const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
common_sampler_accept(smpl, id, true);
const llama_model * model = llama_get_model(ctx_llama);
const llama_vocab * vocab = llama_model_get_vocab(model);
static std::string ret;
if (llama_vocab_is_eog(vocab, id)) {
if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
ret = "</s>";
} else {
ret = common_token_to_piece(ctx_llama, id);
@@ -230,9 +223,6 @@ static struct common_sampler * llama_init(struct llava_context * ctx_llava, comm
else if (has_minicpmv_projector == 3) {
user_prompt = "<|im_start|>user\n" + prompt;
}
else if (has_minicpmv_projector == 4) {
user_prompt = "<|im_start|>user\n" + prompt;
}
}
eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
@@ -242,9 +232,6 @@ static struct common_sampler * llama_init(struct llava_context * ctx_llava, comm
else if (has_minicpmv_projector == 3) {
eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false);
}
else if (has_minicpmv_projector == 4) {
eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false);
}
// generate the response
@@ -317,6 +304,7 @@ int main(int argc, char ** argv) {
const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
response += tmp;
if (strcmp(tmp, "</s>") == 0) break;
if (strstr(tmp, "###")) break; // Yi-VL behavior
printf("%s", tmp);// mistral llava-1.6
if (strstr(response.c_str(), "<user>")) break; // minicpm-v
fflush(stdout);

View File

@@ -501,7 +501,7 @@ default_image_mean = [0.48145466, 0.4578275, 0.40821073]
default_image_std = [0.26862954, 0.26130258, 0.27577711]
ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None)
ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)
ap.add_argument('--minicpmv_version', type=int, help='minicpmv_version: MiniCPM-V-2 use 1; MiniCPM-V-2.5 use 2; MiniCPM-V-2.6 use 3; MiniCPM-o-2.6 use 4', default=2)
ap.add_argument('--minicpmv_version', type=int, help='minicpmv_version: MiniCPM-V-2 use 1; MiniCPM-V-2.5 use 2; MiniCPM-V-2.6 use 3', default=2)
# with proper
args = ap.parse_args()
@@ -545,19 +545,12 @@ if args.use_f32:
minicpmv_version = args.minicpmv_version
emb_dim = 4096
block_count = 26
if minicpmv_version == 1:
emb_dim = 2304
block_count = 26
elif minicpmv_version == 2:
emb_dim = 4096
block_count = 27
elif minicpmv_version == 3:
emb_dim = 3584
block_count = 27
elif minicpmv_version == 4:
emb_dim = 3584
block_count = 27
default_vision_config = {
"hidden_size": 1152,
@@ -574,9 +567,6 @@ model = Idefics2VisionTransformer(vision_config)
if minicpmv_version == 3:
vision_config = SiglipVisionConfig(**default_vision_config)
model = SiglipVisionTransformer(vision_config)
elif minicpmv_version == 4:
vision_config = SiglipVisionConfig(**default_vision_config)
model = SiglipVisionTransformer(vision_config)
processor = None
# if model.attn_pool is not None:
@@ -597,7 +587,7 @@ elif args.minicpmv_projector is not None:
fname_middle = "mmproj-"
has_text_encoder = False
has_minicpmv_projector = True
minicpmv_version = 4
minicpmv_version = 3
elif args.vision_only:
fname_middle = "vision-"
has_text_encoder = False
@@ -635,6 +625,7 @@ if has_vision_encoder:
fout.add_uint32("clip.vision.projection_dim", 0)
fout.add_uint32(add_key_str(KEY_ATTENTION_HEAD_COUNT, VISION), 16)
fout.add_float32(add_key_str(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
block_count = 26
fout.add_uint32(add_key_str(KEY_BLOCK_COUNT, VISION), block_count)
if processor is not None:

View File

@@ -8,7 +8,7 @@ ap.add_argument("-m", "--model", help="Path to MiniCPM-V model")
args = ap.parse_args()
# find the model part that includes the multimodal projector weights
model = AutoModel.from_pretrained(args.model, trust_remote_code=True, local_files_only=True, torch_dtype=torch.bfloat16)
model = AutoModel.from_pretrained(args.model, trust_remote_code=True, local_files_only=True)
checkpoint = model.state_dict()
# get a list of mm tensor names

View File

@@ -27,7 +27,7 @@
static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed,
int n_batch, int * n_past, int * st_pos_id, struct clip_image_size * image_size) {
int n_embd = llama_model_n_embd(llama_get_model(ctx_llama));
int n_embd = llama_n_embd(llama_get_model(ctx_llama));
const int patch_size = 14 * 2;
const int ph = image_size->height / patch_size + (image_size->height % patch_size > 0);
const int pw = image_size->width / patch_size + (image_size->width % patch_size > 0);
@@ -132,12 +132,8 @@ static const char * sample(struct common_sampler * smpl,
int * n_past, int * st_pos_id) {
const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
common_sampler_accept(smpl, id, true);
const llama_model * model = llama_get_model(ctx_llama);
const llama_vocab * vocab = llama_model_get_vocab(model);
static std::string ret;
if (llama_vocab_is_eog(vocab, id)) {
if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
ret = "</s>";
} else {
ret = common_token_to_piece(ctx_llama, id);
@@ -332,10 +328,11 @@ static struct llava_context * llava_init_context(common_params * params, llama_m
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
llama_context_params ctx_params = common_context_params_to_llama(*params);
ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
llama_context * ctx_llama = llama_init_from_model(model, ctx_params);
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
if (ctx_llama == NULL) {
LOG_ERR("%s: failed to create the llama_context\n" , __func__);
@@ -484,7 +481,7 @@ static void debug_test_mrope_2d() {
}
static void debug_dump_img_embed(struct llava_context * ctx_llava) {
int n_embd = llama_model_n_embd(llama_get_model(ctx_llava->ctx_llama));
int n_embd = llama_n_embd(llama_get_model(ctx_llava->ctx_llama));
int ne = n_embd * 4;
float vals[56 * 56 * 3];
// float embd[ne];

View File

@@ -61,8 +61,6 @@ int main(int argc, char ** argv) {
llama_model * model = llama_init.model.get();
llama_context * ctx = llama_init.context.get();
const llama_vocab * vocab = llama_model_get_vocab(model);
// Tokenize the prompt
std::vector<llama_token> inp;
std::vector<llama_token> all;
@@ -149,7 +147,7 @@ int main(int argc, char ** argv) {
}
// here we keep adding new n-grams as we go
ngram_container ngrams_observed(llama_vocab_n_tokens(vocab), N, G);
ngram_container ngrams_observed(llama_n_vocab(model), N, G);
// debug
struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, W + G + 1);
@@ -299,7 +297,7 @@ int main(int argc, char ** argv) {
}
fflush(stdout);
if (llama_vocab_is_eog(vocab, id)) {
if (llama_token_is_eog(model, id)) {
has_eos = true;
}

View File

@@ -36,8 +36,6 @@ int main(int argc, char ** argv){
llama_model * model = llama_init.model.get();
llama_context * ctx = llama_init.context.get();
const llama_vocab * vocab = llama_model_get_vocab(model);
// tokenize the prompt
std::vector<llama_token> inp;
inp = common_tokenize(ctx, params.prompt, true, true);
@@ -138,7 +136,7 @@ int main(int argc, char ** argv){
LOG("%s", token_str.c_str());
}
if (llama_vocab_is_eog(vocab, id)) {
if (llama_token_is_eog(model, id)) {
has_eos = true;
}

View File

@@ -0,0 +1,32 @@
cmake_minimum_required(VERSION 3.12)
project("llama-cli-cmake-pkg" C CXX)
set(TARGET llama-cli-cmake-pkg)
find_package(Llama 0.0.1 REQUIRED)
# Bake common functionality in with target. Because applications
# using the relocatable Llama package should be outside of the
# source tree, llama-cli-cmake-pkg pretends the dependencies are built-in.
set(_common_path "${CMAKE_CURRENT_LIST_DIR}/../../common")
add_library(common OBJECT)
file(GLOB _common_files
"${_common_path}/*.h"
"${_common_path}/*.cpp"
)
target_sources(common PRIVATE ${_common_files})
# If the common project was part of "llama-cli-cmake-pkg" the transient
# defines would automatically be attached. Because the common func-
# tionality is separate, but dependent upon the defines, it must be
# explicitly extracted from the "llama" target.
#
get_target_property(_llama_transient_defines llama
INTERFACE_COMPILE_DEFINITIONS)
target_compile_definitions(common PRIVATE "${_llama_transient_defines}")
add_executable(${TARGET} ${CMAKE_CURRENT_LIST_DIR}/../main/main.cpp)
target_include_directories(${TARGET} PRIVATE ${_common_path})
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@@ -0,0 +1,31 @@
# llama.cpp/example/main-cmake-pkg
This program builds [llama-cli](../main) using a relocatable CMake package. It serves as an example of using the `find_package()` CMake command to conveniently include [llama.cpp](https://github.com/ggerganov/llama.cpp) in projects which live outside of the source tree.
## Building
Because this example is "outside of the source tree", it is important to first build/install llama.cpp using CMake. An example is provided here, but please see the [llama.cpp build instructions](../..) for more detail.
### Considerations
When hardware acceleration libraries are used (e.g. CUDA, Metal, etc.), CMake must be able to locate the associated CMake package.
### Build llama.cpp and install to C:\LlamaCPP directory
```cmd
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
cmake -B build -DBUILD_SHARED_LIBS=OFF -G "Visual Studio 17 2022" -A x64
cmake --build build --config Release
cmake --install build --prefix C:/LlamaCPP
```
### Build llama-cli-cmake-pkg
```cmd
cd ..\examples\main-cmake-pkg
cmake -B build -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="C:/LlamaCPP/lib/cmake/Llama" -G "Visual Studio 17 2022" -A x64
cmake --build build --config Release
cmake --install build --prefix C:/MyLlamaApp
```

View File

@@ -310,9 +310,9 @@ These options help improve the performance and memory usage of the LLaMA models.
### Batch Size
- `-ub N`, `--ubatch-size N`: Physical batch size. This is the maximum number of tokens that may be processed at a time. Increasing this value may improve performance during prompt processing, at the expense of higher memory usage. Default: `512`.
- `-b N, --batch-size N`: Set the batch size for prompt processing (default: `2048`). This large batch size benefits users who have BLAS installed and enabled it during the build. If you don't have BLAS enabled ("BLAS=0"), you can use a smaller number, such as 8, to see the prompt progress as it's evaluated in some situations.
- `-b N`, `--batch-size N`: Logical batch size. Increasing this value above the value of the physical batch size may improve prompt processing performance when using multiple GPUs with pipeline parallelism. Default: `2048`.
- `-ub N`, `--ubatch-size N`: physical maximum batch size. This is for pipeline parallelization. Default: `512`.
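As a quick illustration of how the logical and physical batch sizes combine, a minimal sketch of a CLI invocation (model path and prompt are placeholders):
```bash
# Process the prompt in logical batches of 2048 tokens, split into physical chunks of 512.
./llama-cli -m models/7B/ggml-model-q4_0.gguf \
  -p "Building a website can be done in 10 simple steps:" \
  -b 2048 -ub 512 -n 128
```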
### Prompt Caching

View File

@@ -4,8 +4,8 @@
#include "log.h"
#include "sampling.h"
#include "llama.h"
#include "chat-template.hpp"
#include <cassert>
#include <cstdio>
#include <cstring>
#include <ctime>
@@ -31,8 +31,6 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
static const char * DEFAULT_SYSTEM_MESSAGE = "You are a helpful assistant";
static llama_context ** g_ctx;
static llama_model ** g_model;
static common_sampler ** g_smpl;
@@ -85,6 +83,14 @@ static void sigint_handler(int signo) {
}
#endif
static std::string chat_add_and_format(struct llama_model * model, std::vector<common_chat_msg> & chat_msgs, const std::string & role, const std::string & content) {
common_chat_msg new_msg{role, content};
auto formatted = common_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, role == "user");
chat_msgs.push_back({role, content});
LOG_DBG("formatted: '%s'\n", formatted.c_str());
return formatted;
}
int main(int argc, char ** argv) {
common_params params;
g_params = &params;
@@ -157,9 +163,6 @@ int main(int argc, char ** argv) {
return 1;
}
const llama_vocab * vocab = llama_model_get_vocab(model);
auto chat_templates = common_chat_templates_from_model(model, params.chat_template);
LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);
auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
@@ -193,33 +196,17 @@ int main(int argc, char ** argv) {
llama_attach_threadpool(ctx, threadpool, threadpool_batch);
const int n_ctx_train = llama_model_n_ctx_train(model);
const int n_ctx_train = llama_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx);
if (n_ctx > n_ctx_train) {
LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
}
// auto enable conversation mode if chat template is available
const bool has_chat_template = chat_templates.has_explicit_template && chat_templates.template_default;
if (params.conversation_mode == COMMON_CONVERSATION_MODE_AUTO) {
if (has_chat_template) {
LOG_INF("%s: chat template is available, enabling conversation mode (disable it with -no-cnv)\n", __func__);
params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED;
} else {
params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
}
}
// in case user force-activate conversation mode (via -cnv) without proper chat template, we show a warning
if (params.conversation_mode && !has_chat_template) {
LOG_WRN("%s: chat template is not available or is not supported. This may cause the model to output suboptimal responses\n", __func__);
}
// print chat template example in conversation mode
if (params.conversation_mode) {
if (params.conversation) {
if (params.enable_chat_template) {
LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(*chat_templates.template_default, params.use_jinja).c_str());
LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(model, params.chat_template).c_str());
} else {
LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
}
@@ -254,28 +241,18 @@ int main(int argc, char ** argv) {
}
}
const bool add_bos = llama_vocab_get_add_bos(vocab) && !params.use_jinja;
const bool add_bos = llama_add_bos_token(model);
if (!llama_model_has_encoder(model)) {
GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
GGML_ASSERT(!llama_add_eos_token(model));
}
LOG_DBG("n_ctx: %d, add_bos: %d\n", n_ctx, add_bos);
std::vector<llama_token> embd_inp;
auto chat_add_and_format = [&chat_msgs, &chat_templates](const std::string & role, const std::string & content) {
common_chat_msg new_msg{role, content, {}};
auto formatted = common_chat_format_single(*chat_templates.template_default, chat_msgs, new_msg, role == "user", g_params->use_jinja);
chat_msgs.push_back({role, content, {}});
LOG_DBG("formatted: '%s'\n", formatted.c_str());
return formatted;
};
{
auto prompt = (params.conversation_mode && params.enable_chat_template)
// format the system prompt in conversation mode (fallback to default if empty)
? chat_add_and_format("system", params.prompt.empty() ? DEFAULT_SYSTEM_MESSAGE : params.prompt)
// otherwise use the prompt as is
auto prompt = (params.conversation && params.enable_chat_template && !params.prompt.empty())
? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
: params.prompt;
if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
LOG_DBG("tokenize the prompt\n");
@@ -292,7 +269,7 @@ int main(int argc, char ** argv) {
// Should not run without any tokens
if (embd_inp.empty()) {
if (add_bos) {
embd_inp.push_back(llama_vocab_bos(vocab));
embd_inp.push_back(llama_token_bos(model));
LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
} else {
LOG_ERR("input is empty\n");
@@ -349,7 +326,7 @@ int main(int argc, char ** argv) {
params.n_keep += add_bos; // always keep the BOS token
}
if (params.conversation_mode) {
if (params.conversation) {
params.interactive_first = true;
}
@@ -473,11 +450,7 @@ int main(int argc, char ** argv) {
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
LOG_INF( " - Press Ctrl+C to interject at any time.\n");
#endif
LOG_INF( "%s", control_message);
if (params.conversation_mode && params.enable_chat_template && params.prompt.empty()) {
LOG_INF( " - Using default system message. To change it, set a different value via -p PROMPT or -f FILE argument.\n");
}
LOG_INF("\n");
LOG_INF( "%s\n", control_message);
is_interacting = params.interactive_first;
}
@@ -503,14 +476,12 @@ int main(int argc, char ** argv) {
std::vector<llama_token> embd;
// single-token antiprompts
std::vector<llama_token> antiprompt_token;
// tokenized antiprompts
std::vector<std::vector<llama_token>> antiprompt_ids;
antiprompt_ids.reserve(params.antiprompt.size());
for (const std::string & antiprompt : params.antiprompt) {
auto ids = ::common_tokenize(ctx, antiprompt, false, true);
if (ids.size() == 1) {
antiprompt_token.push_back(ids[0]);
}
antiprompt_ids.emplace_back(::common_tokenize(ctx, antiprompt, false, true));
}
if (llama_model_has_encoder(model)) {
@@ -524,7 +495,7 @@ int main(int argc, char ** argv) {
llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
decoder_start_token_id = llama_vocab_bos(vocab);
decoder_start_token_id = llama_token_bos(model);
}
embd_inp.clear();
@@ -755,11 +726,14 @@ int main(int argc, char ** argv) {
// check for reverse prompt using special tokens
llama_token last_token = common_sampler_last(smpl);
if (std::find(antiprompt_token.begin(), antiprompt_token.end(), last_token) != antiprompt_token.end()) {
if (params.interactive) {
is_interacting = true;
for (std::vector<llama_token> ids : antiprompt_ids) {
if (ids.size() == 1 && last_token == ids[0]) {
if (params.interactive) {
is_interacting = true;
}
is_antiprompt = true;
break;
}
is_antiprompt = true;
}
if (is_antiprompt) {
@@ -768,7 +742,7 @@ int main(int argc, char ** argv) {
}
// deal with end of generation tokens in interactive mode
if (llama_vocab_is_eog(vocab, common_sampler_last(smpl))) {
if (llama_token_is_eog(model, common_sampler_last(smpl))) {
LOG_DBG("found an EOG token\n");
if (params.interactive) {
@@ -780,7 +754,7 @@ int main(int argc, char ** argv) {
}
if (params.enable_chat_template) {
chat_add_and_format("assistant", assistant_ss.str());
chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str());
}
is_interacting = true;
LOG("\n");
@@ -788,7 +762,7 @@ int main(int argc, char ** argv) {
}
// if current token is not EOG, we add it to current assistant message
if (params.conversation_mode) {
if (params.conversation) {
const auto id = common_sampler_last(smpl);
assistant_ss << common_token_to_piece(ctx, id, false);
}
@@ -796,17 +770,17 @@ int main(int argc, char ** argv) {
if (n_past > 0 && is_interacting) {
LOG_DBG("waiting for user input\n");
if (params.conversation_mode) {
if (params.conversation) {
LOG("\n> ");
}
if (params.input_prefix_bos) {
LOG_DBG("adding input prefix BOS token\n");
embd_inp.push_back(llama_vocab_bos(vocab));
embd_inp.push_back(llama_token_bos(model));
}
std::string buffer;
if (!params.input_prefix.empty() && !params.conversation_mode) {
if (!params.input_prefix.empty() && !params.conversation) {
LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
LOG("%s", params.input_prefix.c_str());
}
@@ -830,7 +804,7 @@ int main(int argc, char ** argv) {
// Entering an empty line lets the user pass control back
if (buffer.length() > 1) {
// append input suffix if any
if (!params.input_suffix.empty() && !params.conversation_mode) {
if (!params.input_suffix.empty() && !params.conversation) {
LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
LOG("%s", params.input_suffix.c_str());
}
@@ -843,9 +817,9 @@ int main(int argc, char ** argv) {
string_process_escapes(buffer);
}
bool format_chat = params.conversation_mode && params.enable_chat_template;
bool format_chat = params.conversation && params.enable_chat_template;
std::string user_inp = format_chat
? chat_add_and_format("user", std::move(buffer))
? chat_add_and_format(model, chat_msgs, "user", std::move(buffer))
: std::move(buffer);
// TODO: one inconvenience of the current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix)
const auto line_pfx = common_tokenize(ctx, params.input_prefix, false, true);
@@ -856,8 +830,8 @@ int main(int argc, char ** argv) {
// if user stop generation mid-way, we must add EOT to finish model's last response
if (need_insert_eot && format_chat) {
llama_token eot = llama_vocab_eot(vocab);
embd_inp.push_back(eot == LLAMA_TOKEN_NULL ? llama_vocab_eos(vocab) : eot);
llama_token eot = llama_token_eot(model);
embd_inp.push_back(eot == LLAMA_TOKEN_NULL ? llama_token_eos(model) : eot);
need_insert_eot = false;
}
@@ -892,7 +866,7 @@ int main(int argc, char ** argv) {
}
// end of generation
if (!embd.empty() && llama_vocab_is_eog(vocab, embd.back()) && !(params.interactive)) {
if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.interactive)) {
LOG(" [end of text]\n");
break;
}

View File

@@ -135,8 +135,6 @@ int main(int argc, char ** argv) {
llama_model * model = llama_init.model.get();
llama_context * ctx = llama_init.context.get();
const llama_vocab * vocab = llama_model_get_vocab(model);
// load the prompts from an external file if there are any
if (params.prompt.empty()) {
LOG_INF("\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");
@@ -360,7 +358,7 @@ int main(int argc, char ** argv) {
// client.id, client.seq_id, id, client.n_decoded, client.i_batch, token_str.c_str());
if (client.n_decoded > 2 &&
(llama_vocab_is_eog(vocab, id) ||
(llama_token_is_eog(model, id) ||
(params.n_predict > 0 && client.n_decoded + client.n_prompt >= params.n_predict) ||
client.response.find("User:") != std::string::npos ||
client.response.find('\n') != std::string::npos)) {

View File

@@ -70,17 +70,15 @@ int main(int argc, char ** argv) {
return 1;
}
const llama_vocab * vocab = llama_model_get_vocab(model);
// initialize the context
llama_context_params ctx_params = common_context_params_to_llama(params);
ctx_params.n_ctx = llama_model_n_ctx_train(model)*n_grp + n_keep;
ctx_params.n_ctx = llama_n_ctx_train(model)*n_grp + n_keep;
GGML_ASSERT(ctx_params.n_batch % n_grp == 0 && "n_batch must be divisible by n_grp");
llama_context * ctx = llama_init_from_model(model, ctx_params);
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
if (ctx == NULL) {
LOG_ERR("%s: failed to create the llama_context\n" , __func__);
return 1;
@@ -225,7 +223,7 @@ int main(int argc, char ** argv) {
const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1);
// is it an end of generation?
if (llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_len) {
if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
LOG("\n");
break;

View File

@@ -296,11 +296,8 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params
// Output: `perplexity: 13.5106 [114/114]`
// BOS tokens will be added for each chunk before eval
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
const bool add_bos = llama_vocab_get_add_bos(vocab);
GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
LOG_INF("%s: tokenizing the input ..\n", __func__);
@@ -341,7 +338,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params
const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
const int n_batch = params.n_batch;
const int n_vocab = llama_vocab_n_tokens(vocab);
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
int count = 0;
double nll = 0.0;
@@ -385,7 +382,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params
// add BOS token for the first batch of each chunk
if (add_bos && j == 0) {
tokens[batch_start] = llama_vocab_bos(vocab);
tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
}
const auto * batch_logits = llama_get_logits(ctx);
@@ -447,11 +444,8 @@ static results_perplexity perplexity(llama_context * ctx, const common_params &
// Output: `perplexity: 13.5106 [114/114]`
// BOS tokens will be added for each chunk before eval
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
const bool add_bos = llama_vocab_get_add_bos(vocab);
GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
std::ofstream logits_stream;
if (!params.logits_file.empty()) {
@@ -491,7 +485,7 @@ static results_perplexity perplexity(llama_context * ctx, const common_params &
const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
const int n_batch = params.n_batch;
const int n_vocab = llama_vocab_n_tokens(vocab);
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
int count = 0;
double nll = 0.0;
@@ -563,7 +557,7 @@ static results_perplexity perplexity(llama_context * ctx, const common_params &
// add BOS token for the first batch of each chunk
if (add_bos && j == 0) {
tokens[seq_start] = llama_vocab_bos(vocab);
tokens[seq_start] = llama_token_bos(llama_get_model(ctx));
}
for (int k = 0; k < batch_size; ++k) {
@@ -738,9 +732,6 @@ static void compute_logprobs(const float * batch_logits, int n_vocab, std::vecto
}
static void hellaswag_score(llama_context * ctx, const common_params & params) {
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
// Calculates hellaswag score (acc_norm) from prompt
//
// Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl
@@ -774,7 +765,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) {
size_t hs_task_count = prompt_lines.size()/6;
LOG_INF("%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);
const bool is_spm = llama_vocab_type(vocab) == LLAMA_VOCAB_TYPE_SPM;
const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
LOG_INF("================================= is_spm = %d\n", is_spm);
// The tasks should be randomized so the score stabilizes quickly.
@@ -857,7 +848,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) {
const int n_ctx = llama_n_ctx(ctx);
const int n_batch = params.n_batch;
const int n_vocab = llama_vocab_n_tokens(vocab);
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const int max_tasks_per_batch = 32;
const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx));
@@ -1081,8 +1072,6 @@ static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string
*
*/
static void winogrande_score(llama_context * ctx, const common_params & params) {
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
constexpr int k_min_trailing_ctx = 3;
@@ -1141,7 +1130,7 @@ static void winogrande_score(llama_context * ctx, const common_params & params)
const int n_ctx = llama_n_ctx(ctx);
const int n_batch = params.n_batch;
const int n_vocab = llama_vocab_n_tokens(vocab);
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const int max_tasks_per_batch = 128;
const int max_seq = std::min(2*max_tasks_per_batch, (int) llama_n_seq_max(ctx));
@@ -1385,8 +1374,6 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choic
// https://huggingface.co/datasets/truthful_qa
//
static void multiple_choice_score(llama_context * ctx, const common_params & params) {
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
std::istringstream strstream(params.prompt);
uint32_t n_task;
@@ -1495,7 +1482,7 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par
const int n_ctx = llama_n_ctx(ctx);
const int n_batch = params.n_batch;
const int n_vocab = llama_vocab_n_tokens(vocab);
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const int max_tasks_per_batch = 32;
const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx));
@@ -1668,9 +1655,6 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par
}
static void kl_divergence(llama_context * ctx, const common_params & params) {
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
if (params.logits_file.empty()) {
LOG_ERR("%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__);
return;
@@ -1704,8 +1688,8 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
LOG_ERR("%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str());
return;
}
if (n_vocab != llama_vocab_n_tokens(vocab)) {
LOG_ERR("%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_vocab_n_tokens(vocab));
if (n_vocab != llama_n_vocab(llama_get_model(ctx))) {
LOG_ERR("%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
}
std::vector<llama_token> tokens(size_t(n_ctx) * n_chunk);
@@ -1717,8 +1701,8 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
const int n_batch = params.n_batch;
const int num_batches = (n_ctx + n_batch - 1)/n_batch;
const int nv = 2*((n_vocab + 1)/2) + 4;
const bool add_bos = llama_vocab_get_add_bos(vocab);
GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv);
std::vector<float> kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
@@ -1777,7 +1761,7 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
// add BOS token for the first batch of each chunk
if (add_bos && j == 0) {
tokens[batch_start] = llama_vocab_bos(vocab);
tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
}
common_batch_clear(batch);
@@ -2011,7 +1995,7 @@ int main(int argc, char ** argv) {
return 1;
}
const int n_ctx_train = llama_model_n_ctx_train(model);
const int n_ctx_train = llama_n_ctx_train(model);
if (params.n_ctx > n_ctx_train) {
LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n",

View File

@@ -319,7 +319,7 @@ int main(int argc, char ** argv) {
auto cparams = llama_context_default_params();
cparams.n_ctx = 256;
ctx = llama_init_from_model(model, cparams);
ctx = llama_new_context_with_model(model, cparams);
if (ctx == NULL) {
fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());

View File

@@ -47,7 +47,7 @@ echo PASS
echo
# 3a. Test that the requantized model loads properly
$MAIN -no-cnv --model $WORK_PATH/ggml-model-requant-00001-of-00006.gguf --n-predict 32
$MAIN --model $WORK_PATH/ggml-model-requant-00001-of-00006.gguf --n-predict 32
echo PASS
echo
@@ -57,7 +57,7 @@ echo PASS
echo
# 4b. Test that the requantized model loads properly
$MAIN -no-cnv --model $WORK_PATH/ggml-model-requant-merge.gguf --n-predict 32
$MAIN --model $WORK_PATH/ggml-model-requant-merge.gguf --n-predict 32
echo PASS
echo

View File

@@ -159,9 +159,7 @@ int main(int argc, char ** argv) {
return 1;
}
const llama_vocab * vocab = llama_model_get_vocab(model);
const int n_ctx_train = llama_model_n_ctx_train(model);
const int n_ctx_train = llama_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx);
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
@@ -194,8 +192,8 @@ int main(int argc, char ** argv) {
return 1;
}
// add eos if not present
if (llama_vocab_eos(vocab) >= 0 && (inp.empty() || inp.back() != llama_vocab_eos(vocab))) {
inp.push_back(llama_vocab_eos(vocab));
if (llama_token_eos(model) >= 0 && (inp.empty() || inp.back() != llama_token_eos(model))) {
inp.push_back(llama_token_eos(model));
}
chunk.tokens = inp;
}
@@ -217,7 +215,7 @@ int main(int argc, char ** argv) {
struct llama_batch batch = llama_batch_init(n_batch, 0, 1);
// allocate output
const int n_embd = llama_model_n_embd(model);
const int n_embd = llama_n_embd(model);
std::vector<float> embeddings(n_chunks * n_embd, 0);
float * emb = embeddings.data();

View File

@@ -1,5 +1,5 @@
set(TARGET llama-run)
add_executable(${TARGET} run.cpp linenoise.cpp/linenoise.cpp)
add_executable(${TARGET} run.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@@ -3,10 +3,11 @@
The purpose of this example is to demonstrate a minimal usage of llama.cpp for running models.
```bash
llama-run granite3-moe
llama-run granite-code
```
```bash
llama-run -h
Description:
Runs an LLM
@@ -16,7 +17,7 @@ Usage:
Options:
-c, --context-size <value>
Context size (default: 2048)
-n, -ngl, --ngl <value>
-n, --ngl <value>
Number of GPU layers (default: 0)
--temp <value>
Temperature (default: 0.8)

View File

@@ -1,26 +0,0 @@
Copyright (c) 2010-2014, Salvatore Sanfilippo <antirez at gmail dot com>
Copyright (c) 2010-2013, Pieter Noordhuis <pcnoordhuis at gmail dot com>
Copyright (c) 2025, Eric Curtin <ericcurtin17 at gmail dot com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

File diff suppressed because it is too large Load Diff

View File

@@ -1,128 +0,0 @@
/* linenoise.h -- VERSION 1.0
*
* Guerrilla line editing library against the idea that a line editing lib
* needs to be 20,000 lines of C++ code.
*
* See linenoise.cpp for more information.
*
* ------------------------------------------------------------------------
*
* Copyright (c) 2010-2023, Salvatore Sanfilippo <antirez at gmail dot com>
* Copyright (c) 2010-2013, Pieter Noordhuis <pcnoordhuis at gmail dot com>
* Copyright (c) 2025, Eric Curtin <ericcurtin17 at gmail dot com>
*
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __LINENOISE_H
#define __LINENOISE_H
#ifdef __cplusplus
extern "C" {
#endif
#include <stddef.h> /* For size_t. */
#include <stdlib.h>
extern const char *linenoiseEditMore;
/* The linenoiseState structure represents the state during line editing.
* We pass this state to functions implementing specific editing
* functionalities. */
struct linenoiseState {
int in_completion; /* The user pressed TAB and we are now in completion
* mode, so input is handled by completeLine(). */
size_t completion_idx; /* Index of next completion to propose. */
int ifd; /* Terminal stdin file descriptor. */
int ofd; /* Terminal stdout file descriptor. */
char *buf; /* Edited line buffer. */
size_t buflen; /* Edited line buffer size. */
const char *prompt; /* Prompt to display. */
size_t plen; /* Prompt length. */
size_t pos; /* Current cursor position. */
size_t oldpos; /* Previous refresh cursor position. */
size_t len; /* Current edited line length. */
size_t cols; /* Number of columns in terminal. */
    size_t oldrows;     /* Rows used by last refreshed line (multiline mode) */
int history_index; /* The history index we are currently editing. */
};
struct linenoiseCompletions {
size_t len = 0;
char ** cvec = nullptr;
bool to_free = true;
~linenoiseCompletions() {
if (!to_free) {
return;
}
for (size_t i = 0; i < len; ++i) {
free(cvec[i]);
}
free(cvec);
}
};
/* Non blocking API. */
int linenoiseEditStart(struct linenoiseState *l, int stdin_fd, int stdout_fd, char *buf, size_t buflen, const char *prompt);
const char *linenoiseEditFeed(struct linenoiseState *l);
void linenoiseEditStop(struct linenoiseState *l);
void linenoiseHide(struct linenoiseState *l);
void linenoiseShow(struct linenoiseState *l);
/* Blocking API. */
const char *linenoise(const char *prompt);
void linenoiseFree(void *ptr);
/* Completion API. */
typedef void(linenoiseCompletionCallback)(const char *, linenoiseCompletions *);
typedef const char*(linenoiseHintsCallback)(const char *, int *color, int *bold);
typedef void(linenoiseFreeHintsCallback)(const char *);
void linenoiseSetCompletionCallback(linenoiseCompletionCallback *);
void linenoiseSetHintsCallback(linenoiseHintsCallback *);
void linenoiseSetFreeHintsCallback(linenoiseFreeHintsCallback *);
void linenoiseAddCompletion(linenoiseCompletions *, const char *);
/* History API. */
int linenoiseHistoryAdd(const char *line);
int linenoiseHistorySetMaxLen(int len);
int linenoiseHistorySave(const char *filename);
int linenoiseHistoryLoad(const char *filename);
/* Other utilities. */
void linenoiseClearScreen(void);
void linenoiseSetMultiLine(int ml);
void linenoisePrintKeyCodes(void);
void linenoiseMaskModeEnable(void);
void linenoiseMaskModeDisable(void);
#ifdef __cplusplus
}
#endif
#endif /* __LINENOISE_H */

View File

@@ -19,20 +19,17 @@
#include <cstring>
#include <filesystem>
#include <iostream>
#include <list>
#include <sstream>
#include <string>
#include <vector>
#include "common.h"
#include "json.hpp"
#include "linenoise.cpp/linenoise.h"
#include "llama-cpp.h"
#include "chat-template.hpp"
#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) || defined(_WIN32)
[[noreturn]] static void sigint_handler(int) {
printf("\n\033[0m");
printf("\n");
exit(0); // not ideal, but it's the only way to guarantee exit in all cases
}
#endif
@@ -106,7 +103,6 @@ class Opt {
llama_model_params model_params;
std::string model_;
std::string user;
bool use_jinja = false;
int context_size = -1, ngl = -1;
float temperature = -1;
bool verbose = false;
@@ -147,8 +143,7 @@ class Opt {
if (handle_option_with_value(argc, argv, i, context_size) == 1) {
return 1;
}
} else if (options_parsing &&
(strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "-ngl") == 0 || strcmp(argv[i], "--ngl") == 0)) {
} else if (options_parsing && (strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "--ngl") == 0)) {
if (handle_option_with_value(argc, argv, i, ngl) == 1) {
return 1;
}
@@ -159,8 +154,6 @@ class Opt {
} else if (options_parsing &&
(parse_flag(argv, i, "-v", "--verbose") || parse_flag(argv, i, "-v", "--log-verbose"))) {
verbose = true;
} else if (options_parsing && strcmp(argv[i], "--jinja") == 0) {
use_jinja = true;
} else if (options_parsing && parse_flag(argv, i, "-h", "--help")) {
help = true;
return 0;
@@ -181,10 +174,6 @@ class Opt {
}
}
if (model_.empty()){
return 1;
}
return 0;
}
@@ -199,7 +188,7 @@ class Opt {
"Options:\n"
" -c, --context-size <value>\n"
" Context size (default: %d)\n"
" -n, -ngl, --ngl <value>\n"
" -n, --ngl <value>\n"
" Number of GPU layers (default: %d)\n"
" --temp <value>\n"
" Temperature (default: %.1f)\n"
@@ -323,10 +312,6 @@ class HttpClient {
public:
int init(const std::string & url, const std::vector<std::string> & headers, const std::string & output_file,
const bool progress, std::string * response_str = nullptr) {
if (std::filesystem::exists(output_file)) {
return 0;
}
std::string output_file_partial;
curl = curl_easy_init();
if (!curl) {
@@ -354,11 +339,7 @@ class HttpClient {
data.file_size = set_resume_point(output_file_partial);
set_progress_options(progress, data);
set_headers(headers);
CURLcode res = perform(url);
if (res != CURLE_OK){
printe("Fetching resource '%s' failed: %s\n", url.c_str(), curl_easy_strerror(res));
return 1;
}
perform(url);
if (!output_file.empty()) {
std::filesystem::rename(output_file_partial, output_file);
}
@@ -423,12 +404,16 @@ class HttpClient {
}
}
CURLcode perform(const std::string & url) {
void perform(const std::string & url) {
CURLcode res;
curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
curl_easy_setopt(curl, CURLOPT_DEFAULT_PROTOCOL, "https");
curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1L);
return curl_easy_perform(curl);
res = curl_easy_perform(curl);
if (res != CURLE_OK) {
printe("curl_easy_perform() failed: %s\n", curl_easy_strerror(res));
}
}
static std::string human_readable_time(double seconds) {
@@ -551,7 +536,7 @@ class LlamaData {
llama_sampler_ptr sampler;
llama_context_ptr context;
std::vector<llama_chat_message> messages;
std::list<std::string> msg_strs;
std::vector<std::string> msg_strs;
std::vector<char> fmtted;
int init(Opt & opt) {
@@ -566,14 +551,13 @@ class LlamaData {
}
sampler = initialize_sampler(opt);
return 0;
}
private:
#ifdef LLAMA_USE_CURL
int download(const std::string & url, const std::string & output_file, const bool progress,
const std::vector<std::string> & headers = {}, std::string * response_str = nullptr) {
int download(const std::string & url, const std::vector<std::string> & headers, const std::string & output_file,
const bool progress, std::string * response_str = nullptr) {
HttpClient http;
if (http.init(url, headers, output_file, progress, response_str)) {
return 1;
@@ -582,85 +566,48 @@ class LlamaData {
return 0;
}
#else
int download(const std::string &, const std::string &, const bool, const std::vector<std::string> & = {},
int download(const std::string &, const std::vector<std::string> &, const std::string &, const bool,
std::string * = nullptr) {
printe("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
return 1;
}
#endif
// Helper function to handle model tag extraction and URL construction
std::pair<std::string, std::string> extract_model_and_tag(std::string & model, const std::string & base_url) {
std::string model_tag = "latest";
const size_t colon_pos = model.find(':');
int huggingface_dl(const std::string & model, const std::vector<std::string> headers, const std::string & bn) {
// Find the second occurrence of '/' after protocol string
size_t pos = model.find('/');
pos = model.find('/', pos + 1);
if (pos == std::string::npos) {
return 1;
}
const std::string hfr = model.substr(0, pos);
const std::string hff = model.substr(pos + 1);
const std::string url = "https://huggingface.co/" + hfr + "/resolve/main/" + hff;
return download(url, headers, bn, true);
}
int ollama_dl(std::string & model, const std::vector<std::string> headers, const std::string & bn) {
if (model.find('/') == std::string::npos) {
model = "library/" + model;
}
std::string model_tag = "latest";
size_t colon_pos = model.find(':');
if (colon_pos != std::string::npos) {
model_tag = model.substr(colon_pos + 1);
model = model.substr(0, colon_pos);
}
std::string url = base_url + model + "/manifests/" + model_tag;
return { model, url };
}
// Helper function to download and parse the manifest
int download_and_parse_manifest(const std::string & url, const std::vector<std::string> & headers,
nlohmann::json & manifest) {
std::string manifest_url = "https://registry.ollama.ai/v2/" + model + "/manifests/" + model_tag;
std::string manifest_str;
int ret = download(url, "", false, headers, &manifest_str);
const int ret = download(manifest_url, headers, "", false, &manifest_str);
if (ret) {
return ret;
}
manifest = nlohmann::json::parse(manifest_str);
return 0;
}
int huggingface_dl(std::string & model, const std::string & bn) {
// Find the second occurrence of '/' after protocol string
size_t pos = model.find('/');
pos = model.find('/', pos + 1);
std::string hfr, hff;
std::vector<std::string> headers = { "User-Agent: llama-cpp", "Accept: application/json" };
std::string url;
if (pos == std::string::npos) {
auto [model_name, manifest_url] = extract_model_and_tag(model, "https://huggingface.co/v2/");
hfr = model_name;
nlohmann::json manifest;
int ret = download_and_parse_manifest(manifest_url, headers, manifest);
if (ret) {
return ret;
}
hff = manifest["ggufFile"]["rfilename"];
} else {
hfr = model.substr(0, pos);
hff = model.substr(pos + 1);
}
url = "https://huggingface.co/" + hfr + "/resolve/main/" + hff;
return download(url, bn, true, headers);
}
int ollama_dl(std::string & model, const std::string & bn) {
const std::vector<std::string> headers = { "Accept: application/vnd.docker.distribution.manifest.v2+json" };
if (model.find('/') == std::string::npos) {
model = "library/" + model;
}
auto [model_name, manifest_url] = extract_model_and_tag(model, "https://registry.ollama.ai/v2/");
nlohmann::json manifest;
int ret = download_and_parse_manifest(manifest_url, {}, manifest);
if (ret) {
return ret;
}
std::string layer;
nlohmann::json manifest = nlohmann::json::parse(manifest_str);
std::string layer;
for (const auto & l : manifest["layers"]) {
if (l["mediaType"] == "application/vnd.ollama.image.model") {
layer = l["digest"];
@@ -668,34 +615,8 @@ class LlamaData {
}
}
std::string blob_url = "https://registry.ollama.ai/v2/" + model_name + "/blobs/" + layer;
return download(blob_url, bn, true, headers);
}
int github_dl(const std::string & model, const std::string & bn) {
std::string repository = model;
std::string branch = "main";
const size_t at_pos = model.find('@');
if (at_pos != std::string::npos) {
repository = model.substr(0, at_pos);
branch = model.substr(at_pos + 1);
}
const std::vector<std::string> repo_parts = string_split(repository, "/");
if (repo_parts.size() < 3) {
printe("Invalid GitHub repository format\n");
return 1;
}
const std::string & org = repo_parts[0];
const std::string & project = repo_parts[1];
std::string url = "https://raw.githubusercontent.com/" + org + "/" + project + "/" + branch;
for (size_t i = 2; i < repo_parts.size(); ++i) {
url += "/" + repo_parts[i];
}
return download(url, bn, true);
std::string blob_url = "https://registry.ollama.ai/v2/" + model + "/blobs/" + layer;
return download(blob_url, headers, bn, true);
}
std::string basename(const std::string & path) {
@@ -707,41 +628,37 @@ class LlamaData {
return path.substr(pos + 1);
}
int rm_until_substring(std::string & model_, const std::string & substring) {
const std::string::size_type pos = model_.find(substring);
int remove_proto(std::string & model_) {
const std::string::size_type pos = model_.find("://");
if (pos == std::string::npos) {
return 1;
}
model_ = model_.substr(pos + substring.size()); // Skip past the substring
model_ = model_.substr(pos + 3); // Skip past "://"
return 0;
}
int resolve_model(std::string & model_) {
int ret = 0;
if (string_starts_with(model_, "file://") || std::filesystem::exists(model_)) {
rm_until_substring(model_, "://");
remove_proto(model_);
return ret;
}
const std::string bn = basename(model_);
if (string_starts_with(model_, "hf://") || string_starts_with(model_, "huggingface://") ||
string_starts_with(model_, "hf.co/")) {
rm_until_substring(model_, "hf.co/");
rm_until_substring(model_, "://");
ret = huggingface_dl(model_, bn);
} else if ((string_starts_with(model_, "https://") || string_starts_with(model_, "http://")) &&
!string_starts_with(model_, "https://ollama.com/library/")) {
ret = download(model_, bn, true);
} else if (string_starts_with(model_, "github:") || string_starts_with(model_, "github://")) {
rm_until_substring(model_, "github:");
rm_until_substring(model_, "://");
ret = github_dl(model_, bn);
} else { // ollama:// or nothing
rm_until_substring(model_, "ollama.com/library/");
rm_until_substring(model_, "://");
ret = ollama_dl(model_, bn);
const std::string bn = basename(model_);
const std::vector<std::string> headers = { "--header",
"Accept: application/vnd.docker.distribution.manifest.v2+json" };
if (string_starts_with(model_, "hf://") || string_starts_with(model_, "huggingface://")) {
remove_proto(model_);
ret = huggingface_dl(model_, headers, bn);
} else if (string_starts_with(model_, "ollama://")) {
remove_proto(model_);
ret = ollama_dl(model_, headers, bn);
} else if (string_starts_with(model_, "https://")) {
download(model_, headers, bn, true);
} else {
ret = ollama_dl(model_, headers, bn);
}
model_ = bn;
@@ -768,7 +685,7 @@ class LlamaData {
// Initializes the context with the specified parameters
llama_context_ptr initialize_context(const llama_model_ptr & model, const Opt & opt) {
llama_context_ptr context(llama_init_from_model(model.get(), opt.ctx_params));
llama_context_ptr context(llama_new_context_with_model(model.get(), opt.ctx_params));
if (!context) {
printe("%s: error: failed to create the llama_context\n", __func__);
}
@@ -794,31 +711,13 @@ static void add_message(const char * role, const std::string & text, LlamaData &
}
// Function to apply the chat template and resize `formatted` if needed
static int apply_chat_template(const common_chat_template & tmpl, LlamaData & llama_data, const bool append, bool use_jinja) {
if (use_jinja) {
json messages = json::array();
for (const auto & msg : llama_data.messages) {
messages.push_back({
{"role", msg.role},
{"content", msg.content},
});
}
try {
auto result = tmpl.apply(messages, /* tools= */ json(), append);
llama_data.fmtted.resize(result.size() + 1);
memcpy(llama_data.fmtted.data(), result.c_str(), result.size() + 1);
return result.size();
} catch (const std::exception & e) {
printe("failed to render the chat template: %s\n", e.what());
return -1;
}
}
static int apply_chat_template(LlamaData & llama_data, const bool append) {
int result = llama_chat_apply_template(
tmpl.source().c_str(), llama_data.messages.data(), llama_data.messages.size(), append,
llama_data.model.get(), nullptr, llama_data.messages.data(), llama_data.messages.size(), append,
append ? llama_data.fmtted.data() : nullptr, append ? llama_data.fmtted.size() : 0);
if (append && result > static_cast<int>(llama_data.fmtted.size())) {
llama_data.fmtted.resize(result);
result = llama_chat_apply_template(tmpl.source().c_str(), llama_data.messages.data(),
result = llama_chat_apply_template(llama_data.model.get(), nullptr, llama_data.messages.data(),
llama_data.messages.size(), append, llama_data.fmtted.data(),
llama_data.fmtted.size());
}
@@ -827,13 +726,11 @@ static int apply_chat_template(const common_chat_template & tmpl, LlamaData & ll
}
// Function to tokenize the prompt
static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt,
std::vector<llama_token> & prompt_tokens, const LlamaData & llama_data) {
const bool is_first = llama_get_kv_cache_used_cells(llama_data.context.get()) == 0;
const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
static int tokenize_prompt(const llama_model_ptr & model, const std::string & prompt,
std::vector<llama_token> & prompt_tokens) {
const int n_prompt_tokens = -llama_tokenize(model.get(), prompt.c_str(), prompt.size(), NULL, 0, true, true);
prompt_tokens.resize(n_prompt_tokens);
if (llama_tokenize(vocab, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), is_first,
if (llama_tokenize(model.get(), prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true,
true) < 0) {
printe("failed to tokenize the prompt\n");
return -1;
@@ -856,9 +753,9 @@ static int check_context_size(const llama_context_ptr & ctx, const llama_batch &
}
// convert the token to a string
static int convert_token_to_string(const llama_vocab * vocab, const llama_token token_id, std::string & piece) {
static int convert_token_to_string(const llama_model_ptr & model, const llama_token token_id, std::string & piece) {
char buf[256];
int n = llama_token_to_piece(vocab, token_id, buf, sizeof(buf), 0, true);
int n = llama_token_to_piece(model.get(), token_id, buf, sizeof(buf), 0, true);
if (n < 0) {
printe("failed to convert token to piece\n");
return 1;
@@ -876,10 +773,8 @@ static void print_word_and_concatenate_to_response(const std::string & piece, st
// helper function to evaluate a prompt and generate a response
static int generate(LlamaData & llama_data, const std::string & prompt, std::string & response) {
const llama_vocab * vocab = llama_model_get_vocab(llama_data.model.get());
std::vector<llama_token> tokens;
if (tokenize_prompt(vocab, prompt, tokens, llama_data) < 0) {
if (tokenize_prompt(llama_data.model, prompt, tokens) < 0) {
return 1;
}
@@ -895,12 +790,12 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str
// sample the next token, check is it an end of generation?
new_token_id = llama_sampler_sample(llama_data.sampler.get(), llama_data.context.get(), -1);
if (llama_vocab_is_eog(vocab, new_token_id)) {
if (llama_token_is_eog(llama_data.model.get(), new_token_id)) {
break;
}
std::string piece;
if (convert_token_to_string(vocab, new_token_id, piece)) {
if (convert_token_to_string(llama_data.model, new_token_id, piece)) {
return 1;
}
@@ -910,44 +805,24 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str
batch = llama_batch_get_one(&new_token_id, 1);
}
printf("\033[0m");
return 0;
}
static int read_user_input(std::string & user_input) {
static const char * prompt_prefix = "> ";
#ifdef WIN32
printf(
"\r%*s"
"\r\033[0m%s",
get_terminal_width(), " ", prompt_prefix);
std::getline(std::cin, user_input);
static int read_user_input(std::string & user) {
std::getline(std::cin, user);
if (std::cin.eof()) {
printf("\n");
return 1;
}
#else
std::unique_ptr<char, decltype(&std::free)> line(const_cast<char *>(linenoise(prompt_prefix)), free);
if (!line) {
if (user == "/bye") {
return 1;
}
user_input = line.get();
#endif
if (user_input == "/bye") {
return 1;
}
if (user_input.empty()) {
if (user.empty()) {
return 2;
}
#ifndef WIN32
linenoiseHistoryAdd(line.get());
#endif
return 0; // Should have data in happy path
}
@@ -970,8 +845,8 @@ static int generate_response(LlamaData & llama_data, const std::string & prompt,
}
// Helper function to apply the chat template and handle errors
static int apply_chat_template_with_error_handling(const common_chat_template & tmpl, LlamaData & llama_data, const bool append, int & output_length, bool use_jinja) {
const int new_len = apply_chat_template(tmpl, llama_data, append, use_jinja);
static int apply_chat_template_with_error_handling(LlamaData & llama_data, const bool append, int & output_length) {
const int new_len = apply_chat_template(llama_data, append);
if (new_len < 0) {
printe("failed to apply the chat template\n");
return -1;
@@ -988,6 +863,10 @@ static int handle_user_input(std::string & user_input, const std::string & user)
return 0; // No need for interactive input
}
printf(
"\r%*s"
"\r\033[32m> \033[0m",
get_terminal_width(), " ");
return read_user_input(user_input); // Returns true if input ends the loop
}
@@ -1030,11 +909,9 @@ static int get_user_input(std::string & user_input, const std::string & user) {
}
// Main chat loop function
static int chat_loop(LlamaData & llama_data, const std::string & user, bool use_jinja) {
static int chat_loop(LlamaData & llama_data, const std::string & user) {
int prev_len = 0;
llama_data.fmtted.resize(llama_n_ctx(llama_data.context.get()));
auto chat_templates = common_chat_templates_from_model(llama_data.model.get(), "");
GGML_ASSERT(chat_templates.template_default);
static const bool stdout_a_terminal = is_stdout_a_terminal();
while (true) {
// Get user input
@@ -1045,7 +922,7 @@ static int chat_loop(LlamaData & llama_data, const std::string & user, bool use_
add_message("user", user.empty() ? user_input : user, llama_data);
int new_len;
if (apply_chat_template_with_error_handling(*chat_templates.template_default, llama_data, true, new_len, use_jinja) < 0) {
if (apply_chat_template_with_error_handling(llama_data, true, new_len) < 0) {
return 1;
}
@@ -1060,7 +937,7 @@ static int chat_loop(LlamaData & llama_data, const std::string & user, bool use_
}
add_message("assistant", response, llama_data);
if (apply_chat_template_with_error_handling(*chat_templates.template_default, llama_data, false, prev_len, use_jinja) < 0) {
if (apply_chat_template_with_error_handling(llama_data, false, prev_len) < 0) {
return 1;
}
}
@@ -1120,7 +997,7 @@ int main(int argc, const char ** argv) {
return 1;
}
if (chat_loop(llama_data, opt.user, opt.use_jinja)) {
if (chat_loop(llama_data, opt.user)) {
return 1;
}

View File

@@ -97,7 +97,7 @@ int main(int argc, char ** argv) {
printf("\n\n");
// make new context
llama_context * ctx2 = llama_init_from_model(model, common_context_params_to_llama(params));
llama_context * ctx2 = llama_new_context_with_model(model, common_context_params_to_llama(params));
llama_sampler * smpl2 = llama_sampler_chain_init(sparams);
@@ -154,7 +154,7 @@ int main(int argc, char ** argv) {
}
// make new context
llama_context * ctx3 = llama_init_from_model(model, common_context_params_to_llama(params));
llama_context * ctx3 = llama_new_context_with_model(model, common_context_params_to_llama(params));
llama_sampler * smpl3 = llama_sampler_chain_init(sparams);

View File

@@ -126,7 +126,7 @@ The project is under active development, and we are [looking for feedback and co
| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') |
| `--grammar-file FNAME` | file to read grammar from |
| `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object<br/>For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead |
| `--jinja` | Enable experimental Jinja templating engine (needed for tool use) |
**Example-specific params**
@@ -236,13 +236,9 @@ npm i
# to run the dev server
npm run dev
# to build the public/index.html.gz
# to build the public/index.html
npm run build
```
After `public/index.html.gz` has been generated, we need to generate the C++
headers (like `build/examples/server/index.html.gz.hpp`) that will be included
by `server.cpp`. This is done by building `llama-server` as described in the
[build](#build) section above.
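For reference, a minimal sketch of that build step (adjust the build directory and configuration to your setup):
```bash
# configure once, then rebuild the server target so the freshly generated
# public/index.html.gz is baked into the generated header
cmake -B build
cmake --build build --config Release -t llama-server
```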
NOTE: if you are using the vite dev server, you can change the API base URL to llama.cpp. To do that, run this code snippet in browser's console:
@@ -460,7 +456,7 @@ These words will not be included in the completion, so make sure to add them to
- Note: In streaming mode (`stream`), only `content`, `tokens` and `stop` will be returned until end of completion. Responses are sent using the [Server-sent events](https://html.spec.whatwg.org/multipage/server-sent-events.html) standard. Note: the browser's `EventSource` interface cannot be used due to its lack of `POST` request support.
- `completion_probabilities`: An array of token probabilities for each completion. The array's length is `n_predict`. Each item in the array has a nested array `top_logprobs`. It contains at **maximum** `n_probs` elements:
```
```json
{
"content": "<the generated completion text>",
"tokens": [ generated token ids if requested ],
@@ -561,7 +557,7 @@ If `with_pieces` is `true`:
```
With input 'á' (utf8 hex: C3 A1) on tinyllama/stories260k
```
```json
{
"tokens": [
{"id": 198, "piece": [195]}, // hex C3
@@ -576,18 +572,6 @@ With input 'á' (utf8 hex: C3 A1) on tinyllama/stories260k
`tokens`: Set the tokens to detokenize.
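As a quick illustration (a hedged sketch: the host/port and the token ids below are placeholders, not values taken from this document):
```shell
curl http://localhost:8080/detokenize \
    -H "Content-Type: application/json" \
    -d '{"tokens": [3513, 1027, 318]}'
```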
### POST `/apply-template`: Apply chat template to a conversation
Uses the server's prompt template formatting functionality to convert chat messages to a single string expected by a chat model as input, but does not perform inference. Instead, the prompt string is returned in the `prompt` field of the JSON response. The prompt can then be modified as desired (for example, to insert "Sure!" at the beginning of the model's response) before sending to `/completion` to generate the chat response.
*Options:*
`messages`: (Required) Chat turns in the same format as `/v1/chat/completions`.
**Response format**
Returns a JSON object with a field `prompt` containing a string of the input messages formatted according to the model's chat template format.
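A minimal request sketch, assuming a server on `localhost:8080` (the message content is illustrative):
```shell
curl http://localhost:8080/apply-template \
    -H "Content-Type: application/json" \
    -d '{"messages": [{"role": "user", "content": "Hello!"}]}'
```
The returned JSON carries the rendered conversation in its `prompt` field, which can then be edited and passed to `/completion`.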
### POST `/embedding`: Generate embedding of a given text
> [!IMPORTANT]
@@ -780,7 +764,7 @@ Same as the `/v1/embeddings` endpoint.
**Response format**
```
```json
[
{
"index": 0,
@@ -1117,82 +1101,6 @@ curl http://localhost:8080/v1/chat/completions \
}'
```
... and even tool usage (needs `--jinja` flag):
```shell
llama-server --jinja -hfr lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF -hff Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf -fa
# https://huggingface.co/meetkai/functionary-medium-v3.2
llama-server --jinja -hfr bartowski/functionary-medium-v3.2-GGUF -hff functionary-medium-v3.2-IQ4_XS.gguf -fa
# https://huggingface.co/meetkai/functionary-medium-v3.1
llama-server --jinja -hfr meetkai/functionary-medium-v3.1-GGUF -hff functionary-medium-llama-3.1.Q4_0.gguf -fa
curl http://localhost:8080/v1/chat/completions -d '{
"model": "gpt-3.5-turbo",
"tools": [
{
"type":"function",
"function":{
"name":"get_current_weather",
"description":"Get the current weather in a given location",
"parameters":{
"type":"object",
"properties":{
"location":{
"type":"string",
"description":"The city and state, e.g. San Francisco, CA"
}
},
"required":["location"]
}
}
}
],
"messages": [
{
"role": "user",
"content": "What is the weather like in Istanbul?."
}
]
}'
```
<details>
<summary>Show output</summary>
```json
{
"choices": [
{
"finish_reason": "tool",
"index": 0,
"message": {
"content": null,
"tool_calls": [
{
"name": "python",
"arguments": "{\"code\":\" \\nprint(\\\"Hello, World!\\\")\"}"
}
],
"role": "assistant"
}
}
],
"created": 1727287211,
"model": "gpt-3.5-turbo",
"object": "chat.completion",
"usage": {
"completion_tokens": 16,
"prompt_tokens": 44,
"total_tokens": 60
},
"id": "chatcmpl-Htbgh9feMmGM0LEH2hmQvwsCxq3c6Ni8"
}
```
</details>
### POST `/v1/embeddings`: OpenAI-compatible embeddings API
This endpoint requires that the model uses a pooling type different from `none`. The embeddings are normalized using the Euclidean norm.
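A usage sketch, assuming a local server on port 8080 with embeddings enabled and a model whose pooling type is not `none`:
```shell
curl http://localhost:8080/v1/embeddings \
    -H "Content-Type: application/json" \
    -d '{"input": "Hello, world", "model": "gpt-3.5-turbo"}'
```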

File diff suppressed because it is too large Load Diff

Binary file not shown.

View File

@@ -14,12 +14,11 @@
// mime type for sending response
#define MIMETYPE_JSON "application/json; charset=utf-8"
// auto generated files (see README.md for details)
// auto generated files (update with ./deps.sh)
#include "index.html.gz.hpp"
#include "loading.html.hpp"
#include <atomic>
#include <chrono>
#include <condition_variable>
#include <cstddef>
#include <cinttypes>
@@ -33,8 +32,6 @@
using json = nlohmann::ordered_json;
constexpr int HTTP_POLLING_SECONDS = 1;
enum stop_type {
STOP_TYPE_NONE,
STOP_TYPE_EOS,
@@ -101,7 +98,7 @@ struct slot_params {
int64_t t_max_prompt_ms = -1; // TODO: implement
int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
std::vector<common_adapter_lora_info> lora;
std::vector<common_lora_adapter_info> lora;
std::vector<std::string> antiprompt;
std::vector<std::string> response_fields;
@@ -113,11 +110,10 @@ struct slot_params {
struct common_params_speculative speculative;
// OAI-compat fields
bool verbose = false;
oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
std::string oaicompat_model;
std::string oaicompat_cmpl_id;
common_chat_format oaicompat_chat_format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
bool verbose = false;
oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
std::string oaicompat_model;
std::string oaicompat_cmpl_id;
json to_json() const {
std::vector<std::string> samplers;
@@ -165,8 +161,6 @@ struct slot_params {
{"n_probs", sampling.n_probs},
{"min_keep", sampling.min_keep},
{"grammar", sampling.grammar},
// {"grammar_trigger_words", sampling.grammar_trigger_words},
{"grammar_trigger_tokens", sampling.grammar_trigger_tokens},
{"samplers", samplers},
{"speculative.n_max", speculative.n_max},
{"speculative.n_min", speculative.n_min},
@@ -204,17 +198,15 @@ struct server_task {
bool metrics_reset_bucket = false;
// used by SERVER_TASK_TYPE_SET_LORA
std::vector<common_adapter_lora_info> set_lora;
std::vector<common_lora_adapter_info> set_lora;
server_task(server_task_type type) : type(type) {}
static slot_params params_from_json_cmpl(
const llama_model * model,
const llama_context * ctx,
const common_params & params_base,
const json & data) {
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
slot_params params;
// Sampling parameter defaults are loaded from the global server context (but individual requests can still override them)
@@ -270,11 +262,6 @@ struct server_task {
params.speculative.n_min = std::max(params.speculative.n_min, 2);
params.speculative.n_max = std::max(params.speculative.n_max, 0);
// Use OpenAI API logprobs only if n_probs wasn't provided
if (data.contains("logprobs") && params.sampling.n_probs == defaults.sampling.n_probs){
params.sampling.n_probs = json_value(data, "logprobs", defaults.sampling.n_probs);
}
if (data.contains("lora")) {
if (data.at("lora").is_array()) {
params.lora = parse_lora_request(params_base.lora_adapters, data.at("lora"));
@@ -328,50 +315,12 @@ struct server_task {
if (data.contains("json_schema") && !data.contains("grammar")) {
try {
auto schema = json_value(data, "json_schema", json::object());
LOG_DBG("JSON schema: %s\n", schema.dump(2).c_str());
params.sampling.grammar = json_schema_to_grammar(schema);
LOG_DBG("Converted grammar: %s\n", params.sampling.grammar.c_str());
params.sampling.grammar = json_schema_to_grammar(schema);
} catch (const std::exception & e) {
throw std::runtime_error(std::string("\"json_schema\": ") + e.what());
}
} else {
params.sampling.grammar = json_value(data, "grammar", defaults.sampling.grammar);
LOG_DBG("Grammar: %s\n", params.sampling.grammar.c_str());
params.sampling.grammar_lazy = json_value(data, "grammar_lazy", defaults.sampling.grammar_lazy);
LOG_DBG("Grammar lazy: %s\n", params.sampling.grammar_lazy ? "true" : "false");
}
{
auto it = data.find("chat_format");
if (it != data.end()) {
params.oaicompat_chat_format = static_cast<common_chat_format>(it->get<int>());
LOG_DBG("Chat format: %s\n", common_chat_format_name(params.oaicompat_chat_format).c_str());
} else {
params.oaicompat_chat_format = defaults.oaicompat_chat_format;
}
}
{
const auto grammar_triggers = data.find("grammar_triggers");
if (grammar_triggers != data.end()) {
for (const auto & t : *grammar_triggers) {
common_grammar_trigger trigger;
trigger.word = t.at("word");
trigger.at_start = t.at("at_start");
auto ids = common_tokenize(vocab, trigger.word, /* add_special= */ false, /* parse_special= */ true);
if (ids.size() == 1) {
LOG_DBG("Grammar trigger token: %d (`%s`)\n", ids[0], trigger.word.c_str());
params.sampling.grammar_trigger_tokens.push_back(ids[0]);
continue;
}
LOG_DBG("Grammar trigger word: `%s`\n", trigger.word.c_str());
params.sampling.grammar_trigger_words.push_back(trigger);
}
}
if (params.sampling.grammar_lazy) {
GGML_ASSERT(params.sampling.grammar_trigger_tokens.size() > 0 || params.sampling.grammar_trigger_words.size() > 0);
}
params.sampling.grammar = json_value(data, "grammar", defaults.sampling.grammar);
}
{
@@ -380,7 +329,7 @@ struct server_task {
const auto & logit_bias = data.find("logit_bias");
if (logit_bias != data.end() && logit_bias->is_array()) {
const int n_vocab = llama_vocab_n_tokens(vocab);
const int n_vocab = llama_n_vocab(model);
for (const auto & el : *logit_bias) {
// TODO: we may want to throw errors here, in case "el" is incorrect
if (el.is_array() && el.size() == 2) {
@@ -399,7 +348,7 @@ struct server_task {
params.sampling.logit_bias.push_back({tok, bias});
}
} else if (el[0].is_string()) {
auto toks = common_tokenize(vocab, el[0].get<std::string>(), false);
auto toks = common_tokenize(model, el[0].get<std::string>(), false);
for (auto tok : toks) {
params.sampling.logit_bias.push_back({tok, bias});
}
@@ -423,12 +372,22 @@ struct server_task {
}
{
const auto samplers = data.find("samplers");
const auto & samplers = data.find("samplers");
if (samplers != data.end()) {
if (samplers->is_array()) {
params.sampling.samplers = common_sampler_types_from_names(*samplers, false);
std::vector<std::string> sampler_names;
for (const auto & name : *samplers) {
if (name.is_string()) {
sampler_names.emplace_back(name);
}
}
params.sampling.samplers = common_sampler_types_from_names(sampler_names, false);
} else if (samplers->is_string()){
params.sampling.samplers = common_sampler_types_from_chars(samplers->get<std::string>());
std::string sampler_string;
for (const auto & name : *samplers) {
sampler_string += name;
}
params.sampling.samplers = common_sampler_types_from_chars(sampler_string);
}
} else {
params.sampling.samplers = defaults.sampling.samplers;
@@ -575,7 +534,7 @@ struct completion_token_output {
struct server_task_result_cmpl_final : server_task_result {
int index = 0;
std::string content;
std::string content;
llama_tokens tokens;
bool stream;
@@ -597,11 +556,10 @@ struct server_task_result_cmpl_final : server_task_result {
slot_params generation_params;
// OAI-compat fields
bool verbose = false;
oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
std::string oaicompat_model;
std::string oaicompat_cmpl_id;
common_chat_format oaicompat_chat_format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
bool verbose = false;
oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
std::string oaicompat_model;
std::string oaicompat_cmpl_id;
virtual int get_index() override {
return index;
@@ -695,38 +653,18 @@ struct server_task_result_cmpl_final : server_task_result {
json to_json_oaicompat_chat() {
std::string finish_reason = "length";
common_chat_msg message;
if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
message = common_chat_parse(content, oaicompat_chat_format);
finish_reason = message.tool_calls.empty() ? "stop" : "tool_calls";
} else {
message.content = content;
finish_reason = "stop";
}
json tool_calls;
if (!message.tool_calls.empty()) {
tool_calls = json::array();
for (const auto & tc : message.tool_calls) {
tool_calls.push_back({
{"type", "function"},
{"function", {
{"name", tc.name},
{"arguments", tc.arguments},
}},
{"id", tc.id.empty() ? json() : json(tc.id)},
});
}
}
json choice {
json choice = json{
{"finish_reason", finish_reason},
{"index", 0},
{"message", json {
{"content", message.content},
{"tool_calls", tool_calls},
{"role", "assistant"},
}},
};
{"content", content},
{"role", "assistant"}
}
}};
if (!stream && probs_output.size() > 0) {
choice["logprobs"] = json{
@@ -768,7 +706,7 @@ struct server_task_result_cmpl_final : server_task_result {
finish_reason = "stop";
}
json choice = json {
json choice = json{
{"finish_reason", finish_reason},
{"index", 0},
{"delta", json::object()}
@@ -1193,7 +1131,7 @@ struct server_slot {
common_speculative * spec = nullptr;
std::vector<common_adapter_lora_info> lora;
std::vector<common_lora_adapter_info> lora;
// the index relative to completion multi-task request
size_t index = 0;
@@ -1243,8 +1181,6 @@ struct server_slot {
llama_token sampled;
common_chat_format chat_format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
// stats
size_t n_sent_text = 0; // number of sent text characters
@@ -1481,10 +1417,6 @@ struct server_queue {
int post(server_task task, bool front = false) {
std::unique_lock<std::mutex> lock(mutex_tasks);
GGML_ASSERT(task.id != -1);
// if this is a cancel task, make sure to clean up pending tasks
if (task.type == SERVER_TASK_TYPE_CANCEL) {
cleanup_pending_task(task.id_target);
}
QUE_DBG("new task, id = %d, front = %d\n", task.id, front);
if (front) {
queue_tasks.push_front(std::move(task));
@@ -1502,10 +1434,6 @@ struct server_queue {
if (task.id == -1) {
task.id = id++;
}
// if this is a cancel task, make sure to clean up pending tasks
if (task.type == SERVER_TASK_TYPE_CANCEL) {
cleanup_pending_task(task.id_target);
}
QUE_DBG("new task, id = %d/%d, front = %d\n", task.id, (int) tasks.size(), front);
if (front) {
queue_tasks.push_front(std::move(task));
@@ -1606,20 +1534,6 @@ struct server_queue {
}
}
}
private:
void cleanup_pending_task(int id_target) {
// no need to lock because this is called exclusively by post()
auto rm_func = [id_target](const server_task & task) {
return task.id_target == id_target;
};
queue_tasks.erase(
std::remove_if(queue_tasks.begin(), queue_tasks.end(), rm_func),
queue_tasks.end());
queue_tasks_deferred.erase(
std::remove_if(queue_tasks_deferred.begin(), queue_tasks_deferred.end(), rm_func),
queue_tasks_deferred.end());
}
};
struct server_response {
@@ -1655,12 +1569,6 @@ struct server_response {
std::unique_lock<std::mutex> lock(mutex_results);
waiting_task_ids.erase(id_task);
// make sure to clean up all pending results
queue_results.erase(
std::remove_if(queue_results.begin(), queue_results.end(), [id_task](const server_task_result_ptr & res) {
return res->id == id_task;
}),
queue_results.end());
}
void remove_waiting_task_ids(const std::unordered_set<int> & id_tasks) {
@@ -1680,24 +1588,6 @@ struct server_response {
return !queue_results.empty();
});
for (size_t i = 0; i < queue_results.size(); i++) {
if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) {
server_task_result_ptr res = std::move(queue_results[i]);
queue_results.erase(queue_results.begin() + i);
return res;
}
}
}
// should never reach here
}
// same as recv(), but with a timeout in seconds
// if timeout is reached, nullptr is returned
server_task_result_ptr recv_with_timeout(const std::unordered_set<int> & id_tasks, int timeout) {
while (true) {
std::unique_lock<std::mutex> lock(mutex_results);
for (int i = 0; i < (int) queue_results.size(); i++) {
if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) {
server_task_result_ptr res = std::move(queue_results[i]);
@@ -1705,11 +1595,6 @@ struct server_response {
return res;
}
}
std::cv_status cr_res = condition_results.wait_for(lock, std::chrono::seconds(timeout));
if (cr_res == std::cv_status::timeout) {
return nullptr;
}
}
// should never reach here
@@ -1748,8 +1633,6 @@ struct server_context {
llama_model * model = nullptr;
llama_context * ctx = nullptr;
const llama_vocab * vocab = nullptr;
llama_model * model_dft = nullptr;
llama_context_params cparams_dft;
@@ -1774,8 +1657,6 @@ struct server_context {
// Necessary similarity of prompt for slot selection
float slot_prompt_similarity = 0.0f;
common_chat_templates chat_templates;
~server_context() {
// Clear any sampling context
for (server_slot & slot : slots) {
@@ -1809,23 +1690,18 @@ struct server_context {
return false;
}
vocab = llama_model_get_vocab(model);
n_ctx = llama_n_ctx(ctx);
add_bos_token = llama_vocab_get_add_bos(vocab);
has_eos_token = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
add_bos_token = llama_add_bos_token(model);
has_eos_token = llama_token_eos(model) != LLAMA_TOKEN_NULL;
if (!params_base.speculative.model.empty() || !params_base.speculative.hf_repo.empty()) {
if (!params_base.speculative.model.empty()) {
SRV_INF("loading draft model '%s'\n", params_base.speculative.model.c_str());
auto params_dft = params_base;
params_dft.devices = params_base.speculative.devices;
params_dft.hf_file = params_base.speculative.hf_file;
params_dft.hf_repo = params_base.speculative.hf_repo;
params_dft.model = params_base.speculative.model;
params_dft.model_url = params_base.speculative.model_url;
params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? params_base.n_ctx / params_base.n_parallel : params_base.speculative.n_ctx;
params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
params_dft.n_parallel = 1;
@@ -1853,48 +1729,15 @@ struct server_context {
// force F16 KV cache for the draft model for extra performance
cparams_dft.type_k = GGML_TYPE_F16;
cparams_dft.type_v = GGML_TYPE_F16;
// the context is not needed - we will create one for each slot
llama_init_dft.context.reset();
}
if (params_base.chat_template.empty() && !validate_builtin_chat_template(params.use_jinja)) {
LOG_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
chat_templates = common_chat_templates_from_model(model, "chatml");
} else {
chat_templates = common_chat_templates_from_model(model, params_base.chat_template);
}
GGML_ASSERT(chat_templates.template_default.get() != nullptr);
return true;
}
bool validate_builtin_chat_template(bool use_jinja) const {
bool validate_builtin_chat_template() const {
llama_chat_message chat[] = {{"user", "test"}};
if (use_jinja) {
auto templates = common_chat_templates_from_model(model, "");
common_chat_inputs inputs;
inputs.messages = json::array({{
{"role", "user"},
{"content", "test"},
}});
GGML_ASSERT(templates.template_default);
try {
common_chat_params_init(*templates.template_default, inputs);
if (templates.template_tool_use) {
common_chat_params_init(*templates.template_tool_use, inputs);
}
return true;
} catch (const std::exception & e) {
SRV_ERR("failed to apply template: %s\n", e.what());
return false;
}
} else {
const char * tmpl = llama_model_chat_template(model, /* name */ nullptr);
const int32_t chat_res = llama_chat_apply_template(tmpl, chat, 1, true, nullptr, 0);
return chat_res > 0;
}
int32_t chat_res = llama_chat_apply_template(model, nullptr, chat, 1, true, nullptr, 0);
return chat_res > 0;
}
void init() {
@@ -1913,7 +1756,7 @@ struct server_context {
if (model_dft) {
slot.batch_spec = llama_batch_init(params_base.speculative.n_max + 1, 0, 1);
slot.ctx_dft = llama_init_from_model(model_dft, cparams_dft);
slot.ctx_dft = llama_new_context_with_model(model_dft, cparams_dft);
if (slot.ctx_dft == nullptr) {
SRV_ERR("%s", "failed to create draft context\n");
return;
@@ -2048,7 +1891,7 @@ struct server_context {
}
if (slot.params.ignore_eos && has_eos_token) {
slot.params.sampling.logit_bias.push_back({llama_vocab_eos(vocab), -INFINITY});
slot.params.sampling.logit_bias.push_back({llama_token_eos(model), -INFINITY});
}
{
@@ -2204,14 +2047,14 @@ struct server_context {
slot.n_decoded, slot.n_prompt_tokens, slot.n_past, slot.n_ctx);
}
if (llama_vocab_is_eog(vocab, result.tok)) {
if (llama_token_is_eog(model, result.tok)) {
slot.stop = STOP_TYPE_EOS;
slot.has_next_token = false;
SLT_DBG(slot, "%s", "stopped by EOS\n");
}
const auto n_ctx_train = llama_model_n_ctx_train(model);
const auto n_ctx_train = llama_n_ctx_train(model);
if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) {
slot.truncated = true;
@@ -2231,7 +2074,7 @@ struct server_context {
void populate_token_probs(const server_slot & slot, completion_token_output & result, bool post_sampling, bool special, int idx) {
size_t n_probs = slot.params.sampling.n_probs;
size_t n_vocab = llama_vocab_n_tokens(vocab);
size_t n_vocab = llama_n_vocab(llama_get_model(ctx));
if (post_sampling) {
const auto * cur_p = common_sampler_get_candidates(slot.smpl);
const size_t max_probs = cur_p->size;
@@ -2333,11 +2176,11 @@ struct server_context {
res->id_slot = slot.id;
res->index = slot.index;
res->content = std::move(slot.generated_text);
res->tokens = std::move(slot.generated_tokens);
res->content = slot.generated_text;
res->tokens = slot.generated_tokens;
res->timings = slot.get_timings();
res->prompt = common_detokenize(ctx, slot.prompt_tokens, true);
res->response_fields = std::move(slot.params.response_fields);
res->response_fields = slot.params.response_fields;
res->truncated = slot.truncated;
res->n_decoded = slot.n_decoded;
@@ -2348,12 +2191,12 @@ struct server_context {
res->stop = slot.stop;
res->post_sampling_probs = slot.params.post_sampling_probs;
res->verbose = slot.params.verbose;
res->stream = slot.params.stream;
res->oaicompat = slot.params.oaicompat;
res->oaicompat_model = slot.params.oaicompat_model;
res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id;
res->oaicompat_chat_format = slot.params.oaicompat_chat_format;
res->verbose = slot.params.verbose;
res->stream = slot.params.stream;
res->oaicompat = slot.params.oaicompat;
res->oaicompat_model = slot.params.oaicompat_model;
res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id;
// populate res.probs_output
if (slot.params.sampling.n_probs > 0) {
if (!slot.params.stream && slot.stop == STOP_TYPE_WORD) {
@@ -2382,7 +2225,7 @@ struct server_context {
res->n_tokens = slot.n_prompt_tokens;
res->oaicompat = slot.params.oaicompat;
const int n_embd = llama_model_n_embd(model);
const int n_embd = llama_n_embd(model);
std::vector<float> embd_res(n_embd, 0.0f);
@@ -2461,8 +2304,8 @@ struct server_context {
server_task task(SERVER_TASK_TYPE_CANCEL);
task.id_target = id_task;
queue_results.remove_waiting_task_id(id_task);
cancel_tasks.push_back(task);
queue_results.remove_waiting_task_id(id_task);
}
// push to the beginning of the queue, so it has the highest priority
queue_tasks.post(cancel_tasks, true);
@@ -2472,21 +2315,10 @@ struct server_context {
void receive_multi_results(
const std::unordered_set<int> & id_tasks,
const std::function<void(std::vector<server_task_result_ptr>&)> & result_handler,
const std::function<void(json)> & error_handler,
const std::function<bool()> & is_connection_closed) {
const std::function<void(json)> & error_handler) {
std::vector<server_task_result_ptr> results(id_tasks.size());
for (int i = 0; i < (int)id_tasks.size(); i++) {
server_task_result_ptr result = queue_results.recv_with_timeout(id_tasks, HTTP_POLLING_SECONDS);
if (is_connection_closed()) {
cancel_tasks(id_tasks);
return;
}
if (result == nullptr) {
i--; // retry
continue;
}
for (size_t i = 0; i < id_tasks.size(); i++) {
server_task_result_ptr result = queue_results.recv(id_tasks);
if (result->is_error()) {
error_handler(result->to_json());
@@ -2510,20 +2342,10 @@ struct server_context {
void receive_cmpl_results_stream(
const std::unordered_set<int> & id_tasks,
const std::function<bool(server_task_result_ptr&)> & result_handler,
const std::function<void(json)> & error_handler,
const std::function<bool()> & is_connection_closed) {
const std::function<void(json)> & error_handler) {
size_t n_finished = 0;
while (true) {
server_task_result_ptr result = queue_results.recv_with_timeout(id_tasks, HTTP_POLLING_SECONDS);
if (is_connection_closed()) {
cancel_tasks(id_tasks);
return;
}
if (result == nullptr) {
continue; // retry
}
server_task_result_ptr result = queue_results.recv(id_tasks);
if (result->is_error()) {
error_handler(result->to_json());
@@ -2831,11 +2653,6 @@ struct server_context {
// track if given slot can be batched with slots already in the batch
server_slot * slot_batched = nullptr;
auto accept_special_token = [&](server_slot & slot, llama_token token) {
const auto & trigger_tokens = slot.params.sampling.grammar_trigger_tokens;
return params_base.special || std::find(trigger_tokens.begin(), trigger_tokens.end(), token) != trigger_tokens.end();
};
// first, add sampled tokens from any ongoing sequences
for (auto & slot : slots) {
if (slot.state != SLOT_STATE_GENERATING) {
@@ -3110,7 +2927,7 @@ struct server_context {
// make sure we're in the right embedding mode
llama_set_embeddings(ctx, slot_batched->is_non_causal());
// apply lora, only need to do it once per batch
common_set_adapter_lora(ctx, slot_batched->lora);
common_lora_adapters_apply(ctx, slot_batched->lora);
}
// process the created batch of tokens
@@ -3199,7 +3016,7 @@ struct server_context {
completion_token_output result;
result.tok = id;
result.text_to_send = common_token_to_piece(ctx, result.tok, accept_special_token(slot, result.tok));
result.text_to_send = common_token_to_piece(ctx, result.tok, params_base.special);
result.prob = 1.0f; // TODO: set it here instead of doing inside populate_token_probs
if (slot.params.sampling.n_probs > 0) {
@@ -3288,7 +3105,7 @@ struct server_context {
completion_token_output result;
result.tok = ids[i];
result.text_to_send = common_token_to_piece(ctx, result.tok, accept_special_token(slot, result.tok));
result.text_to_send = common_token_to_piece(ctx, result.tok, params_base.special);
result.prob = 1.0f; // set later
// TODO: set result.probs
@@ -3312,12 +3129,12 @@ struct server_context {
json model_meta() const {
return json {
{"vocab_type", llama_vocab_type (vocab)},
{"n_vocab", llama_vocab_n_tokens (vocab)},
{"n_ctx_train", llama_model_n_ctx_train(model)},
{"n_embd", llama_model_n_embd (model)},
{"n_params", llama_model_n_params (model)},
{"size", llama_model_size (model)},
{"vocab_type", llama_vocab_type (model)},
{"n_vocab", llama_n_vocab (model)},
{"n_ctx_train", llama_n_ctx_train (model)},
{"n_embd", llama_n_embd (model)},
{"n_params", llama_model_n_params(model)},
{"size", llama_model_size (model)},
};
}
};
@@ -3638,11 +3455,11 @@ int main(int argc, char ** argv) {
{"value", (uint64_t) res_metrics->kv_cache_tokens_count}
},{
{"name", "requests_processing"},
{"help", "Number of requests processing."},
{"help", "Number of request processing."},
{"value", (uint64_t) res_metrics->n_processing_slots}
},{
{"name", "requests_deferred"},
{"help", "Number of requests deferred."},
{"help", "Number of request deferred."},
{"value", (uint64_t) res_metrics->n_tasks_deferred}
}}}
};
@@ -3784,14 +3601,9 @@ int main(int argc, char ** argv) {
{ "default_generation_settings", ctx_server.default_generation_settings_for_props },
{ "total_slots", ctx_server.params_base.n_parallel },
{ "model_path", ctx_server.params_base.model },
{ "chat_template", ctx_server.chat_templates.template_default->source() },
{ "bos_token", ctx_server.chat_templates.template_default->bos_token() },
{ "eos_token", ctx_server.chat_templates.template_default->eos_token() },
{ "chat_template", common_get_builtin_chat_template(ctx_server.model) },
{ "build_info", build_info },
};
if (ctx_server.params_base.use_jinja && ctx_server.chat_templates.template_tool_use) {
data["chat_template_tool_use"] = ctx_server.chat_templates.template_tool_use->source();
}
res_ok(res, data);
};
@@ -3814,7 +3626,6 @@ int main(int argc, char ** argv) {
const auto handle_completions_impl = [&ctx_server, &res_error, &res_ok](
server_task_type type,
json & data,
std::function<bool()> is_connection_closed,
httplib::Response & res,
oaicompat_type oaicompat) {
GGML_ASSERT(type == SERVER_TASK_TYPE_COMPLETION || type == SERVER_TASK_TYPE_INFILL);
@@ -3828,9 +3639,7 @@ int main(int argc, char ** argv) {
std::vector<server_task> tasks;
try {
const auto & prompt = data.at("prompt");
LOG_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get<std::string>().c_str() : prompt.dump(2).c_str());
std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, data.at("prompt"), true, true);
tasks.reserve(tokenized_prompts.size());
for (size_t i = 0; i < tokenized_prompts.size(); i++) {
server_task task = server_task(type);
@@ -3840,14 +3649,15 @@ int main(int argc, char ** argv) {
task.prompt_tokens = std::move(tokenized_prompts[i]);
task.params = server_task::params_from_json_cmpl(
ctx_server.model,
ctx_server.ctx,
ctx_server.params_base,
data);
task.id_selected_slot = json_value(data, "id_slot", -1);
// OAI-compat
task.params.oaicompat = oaicompat;
task.params.oaicompat_cmpl_id = completion_id;
task.params.oaicompat = oaicompat;
task.params.oaicompat_cmpl_id = completion_id;
// oaicompat_model is already populated by params_from_json_cmpl
tasks.push_back(task);
@@ -3878,7 +3688,7 @@ int main(int argc, char ** argv) {
}
}, [&](const json & error_data) {
res_error(res, error_data);
}, is_connection_closed);
});
ctx_server.queue_results.remove_waiting_task_ids(task_ids);
} else {
@@ -3888,7 +3698,6 @@ int main(int argc, char ** argv) {
if (res_json.is_array()) {
for (const auto & res : res_json) {
if (!server_sent_event(sink, "data", res)) {
// sending failed (HTTP connection closed), cancel the generation
return false;
}
}
@@ -3898,9 +3707,6 @@ int main(int argc, char ** argv) {
}
}, [&](const json & error_data) {
server_sent_event(sink, "error", error_data);
}, [&sink]() {
// note: do not use req.is_connection_closed here because req is already destroyed
return !sink.is_writable();
});
if (oaicompat != OAICOMPAT_TYPE_NONE) {
static const std::string ev_done = "data: [DONE]\n\n";
@@ -3923,7 +3729,6 @@ int main(int argc, char ** argv) {
return handle_completions_impl(
SERVER_TASK_TYPE_COMPLETION,
data,
req.is_connection_closed,
res,
OAICOMPAT_TYPE_NONE);
};
@@ -3933,7 +3738,6 @@ int main(int argc, char ** argv) {
return handle_completions_impl(
SERVER_TASK_TYPE_COMPLETION,
data,
req.is_connection_closed,
res,
OAICOMPAT_TYPE_COMPLETION);
};
@@ -3941,13 +3745,13 @@ int main(int argc, char ** argv) {
const auto handle_infill = [&ctx_server, &res_error, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
// check model compatibility
std::string err;
if (llama_vocab_fim_pre(ctx_server.vocab) == LLAMA_TOKEN_NULL) {
if (llama_token_fim_pre(ctx_server.model) == LLAMA_TOKEN_NULL) {
err += "prefix token is missing. ";
}
if (llama_vocab_fim_suf(ctx_server.vocab) == LLAMA_TOKEN_NULL) {
if (llama_token_fim_suf(ctx_server.model) == LLAMA_TOKEN_NULL) {
err += "suffix token is missing. ";
}
if (llama_vocab_fim_mid(ctx_server.vocab) == LLAMA_TOKEN_NULL) {
if (llama_token_fim_mid(ctx_server.model) == LLAMA_TOKEN_NULL) {
err += "middle token is missing. ";
}
if (!err.empty()) {
@@ -3993,10 +3797,10 @@ int main(int argc, char ** argv) {
data["input_extra"] = input_extra; // default to empty array if it's not exist
std::string prompt = json_value(data, "prompt", std::string());
std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, false, true);
std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, false, true);
SRV_DBG("creating infill tasks, n_prompts = %d\n", (int) tokenized_prompts.size());
data["prompt"] = format_infill(
ctx_server.vocab,
ctx_server.ctx,
data.at("input_prefix"),
data.at("input_suffix"),
data.at("input_extra"),
@@ -4010,36 +3814,24 @@ int main(int argc, char ** argv) {
return handle_completions_impl(
SERVER_TASK_TYPE_INFILL,
data,
req.is_connection_closed,
res,
OAICOMPAT_TYPE_NONE); // infill is not OAI compatible
};
const auto handle_chat_completions = [&ctx_server, &params, &res_error, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
LOG_DBG("request: %s\n", req.body.c_str());
if (ctx_server.params_base.embedding) {
res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
return;
}
auto body = json::parse(req.body);
json data = oaicompat_completion_params_parse(body, params.use_jinja, ctx_server.chat_templates);
json data = oaicompat_chat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template);
return handle_completions_impl(
SERVER_TASK_TYPE_COMPLETION,
data,
req.is_connection_closed,
res,
OAICOMPAT_TYPE_CHAT);
};
// same with handle_chat_completions, but without inference part
const auto handle_apply_template = [&ctx_server, &params, &res_ok](const httplib::Request & req, httplib::Response & res) {
auto body = json::parse(req.body);
json data = oaicompat_completion_params_parse(body, params.use_jinja, ctx_server.chat_templates);
res_ok(res, {{ "prompt", std::move(data.at("prompt")) }});
};
const auto handle_models = [&params, &ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) {
json models = {
{"object", "list"},
@@ -4065,7 +3857,7 @@ int main(int argc, char ** argv) {
const bool add_special = json_value(body, "add_special", false);
const bool with_pieces = json_value(body, "with_pieces", false);
llama_tokens tokens = tokenize_mixed(ctx_server.vocab, body.at("content"), add_special, true);
llama_tokens tokens = tokenize_mixed(ctx_server.ctx, body.at("content"), add_special, true);
if (with_pieces) {
for (const auto& token : tokens) {
@@ -4141,7 +3933,7 @@ int main(int argc, char ** argv) {
}
}
std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, true, true);
for (const auto & tokens : tokenized_prompts) {
// this check is necessary for models that do not add a BOS token to the input
if (tokens.empty()) {
@@ -4182,7 +3974,7 @@ int main(int argc, char ** argv) {
}, [&](const json & error_data) {
res_error(res, error_data);
error = true;
}, req.is_connection_closed);
});
ctx_server.queue_results.remove_waiting_task_ids(task_ids);
}
@@ -4241,20 +4033,20 @@ int main(int argc, char ** argv) {
return;
}
llama_tokens tokenized_query = tokenize_input_prompts(ctx_server.vocab, query, /* add_special */ false, true)[0];
llama_tokens tokenized_query = tokenize_input_prompts(ctx_server.ctx, query, /* add_special */ false, true)[0];
// create and queue the task
json responses = json::array();
bool error = false;
{
std::vector<server_task> tasks;
std::vector<llama_tokens> tokenized_docs = tokenize_input_prompts(ctx_server.vocab, documents, /* add_special */ false, true);
std::vector<llama_tokens> tokenized_docs = tokenize_input_prompts(ctx_server.ctx, documents, /* add_special */ false, true);
tasks.reserve(tokenized_docs.size());
for (size_t i = 0; i < tokenized_docs.size(); i++) {
server_task task = server_task(SERVER_TASK_TYPE_RERANK);
task.id = ctx_server.queue_tasks.get_new_id();
task.index = i;
task.prompt_tokens = format_rerank(ctx_server.vocab, tokenized_query, tokenized_docs[i]);
task.prompt_tokens = format_rerank(ctx_server.model, tokenized_query, tokenized_docs[i]);
tasks.push_back(task);
}
@@ -4272,7 +4064,7 @@ int main(int argc, char ** argv) {
}, [&](const json & error_data) {
res_error(res, error_data);
error = true;
}, req.is_connection_closed);
});
}
if (error) {
@@ -4374,7 +4166,6 @@ int main(int argc, char ** argv) {
svr->Post("/v1/reranking", handle_rerank);
svr->Post("/tokenize", handle_tokenize);
svr->Post("/detokenize", handle_detokenize);
svr->Post("/apply-template", handle_apply_template);
// LoRA adapters hotswap
svr->Get ("/lora-adapters", handle_lora_adapters_list);
svr->Post("/lora-adapters", handle_lora_adapters_apply);
@@ -4440,18 +4231,24 @@ int main(int argc, char ** argv) {
LOG_INF("%s: model loaded\n", __func__);
// if a custom chat template is not supplied, we will use the one that comes with the model (if any)
if (params.chat_template.empty()) {
if (!ctx_server.validate_builtin_chat_template()) {
LOG_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
params.chat_template = "chatml";
}
}
// print sample chat example to make it clear which template is used
LOG_INF("%s: chat template, chat_template: %s, example_format: '%s'\n", __func__,
ctx_server.chat_templates.template_default->source().c_str(),
common_chat_format_example(*ctx_server.chat_templates.template_default, ctx_server.params_base.use_jinja).c_str());
params.chat_template.empty() ? "(built-in)" : params.chat_template.c_str(),
common_chat_format_example(ctx_server.model, params.chat_template).c_str());
ctx_server.queue_tasks.on_new_task([&ctx_server](const server_task & task) {
ctx_server.process_single_task(task);
});
ctx_server.queue_tasks.on_new_task(std::bind(
&server_context::process_single_task, &ctx_server, std::placeholders::_1));
ctx_server.queue_tasks.on_update_slots([&ctx_server]() {
ctx_server.update_slots();
});
ctx_server.queue_tasks.on_update_slots(std::bind(
&server_context::update_slots, &ctx_server));
shutdown_handler = [&](int) {
ctx_server.queue_tasks.terminate();

View File

@@ -31,9 +31,8 @@ It's possible to override some scenario steps values with environment variables:
| `LLAMA_SERVER_BIN_PATH` | to change the server binary path, default: `../../../build/bin/llama-server` |
| `DEBUG` | to enable steps and server verbose mode `--verbose` |
| `N_GPU_LAYERS` | number of model layers to offload to VRAM `-ngl --n-gpu-layers` |
| `LLAMA_CACHE` | by default server tests re-download models to the `tmp` subfolder. Set this to your cache (e.g. `$HOME/Library/Caches/llama.cpp` on Mac or `$HOME/.cache/llama.cpp` on Unix) to avoid this |
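For example, a run that combines several of these overrides might look like the following (the binary path and cache location are illustrative values, not requirements):
```shell
LLAMA_SERVER_BIN_PATH=../../../build/bin/llama-server \
LLAMA_CACHE=$HOME/.cache/llama.cpp \
DEBUG=1 ./tests.sh -s -v -x
```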
To run slow tests (will download many models, make sure to set `LLAMA_CACHE` if needed):
To run slow tests:
```shell
SLOW_TESTS=1 ./tests.sh
@@ -45,16 +44,10 @@ To run with stdout/stderr display in real time (verbose output, but useful for d
DEBUG=1 ./tests.sh -s -v -x
```
To run all the tests in a file:
To run a single test unit:
```shell
./tests.sh unit/test_chat_completion.py -v -x
```
To run a single test:
```shell
./tests.sh unit/test_chat_completion.py::test_invalid_chat_completion_req
./tests.sh unit/test_{name of test case here}.py -v -x
```
Hint: You can compile and run the tests in a single command, which is useful for local development:
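A plausible one-liner (the build directory, target name, and chosen test file here are assumptions about a typical local setup, not prescribed values):
```shell
# rebuild the server, then run one test file against it
cmake --build ../../../build --target llama-server -j && ./tests.sh unit/test_chat_completion.py -v -x
```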

View File

@@ -1,4 +0,0 @@
[pytest]
markers =
slow: marks tests as slow (deselect with '-m "not slow"')
serial

View File

@@ -6,18 +6,9 @@ cd $SCRIPT_DIR
set -eu
if [[ "${SLOW_TESTS:-0}" == 1 ]]; then
# Slow tests for tool calls need quite a few models ahead of time to avoid timing out.
python $SCRIPT_DIR/../../../scripts/fetch_server_test_models.py
fi
if [ $# -lt 1 ]
then
if [[ "${SLOW_TESTS:-0}" == 1 ]]; then
pytest -v -x
else
pytest -v -x -m "not slow"
fi
pytest -v -x
else
pytest "$@"
fi

View File

@@ -2,28 +2,24 @@ import pytest
from openai import OpenAI
from utils import *
server: ServerProcess
server = ServerPreset.tinyllama2()
@pytest.fixture(autouse=True)
@pytest.fixture(scope="module", autouse=True)
def create_server():
global server
server = ServerPreset.tinyllama2()
@pytest.mark.parametrize(
"model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,finish_reason,jinja,chat_template",
"model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,finish_reason",
[
(None, "Book", "What is the best book", 8, "(Suddenly)+|\\{ \" Sarax.", 77, 8, "length", False, None),
(None, "Book", "What is the best book", 8, "(Suddenly)+|\\{ \" Sarax.", 77, 8, "length", True, None),
(None, "Book", "What is the best book", 8, "^ blue", 23, 8, "length", True, "This is not a chat template, it is"),
("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length", False, None),
("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length", True, None),
(None, "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, "length"),
("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length"),
]
)
def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, finish_reason, jinja, chat_template):
def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, finish_reason):
global server
server.jinja = jinja
server.chat_template = chat_template
server.start()
res = server.make_request("POST", "/chat/completions", data={
"model": model,
@@ -121,21 +117,6 @@ def test_chat_template():
assert res.body["__verbose"]["prompt"] == "<s> <|start_header_id|>system<|end_header_id|>\n\nBook<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat is the best book<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
def test_apply_chat_template():
global server
server.chat_template = "command-r"
server.start()
res = server.make_request("POST", "/apply-template", data={
"messages": [
{"role": "system", "content": "You are a test."},
{"role": "user", "content":"Hi there"},
]
})
assert res.status_code == 200
assert "prompt" in res.body
assert res.body["prompt"] == "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>You are a test.<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hi there<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
@pytest.mark.parametrize("response_format,n_predicted,re_content", [
({"type": "json_object", "schema": {"const": "42"}}, 6, "\"42\""),
({"type": "json_object", "schema": {"items": [{"type": "integer"}]}}, 10, "[ -3000 ]"),

View File

@@ -1,5 +1,4 @@
import pytest
import requests
import time
from openai import OpenAI
from utils import *
@@ -87,7 +86,7 @@ def test_completion_stream_vs_non_stream():
assert content_stream == res_non_stream.body["content"]
def test_completion_with_openai_library():
def test_completion_stream_with_openai_library():
global server
server.start()
client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
@@ -102,7 +101,7 @@ def test_completion_with_openai_library():
assert match_regex("(going|bed)+", res.choices[0].text)
def test_completion_stream_with_openai_library():
def test_completion_with_openai_library():
global server
server.start()
client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
@@ -406,23 +405,3 @@ def test_n_probs_post_sampling():
assert "bytes" in prob and type(prob["bytes"]) == list
# because the test model usually outputs tokens with either 100% or 0% probability, we need to check all the top_probs
assert any(prob["prob"] == 1.0 for prob in tok["top_probs"])
def test_cancel_request():
global server
server.n_ctx = 4096
server.n_predict = -1
server.n_slots = 1
server.server_slots = True
server.start()
# send a request that will take a long time, but cancel it before it finishes
try:
server.make_request("POST", "/completion", data={
"prompt": "I believe the meaning of life is",
}, timeout=0.1)
except requests.exceptions.ReadTimeout:
pass # expected
# make sure the slot is free
time.sleep(1) # wait for HTTP_POLLING_SECONDS
res = server.make_request("GET", "/slots")
assert res.body[0]["is_processing"] == False

View File

@@ -1,352 +0,0 @@
import pytest
from utils import *
server: ServerProcess
TIMEOUT_SERVER_START = 15*60
TIMEOUT_HTTP_REQUEST = 60
@pytest.fixture(autouse=True)
def create_server():
global server
server = ServerPreset.tinyllama2()
server.model_alias = "tinyllama-2-tool-call"
server.server_port = 8081
TEST_TOOL = {
"type":"function",
"function": {
"name": "test",
"description": "",
"parameters": {
"type": "object",
"properties": {
"success": {"type": "boolean", "const": True},
},
"required": ["success"]
}
}
}
PYTHON_TOOL = {
"type": "function",
"function": {
"name": "python",
"description": "Runs code in an ipython interpreter and returns the result of the execution after 60 seconds.",
"parameters": {
"type": "object",
"properties": {
"code": {
"type": "string",
"description": "The code to run in the ipython interpreter."
}
},
"required": ["code"]
}
}
}
WEATHER_TOOL = {
"type":"function",
"function":{
"name":"get_current_weather",
"description":"Get the current weather in a given location",
"parameters":{
"type":"object",
"properties":{
"location":{
"type":"string",
"description":"The city and country/state, e.g. 'San Francisco, CA', or 'Paris, France'"
}
},
"required":["location"]
}
}
}
def do_test_completion_with_required_tool_tiny(template_name: str, tool: dict, argument_key: str | None):
n_predict = 512
global server
# server = ServerPreset.stories15m_moe()
server.jinja = True
server.n_predict = n_predict
server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
server.start(timeout_seconds=TIMEOUT_SERVER_START)
res = server.make_request("POST", "/chat/completions", data={
"max_tokens": n_predict,
"messages": [
{"role": "system", "content": "You are a coding assistant."},
{"role": "user", "content": "Write an example"},
],
"tool_choice": "required",
"tools": [tool],
"parallel_tool_calls": False,
"temperature": 0.0,
"top_k": 1,
"top_p": 1.0,
})
assert res.status_code == 200, f"Expected status code 200, got {res.status_code}"
choice = res.body["choices"][0]
tool_calls = choice["message"].get("tool_calls")
assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}'
tool_call = tool_calls[0]
expected_function_name = "python" if tool["type"] == "code_interpreter" else tool["function"]["name"]
assert expected_function_name == tool_call["function"]["name"]
actual_arguments = tool_call["function"]["arguments"]
assert isinstance(actual_arguments, str)
if argument_key is not None:
actual_arguments = json.loads(actual_arguments)
assert argument_key in actual_arguments, f"tool arguments: {json.dumps(actual_arguments)}, expected: {argument_key}"
@pytest.mark.parametrize("template_name,tool,argument_key", [
("google-gemma-2-2b-it", TEST_TOOL, "success"),
("meta-llama-Llama-3.3-70B-Instruct", TEST_TOOL, "success"),
("meta-llama-Llama-3.3-70B-Instruct", PYTHON_TOOL, "code"),
])
def test_completion_with_required_tool_tiny_fast(template_name: str, tool: dict, argument_key: str | None):
do_test_completion_with_required_tool_tiny(template_name, tool, argument_key)
@pytest.mark.slow
@pytest.mark.parametrize("template_name,tool,argument_key", [
("meta-llama-Llama-3.1-8B-Instruct", TEST_TOOL, "success"),
("meta-llama-Llama-3.1-8B-Instruct", PYTHON_TOOL, "code"),
("meetkai-functionary-medium-v3.1", TEST_TOOL, "success"),
("meetkai-functionary-medium-v3.1", PYTHON_TOOL, "code"),
("meetkai-functionary-medium-v3.2", TEST_TOOL, "success"),
("meetkai-functionary-medium-v3.2", PYTHON_TOOL, "code"),
("NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use", TEST_TOOL, "success"),
("NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use", PYTHON_TOOL, "code"),
("meta-llama-Llama-3.2-3B-Instruct", TEST_TOOL, "success"),
("meta-llama-Llama-3.2-3B-Instruct", PYTHON_TOOL, "code"),
("mistralai-Mistral-Nemo-Instruct-2407", TEST_TOOL, "success"),
("mistralai-Mistral-Nemo-Instruct-2407", PYTHON_TOOL, "code"),
("NousResearch-Hermes-3-Llama-3.1-8B-tool_use", TEST_TOOL, "success"),
("NousResearch-Hermes-3-Llama-3.1-8B-tool_use", PYTHON_TOOL, "code"),
("deepseek-ai-DeepSeek-R1-Distill-Llama-8B", TEST_TOOL, "success"),
("deepseek-ai-DeepSeek-R1-Distill-Llama-8B", PYTHON_TOOL, "code"),
("fireworks-ai-llama-3-firefunction-v2", TEST_TOOL, "success"),
("fireworks-ai-llama-3-firefunction-v2", PYTHON_TOOL, "code"),
])
def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict, argument_key: str | None):
do_test_completion_with_required_tool_tiny(template_name, tool, argument_key)
@pytest.mark.slow
@pytest.mark.parametrize("tool,argument_key,hf_repo,template_override", [
(TEST_TOOL, "success", "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
(PYTHON_TOOL, "code", "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
(TEST_TOOL, "success", "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None),
(PYTHON_TOOL, "code", "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None),
(TEST_TOOL, "success", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
(PYTHON_TOOL, "code", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
(TEST_TOOL, "success", "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None),
(PYTHON_TOOL, "code", "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None),
(TEST_TOOL, "success", "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
(PYTHON_TOOL, "code", "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
(TEST_TOOL, "success", "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
(PYTHON_TOOL, "code", "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
(TEST_TOOL, "success", "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
(PYTHON_TOOL, "code", "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
(TEST_TOOL, "success", "bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai/functionary-medium-v3.2", None)),
(PYTHON_TOOL, "code", "bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai/functionary-medium-v3.2", None)),
(TEST_TOOL, "success", "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
(PYTHON_TOOL, "code", "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
(TEST_TOOL, "success", "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
(PYTHON_TOOL, "code", "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
# TODO: fix these
# (TEST_TOOL, "success", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
# (PYTHON_TOOL, "code", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
])
def test_completion_with_required_tool_real_model(tool: dict, argument_key: str | None, hf_repo: str, template_override: Tuple[str, str | None] | None):
n_predict = 512
server.n_slots = 1
server.jinja = True
server.n_ctx = 8192
server.n_predict = n_predict
server.model_hf_repo = hf_repo
server.model_hf_file = None
if template_override:
(template_hf_repo, template_variant) = template_override
server.chat_template_file = f"../../../models/templates/{template_hf_repo.replace('/', '-') + ('-' + template_variant if template_variant else '')}.jinja"
assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template."
server.start(timeout_seconds=TIMEOUT_SERVER_START)
res = server.make_request("POST", "/chat/completions", data={
"max_tokens": n_predict,
"messages": [
{"role": "system", "content": "You are a coding assistant."},
{"role": "user", "content": "Write an example"},
],
"tool_choice": "required",
"tools": [tool],
"parallel_tool_calls": False,
"temperature": 0.0,
"top_k": 1,
"top_p": 1.0,
}, timeout=TIMEOUT_HTTP_REQUEST)
assert res.status_code == 200, f"Expected status code 200, got {res.status_code}"
choice = res.body["choices"][0]
tool_calls = choice["message"].get("tool_calls")
assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}'
tool_call = tool_calls[0]
expected_function_name = "python" if tool["type"] == "code_interpreter" else tool["function"]["name"]
assert expected_function_name == tool_call["function"]["name"]
actual_arguments = tool_call["function"]["arguments"]
assert isinstance(actual_arguments, str)
if argument_key is not None:
actual_arguments = json.loads(actual_arguments)
assert argument_key in actual_arguments, f"tool arguments: {json.dumps(actual_arguments)}, expected: {argument_key}"
def do_test_completion_without_tool_call(template_name: str, n_predict: int, tools: list[dict], tool_choice: str | None):
global server
server.jinja = True
server.n_predict = n_predict
server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
server.start(timeout_seconds=TIMEOUT_SERVER_START)
res = server.make_request("POST", "/chat/completions", data={
"max_tokens": n_predict,
"messages": [
{"role": "system", "content": "You are a coding assistant."},
{"role": "user", "content": "say hello world with python"},
],
"tools": tools if tools else None,
"tool_choice": tool_choice,
"temperature": 0.0,
"top_k": 1,
"top_p": 1.0,
}, timeout=TIMEOUT_HTTP_REQUEST)
assert res.status_code == 200, f"Expected status code 200, got {res.status_code}"
choice = res.body["choices"][0]
assert choice["message"].get("tool_calls") is None, f'Expected no tool call in {choice["message"]}'
@pytest.mark.parametrize("template_name,n_predict,tools,tool_choice", [
("meta-llama-Llama-3.3-70B-Instruct", 128, [], None),
("meta-llama-Llama-3.3-70B-Instruct", 128, [TEST_TOOL], None),
("meta-llama-Llama-3.3-70B-Instruct", 128, [PYTHON_TOOL], 'none'),
])
def test_completion_without_tool_call_fast(template_name: str, n_predict: int, tools: list[dict], tool_choice: str | None):
do_test_completion_without_tool_call(template_name, n_predict, tools, tool_choice)
@pytest.mark.slow
@pytest.mark.parametrize("template_name,n_predict,tools,tool_choice", [
("meetkai-functionary-medium-v3.2", 256, [], None),
("meetkai-functionary-medium-v3.2", 256, [TEST_TOOL], None),
("meetkai-functionary-medium-v3.2", 256, [PYTHON_TOOL], 'none'),
("meetkai-functionary-medium-v3.1", 256, [], None),
("meetkai-functionary-medium-v3.1", 256, [TEST_TOOL], None),
("meetkai-functionary-medium-v3.1", 256, [PYTHON_TOOL], 'none'),
("meta-llama-Llama-3.2-3B-Instruct", 256, [], None),
("meta-llama-Llama-3.2-3B-Instruct", 256, [TEST_TOOL], None),
("meta-llama-Llama-3.2-3B-Instruct", 256, [PYTHON_TOOL], 'none'),
])
def test_completion_without_tool_call_slow(template_name: str, n_predict: int, tools: list[dict], tool_choice: str | None):
do_test_completion_without_tool_call(template_name, n_predict, tools, tool_choice)
@pytest.mark.slow
@pytest.mark.parametrize("hf_repo,template_override", [
("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
("bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None),
("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None),
("bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
("bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
("bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
("bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai/functionary-medium-v3.2", None)),
("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
# ("bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
# ("bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
])
def test_weather_tool_call(hf_repo: str, template_override: Tuple[str, str | None] | None):
global server
server.n_slots = 1
server.jinja = True
server.n_ctx = 8192
server.n_predict = 512
server.model_hf_repo = hf_repo
server.model_hf_file = None
if template_override:
(template_hf_repo, template_variant) = template_override
server.chat_template_file = f"../../../models/templates/{template_hf_repo.replace('/', '-') + ('-' + template_variant if template_variant else '')}.jinja"
assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template."
server.start(timeout_seconds=TIMEOUT_SERVER_START)
res = server.make_request("POST", "/chat/completions", data={
"max_tokens": 256,
"messages": [
{"role": "user", "content": "What is the weather in Istanbul?"},
],
"tools": [WEATHER_TOOL],
}, timeout=TIMEOUT_HTTP_REQUEST)
assert res.status_code == 200, f"Expected status code 200, got {res.status_code}"
choice = res.body["choices"][0]
tool_calls = choice["message"].get("tool_calls")
assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}'
tool_call = tool_calls[0]
assert tool_call["function"]["name"] == WEATHER_TOOL["function"]["name"]
actual_arguments = json.loads(tool_call["function"]["arguments"])
assert 'location' in actual_arguments, f"location not found in {json.dumps(actual_arguments)}"
location = actual_arguments["location"]
assert isinstance(location, str), f"Expected location to be a string, got {type(location)}: {json.dumps(location)}"
assert re.match('^Istanbul(, (TR|Turkey|Türkiye))?$', location), f'Expected Istanbul for location, got {location}'
@pytest.mark.slow
@pytest.mark.parametrize("expected_arguments_override,hf_repo,template_override", [
(None, "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None),
(None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
(None, "bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai-functionary-medium-v3.2", None)),
('{"code":"print("}', "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
(None, "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama-Llama-3.2-3B-Instruct", None)),
('{"code":"print("}', "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama-Llama-3.2-3B-Instruct", None)),
(None, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None),
(None, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
(None, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch-Hermes-3-Llama-3.1-8B", "tool_use")),
(None, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
# (None, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
])
def test_hello_world_tool_call(expected_arguments_override: str | None, hf_repo: str, template_override: Tuple[str, str | None] | None):
global server
server.n_slots = 1
server.jinja = True
server.n_ctx = 8192
server.n_predict = 128
server.model_hf_repo = hf_repo
server.model_hf_file = None
if template_override:
(template_hf_repo, template_variant) = template_override
server.chat_template_file = f"../../../models/templates/{template_hf_repo.replace('/', '-') + ('-' + template_variant if template_variant else '')}.jinja"
assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template."
server.start(timeout_seconds=TIMEOUT_SERVER_START)
res = server.make_request("POST", "/chat/completions", data={
"max_tokens": 256,
"messages": [
{"role": "system", "content": "You are a coding assistant."},
{"role": "user", "content": "say hello world with python"},
],
"tools": [PYTHON_TOOL],
# Note: without these greedy params, Functionary v3.2 writes `def hello_world():\n print("Hello, World!")\nhello_world()` which is correct but a pain to test.
"temperature": 0.0,
"top_k": 1,
"top_p": 1.0,
}, timeout=TIMEOUT_HTTP_REQUEST)
assert res.status_code == 200, f"Expected status code 200, got {res.status_code}"
choice = res.body["choices"][0]
tool_calls = choice["message"].get("tool_calls")
assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}'
tool_call = tool_calls[0]
assert tool_call["function"]["name"] == PYTHON_TOOL["function"]["name"]
actual_arguments = tool_call["function"]["arguments"]
if expected_arguments_override is not None:
assert actual_arguments == expected_arguments_override
else:
actual_arguments = json.loads(actual_arguments)
assert 'code' in actual_arguments, f"code not found in {json.dumps(actual_arguments)}"
code = actual_arguments["code"]
assert isinstance(code, str), f"Expected code to be a string, got {type(code)}: {json.dumps(code)}"
assert re.match(r'''print\(("[Hh]ello,? [Ww]orld!?"|'[Hh]ello,? [Ww]orld!?')\)''', code), f'Expected hello world, got {code}'

View File

@@ -26,9 +26,6 @@ from re import RegexFlag
import wget
DEFAULT_HTTP_TIMEOUT = 12 if "LLAMA_SANITIZE" not in os.environ else 30
class ServerResponse:
headers: dict
status_code: int
@@ -41,7 +38,7 @@ class ServerProcess:
server_port: int = 8080
server_host: str = "127.0.0.1"
model_hf_repo: str = "ggml-org/models"
model_hf_file: str | None = "tinyllamas/stories260K.gguf"
model_hf_file: str = "tinyllamas/stories260K.gguf"
model_alias: str = "tinyllama-2"
temperature: float = 0.8
seed: int = 42
@@ -72,14 +69,13 @@ class ServerProcess:
pooling: str | None = None
draft: int | None = None
api_key: str | None = None
response_format: str | None = None
lora_files: List[str] | None = None
disable_ctx_shift: int | None = False
draft_min: int | None = None
draft_max: int | None = None
no_webui: bool | None = None
jinja: bool | None = None
chat_template: str | None = None
chat_template_file: str | None = None
# session variables
process: subprocess.Popen | None = None
@@ -92,7 +88,7 @@ class ServerProcess:
if "PORT" in os.environ:
self.server_port = int(os.environ["PORT"])
def start(self, timeout_seconds: int | None = DEFAULT_HTTP_TIMEOUT) -> None:
def start(self, timeout_seconds: int = 10) -> None:
if "LLAMA_SERVER_BIN_PATH" in os.environ:
server_path = os.environ["LLAMA_SERVER_BIN_PATH"]
elif os.name == "nt":
@@ -170,12 +166,8 @@ class ServerProcess:
server_args.extend(["--draft-min", self.draft_min])
if self.no_webui:
server_args.append("--no-webui")
if self.jinja:
server_args.append("--jinja")
if self.chat_template:
server_args.extend(["--chat-template", self.chat_template])
if self.chat_template_file:
server_args.extend(["--chat-template-file", self.chat_template_file])
args = [str(arg) for arg in [server_path, *server_args]]
print(f"bench: starting server with: {' '.join(args)}")
@@ -191,7 +183,7 @@ class ServerProcess:
creationflags=flags,
stdout=sys.stdout,
stderr=sys.stdout,
env={**os.environ, "LLAMA_CACHE": "tmp"} if "LLAMA_CACHE" not in os.environ else None,
env={**os.environ, "LLAMA_CACHE": "tmp"},
)
server_instances.add(self)
@@ -227,18 +219,17 @@ class ServerProcess:
path: str,
data: dict | Any | None = None,
headers: dict | None = None,
timeout: float | None = None,
) -> ServerResponse:
url = f"http://{self.server_host}:{self.server_port}{path}"
parse_body = False
if method == "GET":
response = requests.get(url, headers=headers, timeout=timeout)
response = requests.get(url, headers=headers)
parse_body = True
elif method == "POST":
response = requests.post(url, headers=headers, json=data, timeout=timeout)
response = requests.post(url, headers=headers, json=data)
parse_body = True
elif method == "OPTIONS":
response = requests.options(url, headers=headers, timeout=timeout)
response = requests.options(url, headers=headers)
else:
raise ValueError(f"Unimplemented method: {method}")
result = ServerResponse()

View File

@@ -16,9 +16,6 @@
// Change JSON_ASSERT from assert() to GGML_ASSERT:
#define JSON_ASSERT GGML_ASSERT
#include "json.hpp"
#include "minja.hpp"
#include "chat.hpp"
#include "chat-template.hpp"
#include <random>
#include <sstream>
@@ -121,7 +118,7 @@ static json json_get_nested_values(const std::vector<std::string> & paths, const
* - only string, example: "string"
* - mixed string and tokens, example: [12, 34, "string", 56, 78]
*/
static llama_tokens tokenize_mixed(const llama_vocab * vocab, const json & json_prompt, bool add_special, bool parse_special) {
static llama_tokens tokenize_mixed(const llama_context * ctx, const json & json_prompt, bool add_special, bool parse_special) {
// If `add_bos` is true, we only add BOS when json_prompt is a string,
// or the first element of the json_prompt array is a string.
llama_tokens prompt_tokens;
@@ -134,10 +131,10 @@ static llama_tokens tokenize_mixed(const llama_vocab * vocab, const json & json_
llama_tokens p;
if (first) {
p = common_tokenize(vocab, s, add_special, parse_special);
p = common_tokenize(ctx, s, add_special, parse_special);
first = false;
} else {
p = common_tokenize(vocab, s, false, parse_special);
p = common_tokenize(ctx, s, false, parse_special);
}
prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
@@ -151,7 +148,7 @@ static llama_tokens tokenize_mixed(const llama_vocab * vocab, const json & json_
}
} else {
auto s = json_prompt.template get<std::string>();
prompt_tokens = common_tokenize(vocab, s, add_special, parse_special);
prompt_tokens = common_tokenize(ctx, s, add_special, parse_special);
}
return prompt_tokens;
@@ -169,11 +166,11 @@ static llama_tokens tokenize_mixed(const llama_vocab * vocab, const json & json_
* - "prompt": [[12, 34, 56], [78, 90, 12]]
* - "prompt": [[12, 34, "string", 56, 78], [12, 34, 56]]
*/
static std::vector<llama_tokens> tokenize_input_prompts(const llama_vocab * vocab, const json & json_prompt, bool add_special, bool parse_special) {
static std::vector<llama_tokens> tokenize_input_prompts(llama_context * ctx, const json & json_prompt, bool add_special, bool parse_special) {
std::vector<llama_tokens> result;
if (json_prompt.is_string() || json_is_array_of_mixed_numbers_strings(json_prompt)) {
// string or mixed
result.push_back(tokenize_mixed(vocab, json_prompt, add_special, parse_special));
result.push_back(tokenize_mixed(ctx, json_prompt, add_special, parse_special));
} else if (json_is_array_of_numbers(json_prompt)) {
// array of tokens
result.push_back(json_prompt.get<llama_tokens>());
@@ -182,7 +179,7 @@ static std::vector<llama_tokens> tokenize_input_prompts(const llama_vocab * voca
result.reserve(json_prompt.size());
for (const auto & p : json_prompt) {
if (p.is_string() || json_is_array_of_mixed_numbers_strings(p)) {
result.push_back(tokenize_mixed(vocab, p, add_special, parse_special));
result.push_back(tokenize_mixed(ctx, p, add_special, parse_special));
} else if (json_is_array_of_numbers(p)) {
// array of tokens
result.push_back(p.get<llama_tokens>());
@@ -234,23 +231,21 @@ static size_t validate_utf8(const std::string& text) {
//
// format rerank task: [BOS]query[EOS][SEP]doc[EOS]
static llama_tokens format_rerank(const struct llama_vocab * vocab, const llama_tokens & query, const llama_tokens & doc) {
static llama_tokens format_rerank(const struct llama_model * model, const llama_tokens & query, const llama_tokens & doc) {
llama_tokens result;
result.reserve(doc.size() + query.size() + 4);
result.push_back(llama_vocab_bos(vocab));
result.push_back(llama_token_bos(model));
result.insert(result.end(), query.begin(), query.end());
result.push_back(llama_vocab_eos(vocab));
result.push_back(llama_vocab_sep(vocab));
result.push_back(llama_token_eos(model));
result.push_back(llama_token_sep(model));
result.insert(result.end(), doc.begin(), doc.end());
result.push_back(llama_vocab_eos(vocab));
result.push_back(llama_token_eos(model));
return result;
}
// format infill task
static llama_tokens format_infill(
const llama_vocab * vocab,
const llama_context * ctx,
const json & input_prefix,
const json & input_suffix,
const json & input_extra,
@@ -277,14 +272,15 @@ static llama_tokens format_infill(
llama_tokens extra_tokens;
extra_tokens.reserve(n_ctx);
auto tokens_prefix = tokenize_mixed(vocab, input_prefix, false, false);
auto tokens_suffix = tokenize_mixed(vocab, input_suffix, false, false);
auto model = llama_get_model(ctx);
auto tokens_prefix = tokenize_mixed(ctx, input_prefix, false, false);
auto tokens_suffix = tokenize_mixed(ctx, input_suffix, false, false);
if (llama_vocab_fim_rep(vocab) != LLAMA_TOKEN_NULL) {
if (llama_token_fim_rep(model) != LLAMA_TOKEN_NULL) {
// TODO: make project name an input
static const auto k_fim_repo = common_tokenize(vocab, "myproject\n", false, false);
static const auto k_fim_repo = common_tokenize(ctx, "myproject\n", false, false);
extra_tokens.push_back(llama_vocab_fim_rep(vocab));
extra_tokens.push_back(llama_token_fim_rep(model));
extra_tokens.insert(extra_tokens.end(), k_fim_repo.begin(), k_fim_repo.end());
}
for (const auto & chunk : input_extra) {
@@ -292,28 +288,28 @@ static llama_tokens format_infill(
const std::string text = json_value(chunk, "text", std::string());
const std::string filename = json_value(chunk, "filename", std::string("tmp"));
if (llama_vocab_fim_sep(vocab) != LLAMA_TOKEN_NULL) {
const auto k_fim_file = common_tokenize(vocab, filename + "\n", false, false);
if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) {
const auto k_fim_file = common_tokenize(ctx, filename + "\n", false, false);
extra_tokens.insert(extra_tokens.end(), llama_vocab_fim_sep(vocab));
extra_tokens.insert(extra_tokens.end(), llama_token_fim_sep(model));
extra_tokens.insert(extra_tokens.end(), k_fim_file.begin(), k_fim_file.end());
} else {
// chunk separator in binary form to avoid confusing the AI
static const char k_chunk_prefix_str[] = {0x0a, 0x0a, 0x2d, 0x2d, 0x2d, 0x20, 0x73, 0x6e, 0x69, 0x70, 0x70, 0x65, 0x74, 0x20, 0x2d, 0x2d, 0x2d, 0x0a, 0x0a, 0x00};
static const auto k_chunk_prefix_tokens = common_tokenize(vocab, k_chunk_prefix_str, false, false);
static const auto k_chunk_prefix_tokens = common_tokenize(ctx, k_chunk_prefix_str, false, false);
extra_tokens.insert(extra_tokens.end(), k_chunk_prefix_tokens.begin(), k_chunk_prefix_tokens.end());
}
const auto chunk_tokens = common_tokenize(vocab, text, false, false);
const auto chunk_tokens = common_tokenize(ctx, text, false, false);
extra_tokens.insert(extra_tokens.end(), chunk_tokens.begin(), chunk_tokens.end());
}
if (llama_vocab_fim_sep(vocab) != LLAMA_TOKEN_NULL) {
if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) {
// TODO: current filename
static const auto k_fim_file = common_tokenize(vocab, "filename\n", false, false);
static const auto k_fim_file = common_tokenize(ctx, "filename\n", false, false);
extra_tokens.insert(extra_tokens.end(), llama_vocab_fim_sep(vocab));
extra_tokens.insert(extra_tokens.end(), llama_token_fim_sep(model));
extra_tokens.insert(extra_tokens.end(), k_fim_file.begin(), k_fim_file.end());
}
@@ -329,15 +325,15 @@ static llama_tokens format_infill(
tokens_prefix.erase(tokens_prefix.begin(), tokens_prefix.begin() + tokens_prefix.size() - n_prefix_take);
tokens_suffix.resize(n_suffix_take);
tokens_prefix.insert(tokens_prefix.begin(), llama_vocab_fim_pre(vocab));
tokens_prefix.insert(tokens_prefix.begin(), llama_token_fim_pre(model));
tokens_prefix.insert(tokens_prefix.end(), tokens_prompt.begin(), tokens_prompt.end());
tokens_suffix.insert(tokens_suffix.begin(), llama_vocab_fim_suf(vocab));
tokens_suffix.insert(tokens_suffix.begin(), llama_token_fim_suf(model));
auto embd_inp = spm_infill ? tokens_suffix : tokens_prefix;
auto embd_end = spm_infill ? tokens_prefix : tokens_suffix;
if (llama_vocab_get_add_bos(vocab)) {
embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab));
if (llama_add_bos_token(model)) {
embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
}
SRV_DBG("extra: n_ctx = %d, n_extra_take = %d, n_extra = %d\n", n_ctx, n_extra_take, (int) extra_tokens.size());
@@ -346,13 +342,13 @@ static llama_tokens format_infill(
embd_inp.insert(embd_inp.begin(), extra_tokens.end() - n_extra_take, extra_tokens.end());
embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
embd_inp.push_back(llama_vocab_fim_mid(vocab));
embd_inp.push_back(llama_token_fim_mid(model));
return embd_inp;
}
// Format given chat. If tmpl is empty, we take the template from model metadata
inline std::string format_chat(const common_chat_template & tmpl, const std::vector<json> & messages) {
inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
std::vector<common_chat_msg> chat;
for (size_t i = 0; i < messages.size(); ++i) {
@@ -377,10 +373,10 @@ inline std::string format_chat(const common_chat_template & tmpl, const std::vec
throw std::runtime_error("Missing 'content' (ref: https://github.com/ggerganov/llama.cpp/issues/8367)");
}
chat.push_back({role, content, /* tool_calls= */ {}});
chat.push_back({role, content});
}
const auto formatted_chat = common_chat_apply_template(tmpl, chat, true, /* use_jinja= */ false);
const auto formatted_chat = common_chat_apply_template(model, tmpl, chat, true);
LOG_DBG("formatted_chat: '%s'\n", formatted_chat.c_str());
return formatted_chat;
@@ -579,32 +575,14 @@ static json oaicompat_completion_params_parse(const json & body) {
return llama_params;
}
static json oaicompat_completion_params_parse(
const json & body, /* openai api json semantics */
bool use_jinja,
const common_chat_templates & chat_templates)
{
static json oaicompat_chat_completion_params_parse(
const struct llama_model * model,
const json & body, /* openai api json semantics */
const std::string & chat_template) {
json llama_params;
const auto & tmpl = body.contains("tools") && chat_templates.template_tool_use
? *chat_templates.template_tool_use
: *chat_templates.template_default;
auto tools = json_value(body, "tools", json());
auto stream = json_value(body, "stream", false);
if (tools.is_array() && !tools.empty()) {
if (stream) {
throw std::runtime_error("Cannot use tools with stream");
}
if (!use_jinja) {
throw std::runtime_error("tools param requires --jinja flag");
}
}
if (!use_jinja) {
if (body.contains("tool_choice") && !body.at("tool_choice").is_null()) {
throw std::runtime_error("Unsupported param: tool_choice");
}
}
// Apply chat template to the list of messages
llama_params["prompt"] = format_chat(model, chat_template, body.at("messages"));
// Handle "stop" field
if (body.contains("stop") && body.at("stop").is_string()) {
@@ -627,44 +605,6 @@ static json oaicompat_completion_params_parse(
}
}
// Apply chat template to the list of messages
if (use_jinja) {
auto tool_choice = json_value(body, "tool_choice", std::string("auto"));
if (tool_choice != "none" && tool_choice != "auto" && tool_choice != "required") {
throw std::runtime_error("Invalid tool_choice: " + tool_choice);
}
if (tool_choice != "none" && llama_params.contains("grammar")) {
throw std::runtime_error("Cannot use custom grammar constraints with tools.");
}
common_chat_inputs inputs;
inputs.messages = body.at("messages");
inputs.tools = tools;
inputs.tool_choice = tool_choice;
inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false);
inputs.stream = stream;
// TODO: support mixing schema w/ tools beyond generic format.
inputs.json_schema = json_value(llama_params, "json_schema", json());
auto chat_params = common_chat_params_init(tmpl, inputs);
llama_params["chat_format"] = static_cast<int>(chat_params.format);
llama_params["prompt"] = chat_params.prompt;
llama_params["grammar"] = chat_params.grammar;
llama_params["grammar_lazy"] = chat_params.grammar_lazy;
auto grammar_triggers = json::array();
for (const auto & trigger : chat_params.grammar_triggers) {
grammar_triggers.push_back({
{"word", trigger.word},
{"at_start", trigger.at_start},
});
}
llama_params["grammar_triggers"] = grammar_triggers;
for (const auto & stop : chat_params.additional_stops) {
llama_params["stop"].push_back(stop);
}
} else {
llama_params["prompt"] = format_chat(tmpl, body.at("messages"));
}
// Handle "n" field
int n_choices = json_value(body, "n", 1);
if (n_choices != 1) {
@@ -679,6 +619,14 @@ static json oaicompat_completion_params_parse(
throw std::runtime_error("top_logprobs requires logprobs to be set to true");
}
// Params supported by OAI but unsupported by llama.cpp
static const std::vector<std::string> unsupported_params { "tools", "tool_choice" };
for (const auto & param : unsupported_params) {
if (body.contains(param)) {
throw std::runtime_error("Unsupported param: " + param);
}
}
// Copy remaining properties to llama_params
// This allows user to use llama.cpp-specific params like "mirostat", ... via OAI endpoint.
// See "launch_slot_with_task()" for a complete list of params supported by llama.cpp
@@ -816,18 +764,14 @@ static json format_logit_bias(const std::vector<llama_logit_bias> & logit_bias)
return data;
}
static std::string safe_json_to_str(const json & data) {
static std::string safe_json_to_str(json data) {
return data.dump(-1, ' ', false, json::error_handler_t::replace);
}
static std::vector<llama_token_data> get_token_probabilities(llama_context * ctx, int idx) {
std::vector<llama_token_data> cur;
const auto * logits = llama_get_logits_ith(ctx, idx);
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
const int n_vocab = llama_vocab_n_tokens(vocab);
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
cur.resize(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
@@ -855,8 +799,8 @@ static std::vector<llama_token_data> get_token_probabilities(llama_context * ctx
}
static bool are_lora_equal(
const std::vector<common_adapter_lora_info> & l1,
const std::vector<common_adapter_lora_info> & l2) {
const std::vector<common_lora_adapter_info> & l1,
const std::vector<common_lora_adapter_info> & l2) {
if (l1.size() != l2.size()) {
return false;
}
@@ -870,10 +814,10 @@ static bool are_lora_equal(
}
// parse lora config from JSON request, returned a copy of lora_base with updated scale
static std::vector<common_adapter_lora_info> parse_lora_request(
const std::vector<common_adapter_lora_info> & lora_base,
static std::vector<common_lora_adapter_info> parse_lora_request(
const std::vector<common_lora_adapter_info> & lora_base,
const json & data) {
std::vector<common_adapter_lora_info> lora(lora_base);
std::vector<common_lora_adapter_info> lora(lora_base);
int max_idx = lora.size();
// clear existing value

View File

@@ -37,7 +37,7 @@
<div v-for="conv in conversations" :class="{
'btn btn-ghost justify-start font-normal': true,
'btn-active': conv.id === viewingConvId,
}" @click="setViewingConv(conv.id)" dir="auto">
}" @click="setViewingConv(conv.id)">
<span class="truncate">{{ conv.messages[0].content }}</span>
</div>
<div class="text-center text-xs opacity-40 mt-auto mx-4">
@@ -141,7 +141,6 @@
:msg="pendingMsg"
:key="pendingMsg.id"
:is-generating="isGenerating"
:show-thought-in-progress="config.showThoughtInProgress"
:edit-user-msg-and-regenerate="() => {}"
:regenerate-msg="() => {}"></message-bubble>
</div>
@@ -157,7 +156,6 @@
@keydown.enter.shift.exact.prevent="inputMsg += '\n'"
:disabled="isGenerating"
id="msg-input"
dir="auto"
></textarea>
<button v-if="!isGenerating" class="btn btn-primary ml-2" @click="sendMessage" :disabled="inputMsg.length === 0">Send</button>
<button v-else class="btn btn-neutral ml-2" @click="stopGeneration">Stop</button>
@@ -203,20 +201,6 @@
</template>
</div>
</details>
<!-- Section: Reasoning models -->
<details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
<summary class="collapse-title font-bold">Reasoning models</summary>
<div class="collapse-content">
<div class="flex flex-row items-center mb-2">
<input type="checkbox" class="checkbox" v-model="config.showThoughtInProgress" />
<span class="ml-4">Expand though process by default for generating message</span>
</div>
<div class="flex flex-row items-center mb-2">
<input type="checkbox" class="checkbox" v-model="config.excludeThoughtOnReq" />
<span class="ml-4">Exclude thought process when sending request to API (Recommended for DeepSeek-R1)</span>
</div>
</div>
</details>
<!-- Section: Advanced config -->
<details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
<summary class="collapse-title font-bold">Advanced config</summary>
@@ -264,7 +248,6 @@
<!-- textarea for editing message -->
<template v-if="editingContent !== null">
<textarea
dir="auto"
class="textarea textarea-bordered bg-base-100 text-base-content w-[calc(90vw-8em)] lg:w-96"
v-model="editingContent"></textarea>
<br/>
@@ -275,19 +258,7 @@
<!-- show loading dots for pending message -->
<span v-if="msg.content === null" class="loading loading-dots loading-md"></span>
<!-- render message as markdown -->
<div v-else dir="auto">
<details v-if="msg.role === 'assistant' && splitMsgContent.cot" class="collapse bg-base-200 collapse-arrow mb-4" :open="splitMsgContent.isThinking && showThoughtInProgress">
<summary class="collapse-title">
<span v-if="splitMsgContent.isThinking">
<span v-if="isGenerating" class="loading loading-spinner loading-md mr-2" style="vertical-align: middle;"></span>
<b>Thinking</b>
</span>
<b v-else>Thought Process</b>
</summary>
<vue-markdown :source="splitMsgContent.cot" dir="auto" class="collapse-content"></vue-markdown>
</details>
<vue-markdown :source="splitMsgContent.content"></vue-markdown>
</div>
<vue-markdown v-else :source="msg.content"></vue-markdown>
<!-- render timings if enabled -->
<div class="dropdown dropdown-hover dropdown-top mt-2" v-if="timings && config.showTokensPerSecond">
<div tabindex="0" role="button" class="cursor-pointer font-semibold text-sm opacity-60">Speed: {{ timings.predicted_per_second.toFixed(1) }} t/s</div>

View File

@@ -17,11 +17,6 @@ import { asyncIterator } from '@sec-ant/readable-stream/ponyfill/asyncIterator';
const isDev = import.meta.env.MODE === 'development';
// types
/** @typedef {{ id: number, role: 'user' | 'assistant', content: string, timings: any }} Message */
/** @typedef {{ role: 'user' | 'assistant', content: string }} APIMessage */
/** @typedef {{ id: string, lastModified: number, messages: Array<Message> }} Conversation */
// utility functions
const isString = (x) => !!x.toLowerCase;
const isBoolean = (x) => x === true || x === false;
@@ -55,8 +50,6 @@ const CONFIG_DEFAULT = {
apiKey: '',
systemMessage: 'You are a helpful assistant.',
showTokensPerSecond: false,
showThoughtInProgress: false,
excludeThoughtOnReq: true,
// make sure these default values are in sync with `common.h`
samplers: 'edkypmxt',
temperature: 0.8,
@@ -118,12 +111,12 @@ const VueMarkdown = defineComponent(
highlight: function (str, lang) { // Add highlight.js
if (lang && hljs.getLanguage(lang)) {
try {
return '<pre dir="auto"><code class="hljs">' +
return '<pre><code class="hljs">' +
hljs.highlight(str, { language: lang, ignoreIllegals: true }).value +
'</code></pre>';
} catch (__) {}
}
return '<pre dir="auto"><code class="hljs">' + md.value.utils.escapeHtml(str) + '</code></pre>';
return '<pre><code class="hljs">' + md.value.utils.escapeHtml(str) + '</code></pre>';
}
}));
// support latex with double dollar sign and square brackets
@@ -179,7 +172,6 @@ const MessageBubble = defineComponent({
config: Object,
msg: Object,
isGenerating: Boolean,
showThoughtInProgress: Boolean,
editUserMsgAndRegenerate: Function,
regenerateMsg: Function,
},
@@ -196,31 +188,7 @@ const MessageBubble = defineComponent({
prompt_per_second: this.msg.timings.prompt_n / (this.msg.timings.prompt_ms / 1000),
predicted_per_second: this.msg.timings.predicted_n / (this.msg.timings.predicted_ms / 1000),
};
},
splitMsgContent() {
const content = this.msg.content;
if (this.msg.role !== 'assistant') {
return { content };
}
let actualContent = '';
let cot = '';
let isThinking = false;
let thinkSplit = content.split('<think>', 2);
actualContent += thinkSplit[0];
while (thinkSplit[1] !== undefined) {
// <think> tag found
thinkSplit = thinkSplit[1].split('</think>', 2);
cot += thinkSplit[0];
isThinking = true;
if (thinkSplit[1] !== undefined) {
// </think> closing tag found
isThinking = false;
thinkSplit = thinkSplit[1].split('<think>', 2);
actualContent += thinkSplit[0];
}
}
return { content: actualContent, cot, isThinking };
},
}
},
methods: {
copyMsg() {
@@ -240,10 +208,7 @@ const MessageBubble = defineComponent({
// format: { [convId]: { id: string, lastModified: number, messages: [...] } }
// convId is a string prefixed with 'conv-'
const StorageUtils = {
/**
* manage conversations
* @returns {Array<Conversation>}
*/
// manage conversations
getAllConversations() {
const res = [];
for (const key in localStorage) {
@@ -254,19 +219,11 @@ const StorageUtils = {
res.sort((a, b) => b.lastModified - a.lastModified);
return res;
},
/**
* can return null if convId does not exist
* @param {string} convId
* @returns {Conversation | null}
*/
// can return null if convId does not exist
getOneConversation(convId) {
return JSON.parse(localStorage.getItem(convId) || 'null');
},
/**
* if convId does not exist, create one
* @param {string} convId
* @param {Message} msg
*/
// if convId does not exist, create one
appendMsg(convId, msg) {
if (msg.content === null) return;
const conv = StorageUtils.getOneConversation(convId) || {
@@ -278,24 +235,12 @@ const StorageUtils = {
conv.lastModified = Date.now();
localStorage.setItem(convId, JSON.stringify(conv));
},
/**
* Get new conversation id
* @returns {string}
*/
getNewConvId() {
return `conv-${Date.now()}`;
},
/**
* remove conversation by id
* @param {string} convId
*/
remove(convId) {
localStorage.removeItem(convId);
},
/**
* remove all conversations
* @param {string} convId
*/
filterAndKeepMsgs(convId, predicate) {
const conv = StorageUtils.getOneConversation(convId);
if (!conv) return;
@@ -303,11 +248,6 @@ const StorageUtils = {
conv.lastModified = Date.now();
localStorage.setItem(convId, JSON.stringify(conv));
},
/**
* remove last message from conversation
* @param {string} convId
* @returns {Message | undefined}
*/
popMsg(convId) {
const conv = StorageUtils.getOneConversation(convId);
if (!conv) return;
@@ -382,12 +322,10 @@ const mainApp = createApp({
data() {
return {
conversations: StorageUtils.getAllConversations(),
/** @type {Array<Message>} */
messages: [],
messages: [], // { id: number, role: 'user' | 'assistant', content: string }
viewingConvId: StorageUtils.getNewConvId(),
inputMsg: '',
isGenerating: false,
/** @type {Array<Message> | null} */
pendingMsg: null, // the on-going message from assistant
stopGeneration: () => {},
selectedTheme: StorageUtils.getTheme(),
@@ -395,7 +333,6 @@ const mainApp = createApp({
showConfigDialog: false,
// const
themes: THEMES,
/** @type {CONFIG_DEFAULT} */
configDefault: {...CONFIG_DEFAULT},
configInfo: {...CONFIG_INFO},
isDev,
@@ -488,50 +425,42 @@ const mainApp = createApp({
this.isGenerating = true;
try {
/** @type {CONFIG_DEFAULT} */
const config = this.config;
const abortController = new AbortController();
this.stopGeneration = () => abortController.abort();
/** @type {Array<APIMessage>} */
let messages = [
{ role: 'system', content: config.systemMessage },
...normalizeMsgsForAPI(this.messages),
];
if (config.excludeThoughtOnReq) {
messages = filterThoughtFromMsgs(messages);
}
if (isDev) console.log({messages});
const params = {
messages,
messages: [
{ role: 'system', content: this.config.systemMessage },
...this.messages,
],
stream: true,
cache_prompt: true,
samplers: config.samplers,
temperature: config.temperature,
dynatemp_range: config.dynatemp_range,
dynatemp_exponent: config.dynatemp_exponent,
top_k: config.top_k,
top_p: config.top_p,
min_p: config.min_p,
typical_p: config.typical_p,
xtc_probability: config.xtc_probability,
xtc_threshold: config.xtc_threshold,
repeat_last_n: config.repeat_last_n,
repeat_penalty: config.repeat_penalty,
presence_penalty: config.presence_penalty,
frequency_penalty: config.frequency_penalty,
dry_multiplier: config.dry_multiplier,
dry_base: config.dry_base,
dry_allowed_length: config.dry_allowed_length,
dry_penalty_last_n: config.dry_penalty_last_n,
max_tokens: config.max_tokens,
timings_per_token: !!config.showTokensPerSecond,
...(config.custom.length ? JSON.parse(config.custom) : {}),
samplers: this.config.samplers,
temperature: this.config.temperature,
dynatemp_range: this.config.dynatemp_range,
dynatemp_exponent: this.config.dynatemp_exponent,
top_k: this.config.top_k,
top_p: this.config.top_p,
min_p: this.config.min_p,
typical_p: this.config.typical_p,
xtc_probability: this.config.xtc_probability,
xtc_threshold: this.config.xtc_threshold,
repeat_last_n: this.config.repeat_last_n,
repeat_penalty: this.config.repeat_penalty,
presence_penalty: this.config.presence_penalty,
frequency_penalty: this.config.frequency_penalty,
dry_multiplier: this.config.dry_multiplier,
dry_base: this.config.dry_base,
dry_allowed_length: this.config.dry_allowed_length,
dry_penalty_last_n: this.config.dry_penalty_last_n,
max_tokens: this.config.max_tokens,
timings_per_token: !!this.config.showTokensPerSecond,
...(this.config.custom.length ? JSON.parse(this.config.custom) : {}),
};
const chunks = sendSSEPostRequest(`${BASE_URL}/v1/chat/completions`, {
method: 'POST',
headers: {
'Content-Type': 'application/json',
...(config.apiKey ? {'Authorization': `Bearer ${config.apiKey}`} : {})
...(this.config.apiKey ? {'Authorization': `Bearer ${this.config.apiKey}`} : {})
},
body: JSON.stringify(params),
signal: abortController.signal,
@@ -548,7 +477,7 @@ const mainApp = createApp({
};
}
const timings = chunk.timings;
if (timings && config.showTokensPerSecond) {
if (timings && this.config.showTokensPerSecond) {
// only extract what's really needed, to save some space
this.pendingMsg.timings = {
prompt_n: timings.prompt_n,
@@ -669,33 +598,3 @@ try {
<button class="btn" onClick="localStorage.clear(); window.location.reload();">Clear localStorage</button>
</div>`;
}
/**
* filter out redundant fields upon sending to API
* @param {Array<APIMessage>} messages
* @returns {Array<APIMessage>}
*/
function normalizeMsgsForAPI(messages) {
return messages.map((msg) => {
return {
role: msg.role,
content: msg.content,
};
});
}
/**
 * recommended for DeepSeek-R1, filter out content between <think> and </think> tags
* @param {Array<APIMessage>} messages
* @returns {Array<APIMessage>}
*/
function filterThoughtFromMsgs(messages) {
return messages.map((msg) => {
return {
role: msg.role,
content: msg.role === 'assistant'
? msg.content.split('</think>').at(-1).trim()
: msg.content,
};
});
}

View File

@@ -75,14 +75,12 @@ int main(int argc, char ** argv) {
return 1;
}
const llama_vocab * vocab = llama_model_get_vocab(model);
// initialize the context
llama_context_params ctx_params = llama_context_default_params();
ctx_params.n_ctx = n_ctx;
ctx_params.n_batch = n_ctx;
llama_context * ctx = llama_init_from_model(model, ctx_params);
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
if (!ctx) {
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
return 1;
@@ -98,12 +96,10 @@ int main(int argc, char ** argv) {
auto generate = [&](const std::string & prompt) {
std::string response;
const bool is_first = llama_get_kv_cache_used_cells(ctx) == 0;
// tokenize the prompt
const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
const int n_prompt_tokens = -llama_tokenize(model, prompt.c_str(), prompt.size(), NULL, 0, true, true);
std::vector<llama_token> prompt_tokens(n_prompt_tokens);
if (llama_tokenize(vocab, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), is_first, true) < 0) {
if (llama_tokenize(model, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), llama_get_kv_cache_used_cells(ctx) == 0, true) < 0) {
GGML_ABORT("failed to tokenize the prompt\n");
}
@@ -128,13 +124,13 @@ int main(int argc, char ** argv) {
new_token_id = llama_sampler_sample(smpl, ctx, -1);
// is it an end of generation?
if (llama_vocab_is_eog(vocab, new_token_id)) {
if (llama_token_is_eog(model, new_token_id)) {
break;
}
// convert the token to a string, print it and add it to the response
char buf[256];
int n = llama_token_to_piece(vocab, new_token_id, buf, sizeof(buf), 0, true);
int n = llama_token_to_piece(model, new_token_id, buf, sizeof(buf), 0, true);
if (n < 0) {
GGML_ABORT("failed to convert token to piece\n");
}
@@ -163,14 +159,12 @@ int main(int argc, char ** argv) {
break;
}
const char * tmpl = llama_model_chat_template(model, /* name */ nullptr);
// add the user input to the message list and format it
messages.push_back({"user", strdup(user.c_str())});
int new_len = llama_chat_apply_template(tmpl, messages.data(), messages.size(), true, formatted.data(), formatted.size());
int new_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), true, formatted.data(), formatted.size());
if (new_len > (int)formatted.size()) {
formatted.resize(new_len);
new_len = llama_chat_apply_template(tmpl, messages.data(), messages.size(), true, formatted.data(), formatted.size());
new_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), true, formatted.data(), formatted.size());
}
if (new_len < 0) {
fprintf(stderr, "failed to apply the chat template\n");
@@ -187,7 +181,7 @@ int main(int argc, char ** argv) {
// add the response to the messages
messages.push_back({"assistant", strdup(response.c_str())});
prev_len = llama_chat_apply_template(tmpl, messages.data(), messages.size(), false, nullptr, 0);
prev_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), false, nullptr, 0);
if (prev_len < 0) {
fprintf(stderr, "failed to apply the chat template\n");
return 1;

View File

@@ -1,11 +0,0 @@
cmake_minimum_required(VERSION 3.12)
project(llama-simple-cmake-pkg)
set(TARGET llama-simple-cmake-pkg)
find_package(Llama REQUIRED)
add_executable(${TARGET} ${CMAKE_CURRENT_LIST_DIR}/../simple/simple.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama ggml::all ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@@ -1,34 +0,0 @@
# llama.cpp/example/simple-cmake-pkg
This program builds [simple](../simple) using a relocatable CMake package. It serves as an example of using the `find_package()` CMake command to conveniently include [llama.cpp](https://github.com/ggerganov/llama.cpp) in projects which live outside of the source tree.
## Building
Because this example is "outside of the source tree", it is important to first build/install llama.cpp using CMake. An example is provided here, but please see the [llama.cpp build instructions](../..) for more detailed build instructions.
### Considerations
When hardware acceleration libraries are used (e.g. CUDA, Metal, Vulkan, etc.), the appropriate dependencies will be searched for automatically. So, for example, when finding a package
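For reference, a consumer project's own CMakeLists.txt can stay minimal; the sketch below mirrors this example's CMakeLists.txt (the project name `my-llama-app` and `main.cpp` are placeholders) and relies on the installed package config to locate whatever backend dependencies the build was compiled with:
```cmake
cmake_minimum_required(VERSION 3.12)
project(my-llama-app)

# find_package() locates the installed llama.cpp package; hardware backend
# dependencies (CUDA, Vulkan, Metal, ...) are resolved by the package config itself.
find_package(Llama REQUIRED)

add_executable(my-llama-app main.cpp)
target_link_libraries(my-llama-app PRIVATE llama ggml::all ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(my-llama-app PRIVATE cxx_std_17)
```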
### Build llama.cpp and install to llama.cpp/inst
```sh
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
cmake -S . -B build
cmake --build build
cmake --install build --prefix inst
```
### Build simple-cmake-pkg
```sh
cd examples/simple-cmake-pkg
cmake -S . -B build -DCMAKE_PREFIX_PATH=../../inst/lib/cmake
cmake --build build
```
### Run simple-cmake-pkg
```sh
./build/llama-simple-cmake-pkg -m ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is"
```

View File

@@ -84,7 +84,6 @@ int main(int argc, char ** argv) {
model_params.n_gpu_layers = ngl;
llama_model * model = llama_model_load_from_file(model_path.c_str(), model_params);
const llama_vocab * vocab = llama_model_get_vocab(model);
if (model == NULL) {
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
@@ -94,11 +93,11 @@ int main(int argc, char ** argv) {
// tokenize the prompt
// find the number of tokens in the prompt
const int n_prompt = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, true, true);
const int n_prompt = -llama_tokenize(model, prompt.c_str(), prompt.size(), NULL, 0, true, true);
// allocate space for the tokens and tokenize the prompt
std::vector<llama_token> prompt_tokens(n_prompt);
if (llama_tokenize(vocab, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0) {
if (llama_tokenize(model, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0) {
fprintf(stderr, "%s: error: failed to tokenize the prompt\n", __func__);
return 1;
}
@@ -113,7 +112,7 @@ int main(int argc, char ** argv) {
// enable performance counters
ctx_params.no_perf = false;
llama_context * ctx = llama_init_from_model(model, ctx_params);
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
if (ctx == NULL) {
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
@@ -132,7 +131,7 @@ int main(int argc, char ** argv) {
for (auto id : prompt_tokens) {
char buf[128];
int n = llama_token_to_piece(vocab, id, buf, sizeof(buf), 0, true);
int n = llama_token_to_piece(model, id, buf, sizeof(buf), 0, true);
if (n < 0) {
fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__);
return 1;
@@ -165,12 +164,12 @@ int main(int argc, char ** argv) {
new_token_id = llama_sampler_sample(smpl, ctx, -1);
// is it an end of generation?
if (llama_vocab_is_eog(vocab, new_token_id)) {
if (llama_token_is_eog(model, new_token_id)) {
break;
}
char buf[128];
int n = llama_token_to_piece(vocab, new_token_id, buf, sizeof(buf), 0, true);
int n = llama_token_to_piece(model, new_token_id, buf, sizeof(buf), 0, true);
if (n < 0) {
fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__);
return 1;

View File

@@ -45,8 +45,6 @@ int main(int argc, char ** argv) {
model_tgt = llama_init_tgt.model.get();
ctx_tgt = llama_init_tgt.context.get();
const llama_vocab * vocab = llama_model_get_vocab(model_tgt);
// load the draft model
params.devices = params.speculative.devices;
params.model = params.speculative.model;
@@ -198,7 +196,7 @@ int main(int argc, char ** argv) {
id_last = ids[i];
if (llama_vocab_is_eog(vocab, id_last)) {
if (llama_token_is_eog(model_tgt, id_last)) {
has_eos = true;
break;
}

View File

@@ -90,13 +90,10 @@ int main(int argc, char ** argv) {
model_dft = llama_init_dft.model.get();
ctx_dft = llama_init_dft.context.get();
const llama_vocab * vocab_tgt = llama_model_get_vocab(model_tgt);
const llama_vocab * vocab_dft = llama_model_get_vocab(model_dft);
const bool vocab_type_tgt = llama_vocab_type(vocab_tgt);
const bool vocab_type_tgt = llama_vocab_type(model_tgt);
LOG_DBG("vocab_type tgt: %d\n", vocab_type_tgt);
const bool vocab_type_dft = llama_vocab_type(vocab_dft);
const bool vocab_type_dft = llama_vocab_type(model_dft);
LOG_DBG("vocab_type dft: %d\n", vocab_type_dft);
if (vocab_type_tgt != vocab_type_dft) {
@@ -106,18 +103,18 @@ int main(int argc, char ** argv) {
}
if (
llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) ||
llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft) ||
llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft)
llama_add_bos_token(model_tgt) != llama_add_bos_token(model_dft) ||
llama_add_eos_token(model_tgt) != llama_add_eos_token(model_dft) ||
llama_token_bos(model_tgt) != llama_token_bos(model_dft) ||
llama_token_eos(model_tgt) != llama_token_eos(model_dft)
) {
LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__);
return 1;
}
{
const int n_vocab_tgt = llama_vocab_n_tokens(vocab_tgt);
const int n_vocab_dft = llama_vocab_n_tokens(vocab_dft);
const int n_vocab_tgt = llama_n_vocab(model_tgt);
const int n_vocab_dft = llama_n_vocab(model_dft);
const int vocab_diff = n_vocab_tgt > n_vocab_dft
? n_vocab_tgt - n_vocab_dft
: n_vocab_dft - n_vocab_tgt;
@@ -125,13 +122,13 @@ int main(int argc, char ** argv) {
if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
LOG_ERR("%s: draft model vocab must closely match target model to use speculation but ", __func__);
LOG_ERR("target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
n_vocab_tgt, llama_vocab_n_tokens(vocab_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
return 1;
}
for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) {
const char * token_text_tgt = llama_vocab_get_text(vocab_tgt, i);
const char * token_text_dft = llama_vocab_get_text(vocab_dft, i);
const char * token_text_tgt = llama_token_get_text(model_tgt, i);
const char * token_text_dft = llama_token_get_text(model_dft, i);
if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
LOG_ERR("%s: draft model vocab must match target model to use speculation but ", __func__);
LOG_ERR("token %d content differs - target '%s', draft '%s'\n", i,
@@ -173,7 +170,7 @@ int main(int argc, char ** argv) {
const auto t_enc_end = ggml_time_us();
// the 2 models should have the same vocab
//GGML_ASSERT(n_vocab == llama_vocab_n_tokens(model_dft));
//GGML_ASSERT(n_vocab == llama_n_vocab(model_dft));
// how many tokens to draft each time
int n_draft = params.speculative.n_max;
@@ -389,7 +386,7 @@ int main(int argc, char ** argv) {
}
}
if (llama_vocab_is_eog(vocab_tgt, token_id)) {
if (llama_token_is_eog(model_tgt, token_id)) {
has_eos = true;
}
++n_predict;

View File

@@ -344,10 +344,8 @@ int main(int raw_argc, char ** raw_argv) {
return 1;
}
const llama_vocab * vocab = llama_model_get_vocab(model);
llama_context_params ctx_params = llama_context_default_params();
llama_context * ctx = llama_init_from_model(model, ctx_params);
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
if (!ctx) {
fprintf(stderr, "Error: could not create context.\n");
return 1;
@@ -367,7 +365,7 @@ int main(int raw_argc, char ** raw_argv) {
prompt = stdin_buffer.str();
}
const bool model_wants_add_bos = llama_vocab_get_add_bos(vocab);
const bool model_wants_add_bos = llama_add_bos_token(model);
const bool add_bos = model_wants_add_bos && !no_bos;
const bool parse_special = !no_parse_special;
const bool escape = !no_escape;
@@ -377,7 +375,7 @@ int main(int raw_argc, char ** raw_argv) {
}
std::vector<llama_token> tokens;
tokens = common_tokenize(vocab, prompt, add_bos, parse_special);
tokens = common_tokenize(model, prompt, add_bos, parse_special);
if (printing_ids) {
printf("[");

View File

@@ -78,40 +78,3 @@ play the audio:
$ aplay output.wav
```
### Running the example with llama-server
Running this example with `llama-server` is also possible and requires two
server instances to be started. One will serve the LLM model and the other
will serve the voice decoder model.
The LLM model server can be started with the following command:
```console
$ ./build/bin/llama-server -m ./models/outetts-0.2-0.5B-q8_0.gguf --port 8020
```
And the voice decoder model server can be started using:
```console
./build/bin/llama-server -m ./models/wavtokenizer-large-75-f16.gguf --port 8021 --embeddings --pooling none
```
Then we can run [tts-outetts.py](tts-outetts.py) to generate the audio.
First create a virtual environment for Python and install the required
dependencies (this is only required to be done once):
```console
$ python3 -m venv venv
$ source venv/bin/activate
(venv) pip install requests numpy
```
And then run the Python script using:
```console
(venv) python ./examples/tts/tts-outetts.py http://localhost:8020 http://localhost:8021 "Hello world"
spectrogram generated: n_codes: 90, n_embd: 1282
converting to audio ...
audio generated: 28800 samples
audio written to file "output.wav"
```
And to play the audio we can again use aplay or any other media player:
```console
$ aplay output.wav
```

Some files were not shown because too many files have changed in this diff.