CUDA/HIP: Fix fattn-vec-* when device warp size is not 32 (#12315 )

When fattn-wmma was ported over to warp64 various bits that also touch fattn-vec where converted to selectable warp size, however the fattn-vec kernels dont work with 64 wide warps for now, so we need to avoid launching them with parameters for warp64
llama : Add Gemma 3 support (+ experimental vision capability) (#12343 )
2026-02-05 13:53:23 +02:00 · 2025-03-12 10:14:11 +01:00 · 2025-03-12 09:30:24 +01:00 · 2025-03-12 06:59:19 +01:00 · 2025-03-11 20:16:03 +01:00 · 2025-03-11 14:25:17 +01:00
448 changed files with 54562 additions and 12938 deletions
--- a/.devops/cpu.Dockerfile
+++ b/.devops/cpu.Dockerfile
@@ -2,6 +2,10 @@ ARG UBUNTU_VERSION=22.04

 FROM ubuntu:$UBUNTU_VERSION AS build

+ARG TARGETARCH
+
+ARG GGML_CPU_ARM_ARCH=armv8-a
+
 RUN apt-get update && \
    apt-get install -y build-essential git cmake libcurl4-openssl-dev

@@ -9,7 +13,14 @@ WORKDIR /app

 COPY . .

-RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \
+RUN if [ "$TARGETARCH" = "amd64" ]; then \
+        cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
+    elif [ "$TARGETARCH" = "arm64" ]; then \
+        cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=${GGML_CPU_ARM_ARCH}; \
+    else \
+        echo "Unsupported architecture"; \
+        exit 1; \
+    fi && \
    cmake --build build -j $(nproc)

 RUN mkdir -p /app/lib && \
--- a/.devops/cuda.Dockerfile
+++ b/.devops/cuda.Dockerfile
@@ -1,6 +1,6 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
-ARG CUDA_VERSION=12.6.0
+ARG CUDA_VERSION=12.4.0
 # Target the CUDA build image
 ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

--- a/.devops/llama-cpp-cuda.srpm.spec
+++ b/.devops/llama-cpp-cuda.srpm.spec
@@ -17,10 +17,10 @@ Version:        %( date "+%%Y%%m%%d" )
 Release:        1%{?dist}
 Summary:        CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
 License:        MIT
-Source0:        https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
+Source0:        https://github.com/ggml-org/llama.cpp/archive/refs/heads/master.tar.gz
 BuildRequires:  coreutils make gcc-c++ git cuda-toolkit
 Requires:       cuda-toolkit
-URL:            https://github.com/ggerganov/llama.cpp
+URL:            https://github.com/ggml-org/llama.cpp

 %define debug_package %{nil}
 %define source_date_epoch_from_changelog 0
--- a/.devops/llama-cpp.srpm.spec
+++ b/.devops/llama-cpp.srpm.spec
@@ -18,10 +18,10 @@ Version:        %( date "+%%Y%%m%%d" )
 Release:        1%{?dist}
 Summary:        CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
 License:        MIT
-Source0:        https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
+Source0:        https://github.com/ggml-org/llama.cpp/archive/refs/heads/master.tar.gz
 BuildRequires:  coreutils make gcc-c++ git libstdc++-devel
 Requires:       libstdc++
-URL:            https://github.com/ggerganov/llama.cpp
+URL:            https://github.com/ggml-org/llama.cpp

 %define debug_package %{nil}
 %define source_date_epoch_from_changelog 0
--- a/.devops/musa.Dockerfile
+++ b/.devops/musa.Dockerfile
@@ -1,6 +1,6 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
-ARG MUSA_VERSION=rc3.1.0
+ARG MUSA_VERSION=rc3.1.1
 # Target the MUSA build image
 ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@@ -133,12 +133,12 @@ effectiveStdenv.mkDerivation (finalAttrs: {
      --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
  '';

-  # With PR#6015 https://github.com/ggerganov/llama.cpp/pull/6015,
+  # With PR#6015 https://github.com/ggml-org/llama.cpp/pull/6015,
  # `default.metallib` may be compiled with Metal compiler from XCode
  # and we need to escape sandbox on MacOS to access Metal compiler.
  # `xcrun` is used find the path of the Metal compiler, which is varible
  # and not on $PATH
-  # see https://github.com/ggerganov/llama.cpp/pull/6118 for discussion
+  # see https://github.com/ggml-org/llama.cpp/pull/6118 for discussion
  __noChroot = effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders;

  nativeBuildInputs =
@@ -220,7 +220,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
    broken = (useMetalKit && !effectiveStdenv.isDarwin);

    description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
-    homepage = "https://github.com/ggerganov/llama.cpp/";
+    homepage = "https://github.com/ggml-org/llama.cpp/";
    license = lib.licenses.mit;

    # Accommodates `nix run` and `lib.getExe`
--- a/.devops/rocm.Dockerfile
+++ b/.devops/rocm.Dockerfile
@@ -11,7 +11,7 @@ ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-co
 FROM ${BASE_ROCM_DEV_CONTAINER} AS build

 # Unless otherwise specified, we make a fat build.
-# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
+# List from https://github.com/ggml-org/llama.cpp/pull/1087#issuecomment-1682807878
 # This is mostly tied to rocBLAS supported archs.
 # gfx803, gfx900, gfx1032, gfx1101, gfx1102,not officialy supported
 # gfx906 is deprecated
--- a/.devops/tools.sh
+++ b/.devops/tools.sh
@@ -13,9 +13,13 @@ elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
    exec ./llama-quantize "$@"
 elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
    exec ./llama-cli "$@"
+elif [[ "$arg1" == '--bench' || "$arg1" == '-b' ]]; then
+    exec ./llama-bench "$@"
+elif [[ "$arg1" == '--perplexity' || "$arg1" == '-p' ]]; then
+    exec ./llama-perplexity "$@"
 elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
    echo "Converting PTH to GGML..."
-    for i in `ls $1/$2/ggml-model-f16.bin*`; do
+    for i in $(ls $1/$2/ggml-model-f16.bin*); do
        if [ -f "${i/f16/q4_0}" ]; then
            echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
        else
@@ -30,6 +34,10 @@ else
    echo "Available commands: "
    echo "  --run (-r): Run a model previously converted into ggml"
    echo "              ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
+    echo "  --bench (-b): Benchmark the performance of the inference for various parameters."
+    echo "              ex: -m model.gguf"
+    echo "  --perplexity (-p): Measure the perplexity of a model over a given text."
+    echo "              ex: -m model.gguf -f file.txt"
    echo "  --convert (-c): Convert a llama model into ggml"
    echo "              ex: --outtype f16 \"/models/7B/\" "
    echo "  --quantize (-q): Optimize with quantization process ggml"
--- a/.devops/vulkan.Dockerfile
+++ b/.devops/vulkan.Dockerfile
@@ -1,4 +1,4 @@
-ARG UBUNTU_VERSION=jammy
+ARG UBUNTU_VERSION=24.04

 FROM ubuntu:$UBUNTU_VERSION AS build

@@ -7,7 +7,7 @@ RUN apt update && apt install -y git build-essential cmake wget

 # Install Vulkan SDK and cURL
 RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
-    wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
+    wget -qO /etc/apt/sources.list.d/lunarg-vulkan-noble.list https://packages.lunarg.com/vulkan/lunarg-vulkan-noble.list && \
    apt update -y && \
    apt-get install -y vulkan-sdk libcurl4-openssl-dev curl

@@ -34,7 +34,7 @@ RUN mkdir -p /app/full \
 FROM ubuntu:$UBUNTU_VERSION AS base

 RUN apt-get update \
-    && apt-get install -y libgomp1 curl\
+    && apt-get install -y libgomp1 curl libvulkan-dev \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
@@ -55,8 +55,9 @@ RUN apt-get update \
    git \
    python3 \
    python3-pip \
-    && pip install --upgrade pip setuptools wheel \
-    && pip install -r requirements.txt \
+    python3-wheel \
+    && pip install --break-system-packages --upgrade setuptools \
+    && pip install --break-system-packages -r requirements.txt \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
--- a/.editorconfig
+++ b/.editorconfig
@@ -40,3 +40,11 @@ indent_style = tab
 [examples/cvector-generator/*.txt]
 trim_trailing_whitespace = unset
 insert_final_newline = unset
+
+[models/templates/*.jinja]
+indent_style = unset
+indent_size = unset
+end_of_line = unset
+charset = unset
+trim_trailing_whitespace = unset
+insert_final_newline = unset
--- a/.github/ISSUE_TEMPLATE/020-enhancement.yml
+++ b/.github/ISSUE_TEMPLATE/020-enhancement.yml
@@ -6,7 +6,7 @@ body:
  - type: markdown
    attributes:
      value: |
-        [Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggerganov/llama.cpp/discussions/categories/ideas)
+        [Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggml-org/llama.cpp/discussions/categories/ideas)

  - type: checkboxes
    id: prerequisites
@@ -16,11 +16,11 @@ body:
      options:
        - label: I am running the latest code. Mention the version if possible as well.
          required: true
-        - label: I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
+        - label: I carefully followed the [README.md](https://github.com/ggml-org/llama.cpp/blob/master/README.md).
          required: true
        - label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed).
          required: true
-        - label: I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new and useful enhancement to share.
+        - label: I reviewed the [Discussions](https://github.com/ggml-org/llama.cpp/discussions), and have a new and useful enhancement to share.
          required: true

  - type: textarea
--- a/.github/ISSUE_TEMPLATE/030-research.yml
+++ b/.github/ISSUE_TEMPLATE/030-research.yml
@@ -6,7 +6,7 @@ body:
  - type: markdown
    attributes:
      value: |
-        Don't forget to check for any [duplicate research issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)
+        Don't forget to check for any [duplicate research issue tickets](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)

  - type: checkboxes
    id: research-stage
--- a/.github/ISSUE_TEMPLATE/040-refactor.yml
+++ b/.github/ISSUE_TEMPLATE/040-refactor.yml
@@ -6,8 +6,8 @@ body:
  - type: markdown
    attributes:
      value: |
-        Don't forget to [check for existing refactor issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered.
-        Also you may want to check [Pull request refactor label as well](https://github.com/ggerganov/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too.
+        Don't forget to [check for existing refactor issue tickets](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered.
+        Also you may want to check [Pull request refactor label as well](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too.

  - type: textarea
    id: background-description
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -1,11 +1,11 @@
 blank_issues_enabled: true
 contact_links:
  - name: Got an idea?
-    url: https://github.com/ggerganov/llama.cpp/discussions/categories/ideas
+    url: https://github.com/ggml-org/llama.cpp/discussions/categories/ideas
    about: Pop it there. It may then become an enhancement ticket.
  - name: Got a question?
-    url: https://github.com/ggerganov/llama.cpp/discussions/categories/q-a
+    url: https://github.com/ggml-org/llama.cpp/discussions/categories/q-a
    about: Ask a question there!
  - name: Want to contribute?
-    url: https://github.com/ggerganov/llama.cpp/wiki/contribute
+    url: https://github.com/ggml-org/llama.cpp/wiki/contribute
    about: Head to the contribution guide page of the wiki for areas you can help with
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -1 +1 @@
-*Make sure to read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md) before submitting a PR*
+*Make sure to read the [contributing guidelines](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md) before submitting a PR*
--- a/.github/workflows/bench.yml.disabled
+++ b/.github/workflows/bench.yml.disabled
@@ -1,5 +1,5 @@
 # TODO: there have been some issues with the workflow, so disabling for now
-#       https://github.com/ggerganov/llama.cpp/issues/7893
+#       https://github.com/ggml-org/llama.cpp/issues/7893
 #
 # Benchmark
 name: Benchmark
@@ -57,17 +57,7 @@ jobs:

    if: |
      inputs.gpu-series == 'Standard_NC4as_T4_v3'
-      || (
-        github.event_name == 'schedule'
-        && github.ref_name == 'master'
-        && github.repository_owner == 'ggerganov'
-      )
      || github.event_name == 'pull_request_target'
-      || (
-        github.event_name == 'push'
-        && github.event.ref == 'refs/heads/master'
-        && github.repository_owner == 'ggerganov'
-      )
    steps:
      - name: Clone
        id: checkout
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -10,10 +10,10 @@ on:
  push:
    branches:
      - master
-    paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
+    paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
  pull_request:
    types: [opened, synchronize, reopened]
-    paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
+    paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']

 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@@ -43,6 +43,12 @@ jobs:
        with:
          fetch-depth: 0

+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: macOS-latest-cmake-arm64
+          evict-old-files: 1d
+
      - name: Dependencies
        id: depends
        continue-on-error: true
@@ -53,15 +59,14 @@ jobs:
        id: cmake_build
        run: |
          sysctl -a
-          mkdir build
-          cd build
-          cmake .. \
+          cmake -B build \
+            -DCMAKE_BUILD_RPATH="@loader_path" \
            -DLLAMA_FATAL_WARNINGS=ON \
            -DLLAMA_CURL=ON \
            -DGGML_METAL_USE_BF16=ON \
            -DGGML_METAL_EMBED_LIBRARY=ON \
            -DGGML_RPC=ON
-          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
+          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)

      - name: Test
        id: cmake_test
@@ -87,6 +92,7 @@ jobs:
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        run: |
          cp LICENSE ./build/bin/
+          cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
          zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*

      - name: Upload artifacts
@@ -106,6 +112,12 @@ jobs:
        with:
          fetch-depth: 0

+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: macOS-latest-cmake-x64
+          evict-old-files: 1d
+
      - name: Dependencies
        id: depends
        continue-on-error: true
@@ -117,8 +129,9 @@ jobs:
        run: |
          sysctl -a
          # Metal is disabled due to intermittent failures with Github runners not having a GPU:
-          # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
+          # https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
          cmake -B build \
+            -DCMAKE_BUILD_RPATH="@loader_path" \
            -DLLAMA_FATAL_WARNINGS=ON \
            -DLLAMA_CURL=ON \
            -DGGML_METAL=OFF \
@@ -149,6 +162,7 @@ jobs:
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        run: |
          cp LICENSE ./build/bin/
+          cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
          zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*

      - name: Upload artifacts
@@ -158,8 +172,16 @@ jobs:
          path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
          name: llama-bin-macos-x64.zip

-  ubuntu-latest-cmake:
-    runs-on: ubuntu-latest
+  ubuntu-cpu-cmake:
+    strategy:
+      matrix:
+        include:
+          - build: 'x64'
+            os: ubuntu-22.04
+          - build: 'arm64'
+            os: ubuntu-22.04-arm
+
+    runs-on: ${{ matrix.os }}

    steps:
      - name: Clone
@@ -168,6 +190,12 @@ jobs:
        with:
          fetch-depth: 0

+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: ubuntu-cpu-cmake
+          evict-old-files: 1d
+
      - name: Dependencies
        id: depends
        run: |
@@ -177,10 +205,11 @@ jobs:
      - name: Build
        id: cmake_build
        run: |
-          mkdir build
-          cd build
-          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_RPC=ON
-          cmake --build . --config Release -j $(nproc)
+          cmake -B build \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DLLAMA_CURL=ON \
+            -DGGML_RPC=ON
+          cmake --build build --config Release -j $(nproc)

      - name: Test
        id: cmake_test
@@ -217,14 +246,15 @@ jobs:
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        run: |
          cp LICENSE ./build/bin/
-          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/*
+          cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
+          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/*

      - name: Upload artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        uses: actions/upload-artifact@v4
        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip
-          name: llama-bin-ubuntu-x64.zip
+          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip
+          name: llama-bin-ubuntu-${{ matrix.build }}.zip

  ubuntu-latest-cmake-sanitizer:
    runs-on: ubuntu-latest
@@ -234,7 +264,54 @@ jobs:
    strategy:
      matrix:
        sanitizer: [ADDRESS, THREAD, UNDEFINED]
-        build_type: [Debug, Release]
+        build_type: [Debug]
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: ubuntu-latest-cmake-sanitizer-${{ matrix.sanitizer }}
+          evict-old-files: 1d
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential
+
+      - name: Build
+        id: cmake_build
+        if: ${{ matrix.sanitizer != 'THREAD' }}
+        run: |
+          cmake -B build \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
+            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+          cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
+
+      - name: Build (no OpenMP)
+        id: cmake_build_no_openmp
+        if: ${{ matrix.sanitizer == 'THREAD' }}
+        run: |
+          cmake -B build \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
+            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
+            -DGGML_OPENMP=OFF
+          cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose --timeout 900
+
+  ubuntu-latest-llguidance:
+    runs-on: ubuntu-latest

    steps:
      - name: Clone
@@ -249,21 +326,13 @@ jobs:

      - name: Build
        id: cmake_build
-        if: ${{ matrix.sanitizer != 'THREAD' }}
        run: |
          mkdir build
          cd build
-          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
-          cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
-
-      - name: Build (no OpenMP)
-        id: cmake_build_no_openmp
-        if: ${{ matrix.sanitizer == 'THREAD' }}
-        run: |
-          mkdir build
-          cd build
-          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DGGML_OPENMP=OFF
-          cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
+          cmake .. \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DLLAMA_LLGUIDANCE=ON
+          cmake --build . --config Release -j $(nproc)

      - name: Test
        id: cmake_test
@@ -281,6 +350,12 @@ jobs:
        id: checkout
        uses: actions/checkout@v4

+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: ubuntu-latest-cmake-rpc
+          evict-old-files: 1d
+
      - name: Dependencies
        id: depends
        run: |
@@ -290,10 +365,9 @@ jobs:
      - name: Build
        id: cmake_build
        run: |
-          mkdir build
-          cd build
-          cmake -DGGML_RPC=ON ..
-          cmake --build . --config Release -j $(nproc)
+          cmake -B build \
+            -DGGML_RPC=ON
+          cmake --build build --config Release -j $(nproc)

      - name: Test
        id: cmake_test
@@ -308,6 +382,14 @@ jobs:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: ubuntu-22-cmake-vulkan
+          evict-old-files: 1d

      - name: Dependencies
        id: depends
@@ -320,16 +402,44 @@ jobs:
      - name: Build
        id: cmake_build
        run: |
-          mkdir build
-          cd build
-          cmake -DGGML_VULKAN=ON ..
-          cmake --build . --config Release -j $(nproc)
+          cmake -B build \
+            -DGGML_VULKAN=ON
+          cmake --build build --config Release -j $(nproc)

      - name: Test
        id: cmake_test
        run: |
          cd build
-          ctest -L main --verbose --timeout 900
+          # This is using llvmpipe and runs slower than other backends
+          ctest -L main --verbose --timeout 2700
+
+      - name: Determine tag name
+        id: tag
+        shell: bash
+        run: |
+          BUILD_NUMBER="$(git rev-list --count HEAD)"
+          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
+            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
+          else
+            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
+            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        run: |
+          cp LICENSE ./build/bin/
+          cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
+          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip ./build/bin/*
+
+      - name: Upload artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip
+          name: llama-bin-ubuntu-vulkan-x64.zip

  ubuntu-22-cmake-hip:
    runs-on: ubuntu-22.04
@@ -346,21 +456,34 @@ jobs:
          sudo apt-get update
          sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev

+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: ubuntu-22-cmake-hip
+          evict-old-files: 1d
+
      - name: Build with native CMake HIP support
        id: cmake_build
        run: |
-          cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DGGML_HIP=ON
+          cmake -B build -S . \
+            -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
+            -DGGML_HIP_ROCWMMA_FATTN=ON \
+            -DGGML_HIP=ON
          cmake --build build --config Release -j $(nproc)

      - name: Build with legacy HIP support
        id: cmake_build_legacy_hip
        run: |
-          cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIP=ON
+          cmake -B build2 -S . \
+            -DCMAKE_C_COMPILER=hipcc \
+            -DCMAKE_CXX_COMPILER=hipcc \
+            -DGGML_HIP_ROCWMMA_FATTN=ON \
+            -DGGML_HIP=ON
          cmake --build build2 --config Release -j $(nproc)

  ubuntu-22-cmake-musa:
    runs-on: ubuntu-22.04
-    container: mthreads/musa:rc3.1.0-devel-ubuntu22.04
+    container: mthreads/musa:rc3.1.1-devel-ubuntu22.04

    steps:
      - name: Clone
@@ -373,10 +496,17 @@ jobs:
          apt-get update
          apt-get install -y build-essential git cmake libcurl4-openssl-dev

+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: ubuntu-22-cmake-musa
+          evict-old-files: 1d
+
      - name: Build with native CMake MUSA support
        id: cmake_build
        run: |
-          cmake -B build -S . -DGGML_MUSA=ON
+          cmake -B build -S . \
+            -DGGML_MUSA=ON
          cmake --build build --config Release -j $(nproc)

  ubuntu-22-cmake-sycl:
@@ -411,14 +541,21 @@ jobs:
        id: checkout
        uses: actions/checkout@v4

+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: ubuntu-22-cmake-sycl
+          evict-old-files: 1d
+
      - name: Build
        id: cmake_build
        run: |
          source /opt/intel/oneapi/setvars.sh
-          mkdir build
-          cd build
-          cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
-          cmake --build . --config Release -j $(nproc)
+          cmake -B build \
+            -DGGML_SYCL=ON \
+            -DCMAKE_C_COMPILER=icx \
+            -DCMAKE_CXX_COMPILER=icpx
+          cmake --build build --config Release -j $(nproc)

  ubuntu-22-cmake-sycl-fp16:
    runs-on: ubuntu-22.04
@@ -452,47 +589,22 @@ jobs:
        id: checkout
        uses: actions/checkout@v4

+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: ubuntu-22-cmake-sycl-fp16
+          evict-old-files: 1d
+
      - name: Build
        id: cmake_build
        run: |
          source /opt/intel/oneapi/setvars.sh
-          mkdir build
-          cd build
-          cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON ..
-          cmake --build . --config Release -j $(nproc)
-
-  # TODO: build with GGML_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know
-  #       how to debug it.
-  #       ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584
-  #       would be great if we fix these
-  macOS-latest-cmake:
-    runs-on: macos-latest
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Dependencies
-        id: depends
-        continue-on-error: true
-        run: |
-          brew update
-
-      - name: Build
-        id: cmake_build
-        run: |
-          sysctl -a
-          mkdir build
-          cd build
-          cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF ..
-          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose --timeout 900
+          cmake -B build \
+            -DGGML_SYCL=ON \
+            -DCMAKE_C_COMPILER=icx \
+            -DCMAKE_CXX_COMPILER=icpx \
+            -DGGML_SYCL_F16=ON
+          cmake --build build --config Release -j $(nproc)

  macOS-latest-cmake-ios:
    runs-on: macos-latest
@@ -502,6 +614,12 @@ jobs:
        id: checkout
        uses: actions/checkout@v4

+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: macOS-latest-cmake-ios
+          evict-old-files: 1d
+
      - name: Dependencies
        id: depends
        continue-on-error: true
@@ -512,9 +630,7 @@ jobs:
        id: cmake_build
        run: |
          sysctl -a
-          mkdir build
-          cd build
-          cmake -G Xcode .. \
+          cmake -B build -G Xcode \
            -DGGML_METAL_USE_BF16=ON \
            -DGGML_METAL_EMBED_LIBRARY=ON \
            -DLLAMA_BUILD_EXAMPLES=OFF \
@@ -523,7 +639,7 @@ jobs:
            -DCMAKE_SYSTEM_NAME=iOS \
            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
-          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
+          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO

  macOS-latest-cmake-tvos:
    runs-on: macos-latest
@@ -533,6 +649,12 @@ jobs:
        id: checkout
        uses: actions/checkout@v4

+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: macOS-latest-cmake-tvos
+          evict-old-files: 1d
+
      - name: Dependencies
        id: depends
        continue-on-error: true
@@ -543,9 +665,7 @@ jobs:
        id: cmake_build
        run: |
          sysctl -a
-          mkdir build
-          cd build
-          cmake -G Xcode .. \
+          cmake -B build -G Xcode \
            -DGGML_METAL_USE_BF16=ON \
            -DGGML_METAL_EMBED_LIBRARY=ON \
            -DLLAMA_BUILD_EXAMPLES=OFF \
@@ -554,7 +674,7 @@ jobs:
            -DCMAKE_SYSTEM_NAME=tvOS \
            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
-          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
+          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO

  macOS-latest-swift:
    runs-on: macos-latest
@@ -568,6 +688,12 @@ jobs:
        id: checkout
        uses: actions/checkout@v4

+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: macOS-latest-swift
+          evict-old-files: 1d
+
      - name: Dependencies
        id: depends
        continue-on-error: true
@@ -578,22 +704,19 @@ jobs:
        id: cmake_build
        run: |
          sysctl -a
-          mkdir build
-          cd build
-          cmake -G Xcode .. \
+          cmake -B build -G Xcode \
            -DGGML_METAL_USE_BF16=ON \
            -DGGML_METAL_EMBED_LIBRARY=ON \
            -DLLAMA_BUILD_EXAMPLES=OFF \
            -DLLAMA_BUILD_TESTS=OFF \
            -DLLAMA_BUILD_SERVER=OFF \
            -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64"
-          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
-          sudo cmake --install . --config Release
+          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)

      - name: xcodebuild for swift package
        id: xcodebuild
        run: |
-          xcodebuild -scheme llama-Package -destination "${{ matrix.destination }}"
+          ./build-xcframework.sh

  windows-msys2:
    runs-on: windows-latest
@@ -609,6 +732,13 @@ jobs:
      - name: Clone
        uses: actions/checkout@v4

+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: windows-msys2
+          variant: sccache
+          evict-old-files: 1d
+
      - name: Setup ${{ matrix.sys }}
        uses: msys2/setup-msys2@v2
        with:
@@ -616,6 +746,7 @@ jobs:
          msystem: ${{matrix.sys}}
          install: >-
            base-devel
+            git
            mingw-w64-${{matrix.env}}-toolchain
            mingw-w64-${{matrix.env}}-cmake
            mingw-w64-${{matrix.env}}-openblas
@@ -676,6 +807,13 @@ jobs:
        with:
          fetch-depth: 0

+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: windows-latest-cmake-${{ matrix.build }}
+          variant: sccache
+          evict-old-files: 1d
+
      - name: Clone Kompute submodule
        id: clone_kompute
        if: ${{ matrix.build == 'kompute-x64' }}
@@ -715,21 +853,19 @@ jobs:
        run: |
          git clone https://github.com/KhronosGroup/OpenCL-Headers
          cd OpenCL-Headers
-          mkdir build && cd build
-          cmake .. `
+          cmake -B build `
            -DBUILD_TESTING=OFF `
            -DOPENCL_HEADERS_BUILD_TESTING=OFF `
            -DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
-          cmake --build . --target install
+          cmake --build build --target install
          git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
          cd OpenCL-ICD-Loader
-          mkdir build-arm64-release && cd build-arm64-release
-          cmake .. `
+          cmake -B build-arm64-release `
            -A arm64 `
            -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" `
            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
-          cmake --build . --target install --config release
+          cmake --build build-arm64-release --target install --config release

      - name: Build
        id: cmake_build
@@ -796,6 +932,7 @@ jobs:
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        run: |
          Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
+          Copy-Item .\examples\run\linenoise.cpp\LICENSE .\build\bin\Release\linenoise.cpp.txt
          7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*

      - name: Upload artifacts
@@ -813,6 +950,8 @@ jobs:
        - name: Clone
          id: checkout
          uses: actions/checkout@v4
+          with:
+            fetch-depth: 0

        - name: Install dependencies
          env:
@@ -821,9 +960,21 @@ jobs:
              apt update
              apt install -y cmake build-essential ninja-build libgomp1 git

+        - name: ccache
+          uses: hendrikmuhs/ccache-action@v1.2.16
+          with:
+            key: ubuntu-latest-cmake-cuda
+            evict-old-files: 1d
+
        - name: Build with CMake
          run: |
-            cmake -S . -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=89-real -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined -DLLAMA_FATAL_WARNINGS=ON
+            cmake -S . -B build -G Ninja \
+              -DCMAKE_BUILD_TYPE=Release \
+              -DCMAKE_CUDA_ARCHITECTURES=89-real \
+              -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined \
+              -DLLAMA_FATAL_WARNINGS=ON \
+              -DGGML_NATIVE=OFF \
+              -DGGML_CUDA=ON
            cmake --build build

  windows-2019-cmake-cuda:
@@ -841,6 +992,13 @@ jobs:
        with:
            fetch-depth: 0

+      - name: Install ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: ${{ github.job }}-${{ matrix.cuda }}-${{ matrix.build }}
+          variant: sccache
+          evict-old-files: 1d
+
      - name: Install Cuda Toolkit 11.7
        if: ${{ matrix.cuda == '11.7' }}
        run: |
@@ -897,11 +1055,6 @@ jobs:
          echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
          echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8

-      - name: Install ccache
-        uses: hendrikmuhs/ccache-action@v1.2
-        with:
-          key: ${{ github.job }}-${{ matrix.cuda }}-${{ matrix.build }}
-
      - name: Install Ninja
        id: install_ninja
        run: |
@@ -912,7 +1065,11 @@ jobs:
        shell: cmd
        run: |
          call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
-          cmake -S . -B build -G "Ninja Multi-Config" -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DGGML_RPC=ON
+          cmake -S . -B build -G "Ninja Multi-Config" ^
+            -DLLAMA_BUILD_SERVER=ON ^
+            -DGGML_NATIVE=OFF ^
+            -DGGML_CUDA=ON ^
+            -DGGML_RPC=ON
          set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
          cmake --build build --config Release -j %NINJA_JOBS% -t ggml
          cmake --build build --config Release
@@ -977,6 +1134,13 @@ jobs:
        with:
          fetch-depth: 0

+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: windows-latest-cmake-sycl
+          variant: sccache
+          evict-old-files: 1d
+
      - name: Install
        run:  |
          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
@@ -1040,6 +1204,11 @@ jobs:
        id: checkout
        uses: actions/checkout@v4

+      - name: Clone rocWMMA repository
+        id: clone_rocwmma
+        run: |
+          git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1
+
      - name: Install
        id: depends
        run: |
@@ -1056,16 +1225,24 @@ jobs:
          & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version

      - name: Install ccache
-        uses: hendrikmuhs/ccache-action@v1.2
+        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: ${{ github.job }}
+          evict-old-files: 1d

      - name: Build
        id: cmake_build
        run: |
          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
          $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
-          cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIP=ON -DCMAKE_BUILD_TYPE=Release -DGGML_RPC=ON
+          cmake -G "Unix Makefiles" -B build -S . `
+            -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
+            -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
+            -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/" `
+            -DCMAKE_BUILD_TYPE=Release `
+            -DGGML_HIP=ON `
+            -DGGML_HIP_ROCWMMA_FATTN=ON `
+            -DGGML_RPC=ON
          cmake --build build -j ${env:NUMBER_OF_PROCESSORS}

  windows-latest-cmake-hip-release:
@@ -1083,6 +1260,17 @@ jobs:
        with:
            fetch-depth: 0

+      - name: Clone rocWMMA repository
+        id: clone_rocwmma
+        run: |
+          git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: windows-latest-cmake-hip-release
+          evict-old-files: 1d
+
      - name: Install
        id: depends
        run: |
@@ -1103,7 +1291,15 @@ jobs:
        run: |
          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
          $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
-          cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIP=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=${{ matrix.gpu_target }} -DGGML_RPC=ON
+          cmake -G "Unix Makefiles" -B build -S . `
+            -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
+            -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
+            -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/" `
+            -DCMAKE_BUILD_TYPE=Release `
+            -DAMDGPU_TARGETS=${{ matrix.gpu_target }} `
+            -DGGML_HIP_ROCWMMA_FATTN=ON `
+            -DGGML_HIP=ON `
+            -DGGML_RPC=ON
          cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
          md "build\bin\rocblas\library\"
          cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
@@ -1140,14 +1336,14 @@ jobs:
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0

      - name: Build
        id: cmake_build
        run: |
          sysctl -a
-          mkdir build
-          cd build
-          cmake -G Xcode .. \
+          cmake -B build -G Xcode \
            -DGGML_METAL_USE_BF16=ON \
            -DGGML_METAL_EMBED_LIBRARY=ON \
            -DLLAMA_BUILD_EXAMPLES=OFF \
@@ -1156,16 +1352,41 @@ jobs:
            -DCMAKE_SYSTEM_NAME=iOS \
            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
-          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
-          sudo cmake --install . --config Release
+          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO

      - name: xcodebuild for swift package
        id: xcodebuild
        run: |
-          xcodebuild -scheme llama-Package -destination 'generic/platform=iOS'
+          ./build-xcframework.sh

      - name: Build Xcode project
-        run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build
+        run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build
+
+      - name: Determine tag name
+        id: tag
+        shell: bash
+        run: |
+          BUILD_NUMBER="$(git rev-list --count HEAD)"
+          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
+            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
+          else
+            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
+            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        run: |
+          zip -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
+
+      - name: Upload artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-${{ steps.tag.outputs.name }}-xcframework.zip
+          name: llama-${{ steps.tag.outputs.name }}-xcframework

  android-build:
    runs-on: ubuntu-latest
@@ -1174,6 +1395,12 @@ jobs:
      - name: Clone
        uses: actions/checkout@v4

+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: android-build
+          evict-old-files: 1d
+
      - name: Set up JDK
        uses: actions/setup-java@v3
        with:
@@ -1197,10 +1424,11 @@ jobs:
    runs-on: ubuntu-latest

    needs:
-      - ubuntu-latest-cmake
-      - macOS-latest-cmake
+      - ubuntu-cpu-cmake
+      - ubuntu-22-cmake-vulkan
      - windows-latest-cmake
      - windows-2019-cmake-cuda
+      - windows-latest-cmake-sycl
      - windows-latest-cmake-hip-release
      - macOS-latest-cmake-arm64
      - macOS-latest-cmake-x64
@@ -1212,6 +1440,12 @@ jobs:
        with:
          fetch-depth: 0

+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: release
+          evict-old-files: 1d
+
      - name: Determine tag name
        id: tag
        shell: bash
@@ -1457,3 +1691,37 @@ jobs:
 #          popd
 #          emcmake cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }}
 #          make
+
+  openEuler-latest-cmake-cann:
+    if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'Ascend NPU') }}
+    defaults:
+      run:
+       shell: bash -el {0}
+    runs-on: ubuntu-24.04-arm
+    strategy:
+      matrix:
+        cann:
+          - '8.0.rc3.beta1-910b-openeuler22.03-py3.10'
+        device:
+          - 'ascend910b3'
+        build:
+          - 'Release'
+    container: ascendai/cann:${{ matrix.cann }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Dependencies
+        run: |
+          yum update -y
+          yum install -y git gcc gcc-c++ make cmake
+
+      - name: Build
+        run: |
+          export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
+
+          cmake -S . -B build \
+              -DCMAKE_BUILD_TYPE=${{ matrix.build }} \
+              -DGGML_CANN=on \
+              -DSOC_TYPE=${{ matrix.device }}
+          cmake --build build -j $(nproc)
--- a/.github/workflows/close-issue.yml
+++ b/.github/workflows/close-issue.yml
@@ -17,7 +17,7 @@ jobs:
    steps:
      - uses: actions/stale@v5
        with:
-          exempt-issue-labels: "refactor,help wanted,good first issue,research,bug"
+          exempt-issue-labels: "refactor,help wanted,good first issue,research,bug,roadmap"
          days-before-issue-stale: 30
          days-before-issue-close: 14
          stale-issue-label: "stale"
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -28,10 +28,11 @@ jobs:
  push_to_registry:
    name: Push Docker image to Docker Hub

-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
    env:
      COMMIT_SHA: ${{ github.sha }}
    strategy:
+      fail-fast: false
      matrix:
        config:
          # Multi-stage build
@@ -50,6 +51,8 @@ jobs:

      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3
+        with:
+          image: tonistiigi/binfmt:qemu-v7.0.0-28

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
--- a/.github/workflows/labeler.yml
+++ b/.github/workflows/labeler.yml
@@ -11,7 +11,7 @@ jobs:
    steps:
    - uses: actions/checkout@v4
      with:
-        repository: "ggerganov/llama.cpp"
+        repository: "ggml-org/llama.cpp"
    - uses: actions/labeler@v5
      with:
        configuration-path: '.github/labeler.yml'
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -81,13 +81,36 @@ jobs:
        with:
          node-version: '22.11.0'

+      - name: WebUI - Install dependencies
+        id: webui_lint
+        run: |
+          cd examples/server/webui
+          npm ci
+
+      - name: WebUI - Check code format
+        id: webui_format
+        run: |
+          git config --global --add safe.directory $(realpath .)
+          cd examples/server/webui
+          git status
+
+          npm run format
+          git status
+          modified_files="$(git status -s)"
+          echo "Modified files: ${modified_files}"
+          if [ -n "${modified_files}" ]; then
+            echo "Files do not follow coding style. To fix: npm run format"
+            echo "${modified_files}"
+            exit 1
+          fi
+
      - name: Verify bundled index.html
        id: verify_server_index_html
        run: |
          git config --global --add safe.directory $(realpath .)
          cd examples/server/webui
          git status
-          npm ci
+
          npm run build
          git status
          modified_files="$(git status -s)"
@@ -112,9 +135,9 @@ jobs:
              -DGGML_OPENMP=OFF ;
          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server

-      - name: Build
-        id: cmake_build
-        if: ${{ matrix.sanitizer != 'THREAD' }}
+      - name: Build (sanitizers)
+        id: cmake_build_sanitizers
+        if: ${{ matrix.sanitizer != '' && matrix.sanitizer != 'THREAD' }}
        run: |
          cmake -B build \
              -DGGML_NATIVE=OFF \
@@ -124,12 +147,33 @@ jobs:
              -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server

+      - name: Build (sanitizers)
+        id: cmake_build
+        if: ${{ matrix.sanitizer == '' }}
+        run: |
+          cmake -B build \
+              -DGGML_NATIVE=OFF \
+              -DLLAMA_BUILD_SERVER=ON \
+              -DLLAMA_CURL=ON \
+              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
+          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
+
      - name: Tests
        id: server_integration_tests
+        if: ${{ matrix.sanitizer == '' }}
+        env:
+          GITHUB_ACTIONS: "true"
        run: |
          cd examples/server/tests
          ./tests.sh

+      - name: Tests (sanitizers)
+        id: server_integration_tests_sanitizers
+        if: ${{ matrix.sanitizer != '' }}
+        run: |
+          cd examples/server/tests
+          LLAMA_SANITIZE=1 ./tests.sh
+
      - name: Slow tests
        id: server_integration_tests_slow
        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
@@ -186,7 +230,7 @@ jobs:
        run: |
          cd examples/server/tests
          $env:PYTHONIOENCODING = ":replace"
-          pytest -v -x
+          pytest -v -x -m "not slow"

      - name: Slow tests
        id: server_integration_tests_slow
--- a/.gitignore
+++ b/.gitignore
@@ -18,6 +18,7 @@
 *.metallib
 *.o
 *.so
+*.swp
 *.tmp

 # IDE / OS
@@ -44,6 +45,8 @@ lcov-report/
 tags
 .build/
 build*
+release
+debug
 !build-info.cmake
 !build-info.cpp.in
 !build-info.sh
@@ -97,6 +100,7 @@ examples/server/*.css.hpp
 examples/server/*.html.hpp
 examples/server/*.js.hpp
 examples/server/*.mjs.hpp
+examples/server/*.gz.hpp
 !build_64.sh
 !examples/*.bat
 !examples/*/*.kts
--- a/142
+++ b/142
@@ -1,4 +1,4 @@
-# date: Thu Nov 28 20:46:15 EET 2024
+# date: Sat Mar  8 18:23:52 EET 2025
 # this file is auto-generated by scripts/gen-authors.sh

 0cc4m <picard12@live.de>
@@ -8,10 +8,12 @@
 3ooabkhxtn <31479382+3ooabkhxtn@users.noreply.github.com>
 44670 <44670@users.noreply.github.com>
 65a <10104049+65a@users.noreply.github.com>
+708-145 <40387547+708-145@users.noreply.github.com>
 AN Long <aisk@users.noreply.github.com>
 AT <manyoso@users.noreply.github.com>
 Aarni Koskela <akx@iki.fi>
 Aaron Miller <apage43@ninjawhale.com>
+Aaron Teo <57927438+taronaeo@users.noreply.github.com>
 Aaryaman Vasishta <aaryaman.vasishta@amd.com>
 Abheek Gulati <abheekg@hotmail.com>
 Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
@@ -20,21 +22,27 @@ Adithya Balaji <adithya.b94@gmail.com>
 AdithyanI <adithyan.i4internet@gmail.com>
 Adrian <smith.adriane@gmail.com>
 Adrian Hesketh <a-h@users.noreply.github.com>
+Adrian Kretz <me@akretz.com>
+Adrien Gallouët <adrien@gallouet.fr>
+Adrien Gallouët <angt@huggingface.co>
 Ahmad Tameem <113388789+Tameem-10xE@users.noreply.github.com>
 Ahmet Zeer <ahmed.zeer@std.yildiz.edu.tr>
 AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com>
 AidanBeltonS <aidan.belton@codeplay.com>
 Aisuko <urakiny@gmail.com>
 Akarshan Biswas <akarshan.biswas@gmail.com>
+Akarshan Biswas <akarshan@menlo.ai>
 Akarshan Biswas <akarshanbiswas@fedoraproject.org>
 Al Mochkin <14274697+amochkin@users.noreply.github.com>
 Albert Jin <albert.jin@gmail.com>
 Alberto <57916483+albbus-stack@users.noreply.github.com>
 Alberto Cabrera Pérez <alberto.cabrera@codeplay.com>
 Alberto Cabrera Pérez <alberto.cabrera@intel.com>
+Aleksei Nikiforov <103434461+AlekseiNikiforovIBM@users.noreply.github.com>
 Alex <awhill19@icloud.com>
 Alex Azarov <alex@azarov.by>
 Alex Azarov <alexander.azarov@mapbox.com>
+Alex Brooks <alex.brooks@ibm.com>
 Alex Klinkhamer <from.github.com.917@grencez.dev>
 Alex Klinkhamer <git@grencez.dev>
 Alex Nguyen <tiendung@users.noreply.github.com>
@@ -55,6 +63,7 @@ Ananta Bastola <anantarajbastola@gmail.com>
 Anas Ahouzi <112881240+aahouzi@users.noreply.github.com>
 András Salamon <ott2@users.noreply.github.com>
 Andreas (Andi) Kunar <andreask@msn.com>
+Andreas Kieslinger <47689530+aendk@users.noreply.github.com>
 Andrei <abetlen@gmail.com>
 Andrew Canis <andrew.canis@gmail.com>
 Andrew Downing <andrew2085@gmail.com>
@@ -64,6 +73,7 @@ Andrew Minh Nguyen <40281306+amqdn@users.noreply.github.com>
 Andy Salerno <andysalerno@gmail.com>
 Andy Tai <andy-tai@users.noreply.github.com>
 Anthony Van de Gejuchte <anthonyvdgent@gmail.com>
+Antoine Viallon <antoine@lesviallon.fr>
 Antonis Makropoulos <benuix@gmail.com>
 Arik Poznanski <arikpoz@users.noreply.github.com>
 Armen Kaleshian <kriation@users.noreply.github.com>
@@ -80,6 +90,7 @@ Atsushi Tatsuma <yoshoku@outlook.com>
 Austin <77757836+teleprint-me@users.noreply.github.com>
 AustinMroz <austinmroz@utexas.edu>
 BADR <contact@pythops.com>
+BB-fat <45072480+BB-fat@users.noreply.github.com>
 Bach Le <bach@bullno1.com>
 Bailey Chittle <39804642+bachittle@users.noreply.github.com>
 BarfingLemurs <128182951+BarfingLemurs@users.noreply.github.com>
@@ -91,13 +102,18 @@ Ben Siraphob <bensiraphob@gmail.com>
 Ben Williams <ben@719ben.com>
 Benjamin Findley <39356821+Kartoffelsaft@users.noreply.github.com>
 Benjamin Lecaillon <84293038+blecaillon@users.noreply.github.com>
+Benson Wong <mostlygeek@gmail.com>
 Bernat Vadell <hounter.caza@gmail.com>
+Bernhard M. Wiedemann <githubbmwprimary@lsmod.de>
 Bert Wagner <github@bertwagner.com>
+Billel Mokeddem <billel.mokeddem.ml@gmail.com>
 Bingan <70050083+binganao@users.noreply.github.com>
 Bjarke Viksøe <164612031+bviksoe@users.noreply.github.com>
+Bodhi <3882561+BodhiHu@users.noreply.github.com>
 Bodo Graumann <mail@bodograumann.de>
 Bono Lv <lvscar@users.noreply.github.com>
 Borislav Stanimirov <b.stanimirov@abv.bg>
+Borislav Stanimirov <b@ibob.bg>
 Branden Butler <bwtbutler@hotmail.com>
 Brandon Squizzato <35474886+bsquizz@users.noreply.github.com>
 Brian <mofosyne@gmail.com>
@@ -117,9 +133,11 @@ Casey Primozic <casey@cprimozic.net>
 Casey Primozic <me@ameo.link>
 CausalLM <148736309+CausalLM@users.noreply.github.com>
 Cebtenzzre <cebtenzzre@gmail.com>
+CentricStorm <CentricStorm@users.noreply.github.com>
 Chad Brewbaker <crb002@gmail.com>
 Changyeon Kim <cyzero.kim@samsung.com>
 Chao Jiang <jc19chaoj@zoho.com>
+Charles Duffy <charles@dyfis.net>
 Charles Xu <63788048+chaxu01@users.noreply.github.com>
 Charles Xu <charles.xu@arm.com>
 Chen Xi <xi2.chen@intel.com>
@@ -131,12 +149,17 @@ Chris Kuehl <ckuehl@ckuehl.me>
 Christian Demsar <christian@github.email.demsar.us>
 Christian Demsar <crasm@git.vczf.us>
 Christian Falch <875252+chrfalch@users.noreply.github.com>
+Christian Fillion <cfillion@users.noreply.github.com>
+Christian Kastner <ckk@kvr.at>
 Christian Kögler <ck3d@gmx.de>
 Christian Köhnenkamp <cvk5@me.com>
 Christian Zhou-Zheng <59622928+christianazinn@users.noreply.github.com>
+Christopher Nielsen <62156882+mascguy@users.noreply.github.com>
 Clark Saben <76020733+csaben@users.noreply.github.com>
+Clauszy <zhangyub@uniontech.com>
 Clint Herron <hanclinto@gmail.com>
 Conrad Kramer <conrad@conradkramer.com>
+Corentin REGAL <corentin.regal@gmail.com>
 CrispStrobe <154636388+CrispStrobe@users.noreply.github.com>
 Csaba Kecskemeti <csaba.kecskemeti@gmail.com>
 Cuong Trinh Manh <nguoithichkhampha@gmail.com>
@@ -152,6 +175,7 @@ Daniel Hiltgen <dhiltgen@users.noreply.github.com>
 Daniel Illescas Romero <illescas.daniel@protonmail.com>
 Daniel Kleine <53251018+d-kleine@users.noreply.github.com>
 Daniele <57776841+daniandtheweb@users.noreply.github.com>
+Danny Milosavljevic <dannym@friendly-machines.com>
 DannyDaemonic <DannyDaemonic@gmail.com>
 Dat Quoc Nguyen <2412555+datquocnguyen@users.noreply.github.com>
 Dave <dave-fl@users.noreply.github.com>
@@ -159,6 +183,7 @@ Dave Airlie <airlied@gmail.com>
 Dave Airlie <airlied@redhat.com>
 Dave Della Costa <ddellacosta+github@gmail.com>
 David Friehs <david@friehs.info>
+David Huang <1969802+hjc4869@users.noreply.github.com>
 David Kennedy <dakennedyd@gmail.com>
 David Pflug <david@pflug.email>
 David Renshaw <dwrenshaw@gmail.com>
@@ -176,6 +201,7 @@ Dibakar Gope <dibakar.gope@arm.com>
 Didzis Gosko <didzis@users.noreply.github.com>
 Diego Devesa <slarengh@gmail.com>
 Diogo Teles Sant'Anna <diogoteles@google.com>
+Djip007 <3705339+Djip007@users.noreply.github.com>
 Djip007 <djip.perois@free.fr>
 Don Mahurin <dmahurin@users.noreply.github.com>
 DooWoong Lee (David) <manics99@naver.com>
@@ -193,6 +219,7 @@ Edward Taylor <edeetee@gmail.com>
 Elaine <elaine.zosa@gmail.com>
 Elbios <141279586+Elbios@users.noreply.github.com>
 Elton Kola <eltonkola@gmail.com>
+Emreerdog <34742675+Emreerdog@users.noreply.github.com>
 Engininja2 <139037756+Engininja2@users.noreply.github.com>
 Equim <sayaka@ekyu.moe>
 Eric Curtin <ecurtin@redhat.com>
@@ -223,6 +250,7 @@ Felix <stenbackfelix@gmail.com>
 Finn Voorhees <finnvoorhees@gmail.com>
 Firat <firatkiral@gmail.com>
 FirstTimeEZ <179362031+FirstTimeEZ@users.noreply.github.com>
+Florent BENOIT <fbenoit@redhat.com>
 Folko-Ven <71110216+Folko-Ven@users.noreply.github.com>
 Foul-Tarnished <107711110+Foul-Tarnished@users.noreply.github.com>
 Francisco Melo <43780565+francis2tm@users.noreply.github.com>
@@ -233,6 +261,7 @@ Fred Douglas <43351173+fredlas@users.noreply.github.com>
 Frederik Vogel <Schaltfehler@users.noreply.github.com>
 Gabe Goodhart <gabe.l.hart@gmail.com>
 Gabe Goodhart <ghart@us.ibm.com>
+Gaetan Bisson <gaetan@fenua.org>
 GainLee <perfecter.gen@gmail.com>
 Galunid <karolek1231456@gmail.com>
 Gary Linscott <glinscott@gmail.com>
@@ -240,6 +269,7 @@ Gary Mulder <gjmulder@gmail.com>
 Gavin Zhao <gavinzhaojw@protonmail.com>
 Genkagaku.GPT <hlhr202@163.com>
 Georgi Gerganov <ggerganov@gmail.com>
+Gian-Carlo Pascutto <gcp@sjeng.org>
 Gilad S <giladgd@users.noreply.github.com>
 Gilad S. <7817232+giladgd@users.noreply.github.com>
 Giuseppe Scrivano <giuseppe@scrivano.org>
@@ -249,21 +279,27 @@ Guillaume "Vermeille" Sanchez <Guillaume.V.Sanchez@gmail.com>
 Guillaume Wenzek <gwenzek@users.noreply.github.com>
 Guoliang Hua <32868157+nbcsm@users.noreply.github.com>
 Guoteng <32697156+SolenoidWGT@users.noreply.github.com>
+Guspan Tanadi <36249910+guspan-tanadi@users.noreply.github.com>
 Gustavo Rocha Dias <91472747+gustrd@users.noreply.github.com>
 Haggai Nuchi <h.nuchi@gmail.com>
 Halalaluyafail3 <55773281+Halalaluyafail3@users.noreply.github.com>
+Hale Chan <halechan@qq.com>
 Hamdoud Hakem <90524568+hamdoudhakem@users.noreply.github.com>
+Han Yin <han.yin@arm.com>
 HanishKVC <hanishkvc@gmail.com>
 Haohui Mai <ricetons@gmail.com>
 Haoxiang Fei <tonyfettes@tonyfettes.com>
 Harald Fernengel <harald.fernengel@here.com>
 Hatsune Miku <129688334+at8u@users.noreply.github.com>
 HatsuneMikuUwU33 <173229399+HatsuneMikuUwU33@users.noreply.github.com>
+Haus1 <haus.xda@gmail.com>
 Henk Poley <HenkPoley@gmail.com>
 Henri Vasserman <henv@hot.ee>
 Henrik Forstén <henrik.forsten@gmail.com>
+Henry Linjamäki <henry.linjamaki@gmail.com>
 Herman Semenov <GermanAizek@yandex.ru>
 Hesen Peng <hesen.peng@gmail.com>
+HimariO <dsfhe49854@gmail.com>
 Hoang Nguyen <hugo53@users.noreply.github.com>
 Hong Bo PENG <penghb@cn.ibm.com>
 Hongyu Ouyang <96765450+casavaca@users.noreply.github.com>
@@ -280,6 +316,7 @@ Icecream95 <the.real.icecream95@gmail.com>
 Ido S <ido.pluto@gmail.com>
 IgnacioFDM <ignaciofdm@gmail.com>
 Igor Okulist <okigan@gmail.com>
+Ihar Hrachyshka <ihrachys@redhat.com>
 Ikko Eltociear Ashimine <eltociear@gmail.com>
 Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
 Ionoclast Laboratories <brigham@ionoclast.com>
@@ -289,12 +326,15 @@ Ivan <nekotekina@gmail.com>
 Ivan Filipov <159561759+vanaka11@users.noreply.github.com>
 Ivan Komarov <Ivan.Komarov@dfyz.info>
 Ivan Stepanov <ivanstepanovftw@gmail.com>
+JC <43374599+MrSMlT@users.noreply.github.com>
+JFLFY2255 <JFLFY2255@163.com>
 JH23X <165871467+JH23X@users.noreply.github.com>
 Jack Mousseau <jack@software.inc>
 Jack Mousseau <jmousseau@users.noreply.github.com>
 JackJollimore <130917767+JackJollimore@users.noreply.github.com>
 Jaeden Amero <jaeden@patater.com>
 Jaemin Son <woalsdnd@gmail.com>
+Jafar Uruç <jafar.uruc@gmail.com>
 Jag Chadha <jagtesh@gmail.com>
 Jakub N <jakubniemczyk97@gmail.com>
 James A Capozzoli <157492257+jac-jim@users.noreply.github.com>
@@ -305,6 +345,7 @@ Jan Ploski <jpl@plosquare.com>
 Jannis Schönleber <joennlae@gmail.com>
 Jared Van Bortel <cebtenzzre@gmail.com>
 Jared Van Bortel <jared@nomic.ai>
+Jason C.H <ctrysbita@outlook.com>
 Jason McCartney <jmac@theroot.org>
 Jason Stillerman <jason.t.stillerman@gmail.com>
 Jean-Christophe Hoelt <hoelt@fovea.cc>
@@ -315,12 +356,14 @@ Jeffrey Morgan <jmorganca@gmail.com>
 Jeffrey Quesnelle <emozilla@nousresearch.com>
 Jeroen Mostert <jeroen.mostert@cm.com>
 Jesse Jojo Johnson <williamsaintgeorge@gmail.com>
+Jett Janiak <jettjaniak@gmail.com>
 Jeximo <jeximo@gmail.com>
 Jhen-Jie Hong <iainst0409@gmail.com>
 Jiahao Li <liplus17@163.com>
 Jian Liao <jianliao@users.noreply.github.com>
 JidongZhang-THU <1119708529@qq.com>
 Jinwoo Jeong <33892306+williamjeong2@users.noreply.github.com>
+Jinyang He <hejinyang@loongson.cn>
 Jiří Podivín <66251151+jpodivin@users.noreply.github.com>
 Jiří Sejkora <Sejseloid@gmail.com>
 Joan Fontanals <jfontanalsmartinez@gmail.com>
@@ -343,6 +386,7 @@ Josh Ramer <josh.ramer@icloud.com>
 Joyce <joycebrum@google.com>
 Juan Calderon-Perez <835733+gaby@users.noreply.github.com>
 Judd <foldl@users.noreply.github.com>
+Juk Armstrong <69222624+jukofyork@users.noreply.github.com>
 Julius Arkenberg <arki05@users.noreply.github.com>
 Jun Hee Yoo <contact.jhyoo@gmail.com>
 Jun Jie <71215065+junnjiee16@users.noreply.github.com>
@@ -357,6 +401,8 @@ Justine Tunney <jtunney@mozilla.com>
 Juuso Alasuutari <juuso.alasuutari@gmail.com>
 KASR <karim.asrih@gmail.com>
 Kamil Tomšík <info@tomsik.cz>
+Kante Yin <kerthcet@gmail.com>
+Karol Kontny <82021046+kkontny@users.noreply.github.com>
 Karsten Weiss <knweiss@gmail.com>
 Karthick <j.karthic2004@gmail.com>
 Karthik Kumar Viswanathan <195178+guilt@users.noreply.github.com>
@@ -376,6 +422,7 @@ Kolen Cheung <ickc@users.noreply.github.com>
 Konstantin Herud <konstantin.herud@denkbares.com>
 Konstantin Zhuravlyov <konstantin.zhuravlyov@amd.com>
 Kunshang Ji <kunshang.ji@intel.com>
+Kyle Bruene <KyleBruene@users.noreply.github.com>
 Kyle Liang <liangmanlai@gmail.com>
 Kyle Mistele <kyle@mistele.com>
 Kylin <56434533+KyL0N@users.noreply.github.com>
@@ -394,6 +441,8 @@ Liu Jia <jia3.liu@intel.com>
 LoganDark <github@logandark.mozmail.com>
 Loïc Carrère <loic.carrere@gmail.com>
 LostRuins <39025047+LostRuins@users.noreply.github.com>
+LostRuins Concedo <39025047+LostRuins@users.noreply.github.com>
+Lucas Moura Belo <lucas.belo@live.com>
 Luciano <lucianostrika44@gmail.com>
 Luo Tian <lt@basecity.com>
 Lyle Dean <dean@lyle.dev>
@@ -423,6 +472,7 @@ MasterYi1024 <39848311+MasterYi1024@users.noreply.github.com>
 Mateusz Charytoniuk <mateusz.charytoniuk@protonmail.com>
 Matheus C. França <matheus-catarino@hotmail.com>
 Matheus Gabriel Alves Silva <matheusgasource@gmail.com>
+Mathieu Baudier <mbaudier@argeo.org>
 Mathieu Geli <mathieu.geli@gmail.com>
 Mathieu Nayrolles <MathieuNls@users.noreply.github.com>
 Mathijs Henquet <mathijs.henquet@gmail.com>
@@ -437,6 +487,7 @@ Matthew Tejo <matthew.tejo@gmail.com>
 Matvey Soloviev <blackhole89@gmail.com>
 Max Krasnyansky <max.krasnyansky@gmail.com>
 Max Krasnyansky <quic_maxk@quicinc.com>
+Maxim Evtush <154841002+maximevtush@users.noreply.github.com>
 Maxime <672982+maximegmd@users.noreply.github.com>
 Maximilian Winter <maximilian.winter.91@gmail.com>
 Meng Zhang <meng@tabbyml.com>
@@ -444,6 +495,7 @@ Meng, Hengyu <hengyu.meng@intel.com>
 Mengqing Cao <cmq0113@163.com>
 Merrick Christensen <merrick.christensen@gmail.com>
 Michael Coppola <m18coppola@gmail.com>
+Michael Engel <mengel@redhat.com>
 Michael Francis <edude03@gmail.com>
 Michael Hueschen <m@mhueschen.dev>
 Michael Kesper <mkesper@schokokeks.org>
@@ -452,7 +504,9 @@ Michael Podvitskiy <podvitskiymichael@gmail.com>
 Michael Potter <NanoTekGuy@Gmail.com>
 Michael de Gans <michael.john.degans@gmail.com>
 Michaël de Vries <vriesdemichael@gmail.com>
+Michał Moskal <michal@moskal.me>
 Michał Tuszyński <srgtuszy@gmail.com>
+Michelle Tan <41475767+MichelleTanPY@users.noreply.github.com>
 Mihai <mihai.chirculescu@yahoo.com>
 Mike <ytianhui2004@gmail.com>
 Mikko Juola <mikjuo@gmail.com>
@@ -465,6 +519,7 @@ Miwa / Ensan <63481257+ensan-hcl@users.noreply.github.com>
 Mohammadreza Hendiani <hendiani.mohammadreza@gmail.com>
 Mohammadreza Hendiani <mohammad.r.hendiani@gmail.com>
 Molly Sophia <mollysophia379@gmail.com>
+MoonRide303 <130458190+MoonRide303@users.noreply.github.com>
 MorganRO8 <47795945+MorganRO8@users.noreply.github.com>
 Murilo Santana <mvrilo@gmail.com>
 Musab Gultekin <musabgultekin@users.noreply.github.com>
@@ -477,6 +532,7 @@ Neo Zhang <14088817+arthw@users.noreply.github.com>
 Neo Zhang <zhang.jianyu@outlook.com>
 Neo Zhang Jianyu <jianyu.zhang@intel.com>
 Neuman Vong <neuman.vong@gmail.com>
+NeverLucky <92274250+nvrxq@users.noreply.github.com>
 Nexes the Old <124105151+Nexesenex@users.noreply.github.com>
 Nexesenex <124105151+Nexesenex@users.noreply.github.com>
 Niall Coates <1349685+Niall-@users.noreply.github.com>
@@ -484,12 +540,17 @@ Nicholai Tukanov <nicholaitukanov@gmail.com>
 Nico Bosshard <nico@bosshome.ch>
 Nicolai Weitkemper <kontakt@nicolaiweitkemper.de>
 Nicolás Pérez <nicolas_perez@brown.edu>
+Nicolò Scipione <nicolo.scipione@codeplay.com>
 Nigel Bosch <pnigelb@gmail.com>
+Nikita Sarychev <42014488+sARY77@users.noreply.github.com>
 Niklas Korz <niklas@niklaskorz.de>
 NikolaiLyssogor <59844691+NikolaiLyssogor@users.noreply.github.com>
+Nikolaos Pothitos <pothitos@di.uoa.gr>
 Nikolas <127742645+nneubacher@users.noreply.github.com>
 Nindaleth <Nindaleth@users.noreply.github.com>
+Nuno <rare-magma@posteo.eu>
 OSecret <135510162+OLSecret@users.noreply.github.com>
+Oleksandr Kuvshynov <661042+okuvshynov@users.noreply.github.com>
 Oleksandr Nikitin <oleksandr@tvori.info>
 Oleksii Maryshchenko <oleksii.maryshchenko@gmail.com>
 Olivier Chafik <ochafik@users.noreply.github.com>
@@ -499,11 +560,13 @@ PAB <pierreantoine.bannier@gmail.com>
 Pablo Duboue <pablo.duboue@gmail.com>
 Pascal Patry <ppatry@mtacitlabs.com>
 Patrice Ferlet <metal3d@gmail.com>
+Patrick Peng <retr0@retr0.blog>
 Paul Tsochantaris <ptsochantaris@icloud.com>
 Pavel Zloi <github.com@drteam.rocks>
 Pavol Rusnak <pavol@rusnak.io>
 Paweł Wodnicki <151604+32bitmicro@users.noreply.github.com>
 Pedro Cuenca <pedro@huggingface.co>
+Peter <peter277@users.noreply.github.com>
 Peter Sugihara <peter@campsh.com>
 Phil H <5756783+phiharri@users.noreply.github.com>
 Philip Taron <philip.taron@gmail.com>
@@ -514,6 +577,7 @@ Pieter Ouwerkerk <pieter.ouwerkerk@gmail.com>
 Plamen Minev <pacominev@gmail.com>
 Prashant Vithule <119530321+Vithulep@users.noreply.github.com>
 Przemysław Pawełczyk <przemoc@gmail.com>
+PureJourney <edward.pong@qq.com>
 Qin Yue Chen <71813199+chenqiny@users.noreply.github.com>
 Qingyou Meng <meng.qingyou@gmail.com>
 Qu Zongfu <43257352+yancaoweidaode@users.noreply.github.com>
@@ -529,11 +593,17 @@ Rand Xie <randxiexyy29@gmail.com>
 Randall Fitzgerald <randall@dasaku.net>
 Random Fly <renfei8@live.cn>
 Reinforce-II <fate@eastal.com>
+Rémy O <remyoudompheng@gmail.com>
+Rémy Oudompheng <oudomphe@phare.normalesup.org>
 Ren Xuancheng <jklj077@users.noreply.github.com>
 Rene Leonhardt <65483435+reneleonhardt@users.noreply.github.com>
+Reza Kakhki <rezakakhki.de@gmail.com>
+Reza Rahemtola <49811529+RezaRahemtola@users.noreply.github.com>
 RhinoDevel <RhinoDevel@users.noreply.github.com>
+Riccardo Orlando <Riccorl@users.noreply.github.com>
 Riceball LEE <snowyu.lee@gmail.com>
 Rich Dougherty <rich@rd.nz>
+Richard <r-burton@hotmail.co.uk>
 Richard Kiss <him@richardkiss.com>
 Richard Roberson <richardr1126@gmail.com>
 Rick G <26732651+TheFlipbook@users.noreply.github.com>
@@ -544,10 +614,13 @@ Riley Stewart <ristew@users.noreply.github.com>
 Rinne <AsakusaRinne@gmail.com>
 Rinne <liu_yaohui1998@126.com>
 Robert Brisita <986796+rbrisita@users.noreply.github.com>
+Robert Collins <roberto.tomas.cuentas@gmail.com>
+Robert Ormandi <52251610+ormandi@users.noreply.github.com>
 Robert Sung-wook Shin <edp1096@users.noreply.github.com>
 Robey Holderith <robey@flaminglunchbox.net>
 Robyn <robyngraf@users.noreply.github.com>
 Roger Meier <r.meier@siemens.com>
+Rohanjames1997 <rohan.james4@gmail.com>
 Roland <14355895+rbur0425@users.noreply.github.com>
 Romain Biessy <romain.biessy@codeplay.com>
 Romain D <90720+Artefact2@users.noreply.github.com>
@@ -559,7 +632,9 @@ Roni <sulpher@gmx.net>
 Ronny Brendel <ronnybrendel@gmail.com>
 Ronsor <ronsor@ronsor.pw>
 Rowan Hart <rowanbhart@gmail.com>
+Ruan <47767371+ruanych@users.noreply.github.com>
 Ruchira Hasaranga <ruchira66@gmail.com>
+Rudi Servo <rudiservo@gmail.com>
 Ruixin Huang <18860020911@163.com>
 Rune <43761327+Rune-AI@users.noreply.github.com>
 RunningLeon <maningsheng@sensetime.com>
@@ -568,6 +643,7 @@ Ryan Landay <rlanday@gmail.com>
 Ryder Wishart <ryderwishart@gmail.com>
 Ryuei <louixs@users.noreply.github.com>
 Rőczey Barnabás <31726601+An0nie@users.noreply.github.com>
+SAMI <samuel.koesnadi@stud.uni-due.de>
 SRHMorris <69468379+SRHMorris@users.noreply.github.com>
 SXX <sxx1136965276@gmail.com>
 SakuraUmi <yukinon244@gmail.com>
@@ -592,6 +668,8 @@ Shane A <shanea@allenai.org>
 Shangning Xu <32517059+xushangning@users.noreply.github.com>
 Shankar <gshankar.87@gmail.com>
 Shanshan Shen <467638484@qq.com>
+Shelby Jenkins <47464908+ShelbyJenkins@users.noreply.github.com>
+Sheldon Robinson <sheldon.robinson@live.com>
 Shijie <821898965@qq.com>
 Shintarou Okada <kokuzen@gmail.com>
 Shouzheng Liu <61452103+lshzh-ww@users.noreply.github.com>
@@ -623,12 +701,14 @@ Steven Roussey <sroussey@gmail.com>
 Steward Garcia <57494570+FSSRepo@users.noreply.github.com>
 StrangeBytesDev <141275258+StrangeBytesDev@users.noreply.github.com>
 Suaj Carrot <72162667+SuajCarrot@users.noreply.github.com>
+Sukriti Sharma <Ssukriti@users.noreply.github.com>
 SuperUserNameMan <yoann@terminajones.com>
 Sutou Kouhei <kou@cozmixng.org>
 Tai Duc Nguyen <taiducnguyen.drexel@gmail.com>
 Taikono-Himazin <kazu@po.harenet.ne.jp>
 Tameem <113388789+AhmadTameem@users.noreply.github.com>
 Tamotsu Takahashi <ttakah+github@gmail.com>
+Tei Home <taiteitonghome@proton.me>
 Thái Hoàng Tâm <75922889+RoyalHeart@users.noreply.github.com>
 Thatcher Chamberlin <j.thatcher.c@gmail.com>
 Theia Vogel <theia@vgel.me>
@@ -640,6 +720,7 @@ Tim Miller <drasticactions@users.noreply.github.com>
 Tim Wang <overocean@gmail.com>
 Timmy Knight <r2d2fish@gmail.com>
 Timothy Cronin <40186632+4imothy@users.noreply.github.com>
+Ting Lou <louting@189.cn>
 Ting Lou <ting.lou@gmail.com>
 Ting Sun <suntcrick@gmail.com>
 Tobias Lütke <tobi@shopify.com>
@@ -661,25 +742,36 @@ Uzo Nweke <uzoechi@gmail.com>
 Vaibhav Srivastav <vaibhavs10@gmail.com>
 Val Kharitonov <mail@kharvd.com>
 Valentin Konovalov <valle.ketsujin@gmail.com>
+Valentin Mamedov <45292985+Inf1delis@users.noreply.github.com>
 Valentyn Bezshapkin <61702053+valentynbez@users.noreply.github.com>
 Vali Malinoiu <0x4139@gmail.com>
 Victor Nogueira <felladrin@gmail.com>
 Victor Z. Peng <ziliangdotme@gmail.com>
 Viet-Anh NGUYEN (Andrew) <vietanh.dev@gmail.com>
 Vinesh Janarthanan <36610342+VJHack@users.noreply.github.com>
+Vitali Lovich <vlovich+github@gmail.com>
+Vivian <vynride@gmail.com>
 Vlad <spitfireage@gmail.com>
 Vladimir <bogdad@gmail.com>
 Vladimir Malyutin <first-leon@yandex.ru>
+Vladimir Vuksanovic <109677816+vvuksanovic@users.noreply.github.com>
 Vladimir Zorin <vladimir@deviant.guru>
 VoidIsVoid <343750470@qq.com>
 Volodymyr Vitvitskyi <72226+signalpillar@users.noreply.github.com>
+Wagner Bruna <wbruna@users.noreply.github.com>
+Wang Qin <37098874+wangqin0@users.noreply.github.com>
+Wang Ran (汪然) <wangr@smail.nju.edu.cn>
 WangHaoranRobin <56047610+WangHaoranRobin@users.noreply.github.com>
 Weird Constructor <weirdconstructor@gmail.com>
+Weizhao Ouyang <o451686892@gmail.com>
 Welby Seely <welbyseely@gmail.com>
 Wentai Zhang <rchardx@gmail.com>
+Wilken Gottwalt <12194808+wgottwalt@users.noreply.github.com>
 WillCorticesAI <150854901+WillCorticesAI@users.noreply.github.com>
 William Tambellini <william.tambellini@gmail.com>
+William Tambellini <wtambellini@sdl.com>
 Willy Tarreau <w@1wt.eu>
+Woof Dog <197125663+woof-dog@users.noreply.github.com>
 Wouter <9594229+DifferentialityDevelopment@users.noreply.github.com>
 Wu Jian Ping <wujjpp@hotmail.com>
 Wu Jian Ping <wujp@greatld.com>
@@ -692,6 +784,7 @@ Xie Yanbo <xieyanbo@gmail.com>
 Xingchen Song(宋星辰) <xingchensong1996@163.com>
 Xinpeng Dou <81913537+Dou-Git@users.noreply.github.com>
 Xuan Son Nguyen <thichthat@gmail.com>
+Xuan-Son Nguyen <thichthat@gmail.com>
 Yaiko <elyaiko@hotmail.com>
 Yann Follet <131855179+YannFollet@users.noreply.github.com>
 Yaroslav <yaroslav.yashin@me.com>
@@ -702,7 +795,9 @@ Yoshi Suhara <y.suhara@gmail.com>
 Yoshi Suhara <ysuhara@nvidia.com>
 Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
 Yueh-Po Peng <94939112+y10ab1@users.noreply.github.com>
+Yüg <eugeniosegalaweb@gmail.com>
 Yui <dev@sleepyyui.com>
+Yun Dou <dixyes@gmail.com>
 Yuri Khrustalev <ykhrustalev@users.noreply.github.com>
 Yusuf Kağan Hanoğlu <hanoglu@yahoo.com>
 Yuval Peled <31162840+Yuval-Peled@users.noreply.github.com>
@@ -714,18 +809,23 @@ Zhang Peiyuan <a1286225768@gmail.com>
 Zheng.Deng <32841220+dengzheng-cloud@users.noreply.github.com>
 Zhenwei Jin <109658203+kylo5aby@users.noreply.github.com>
 Zhiyuan Li <lizhiyuan@uniartisan.com>
+Zhiyuan Li <uniartisan2017@gmail.com>
 ZhouYuChen <zhouyuchen@naver.com>
 Ziad Ben Hadj-Alouane <zied.benhadjalouane@gmail.com>
 Ziang Wu <97337387+ZiangWu-77@users.noreply.github.com>
 Zsapi <martin1.zsapka@gmail.com>
 a-n-n-a-l-e-e <150648636+a-n-n-a-l-e-e@users.noreply.github.com>
+a3sh <38979186+A3shTnT@users.noreply.github.com>
 adel boussaken <netdur@gmail.com>
 afrideva <95653597+afrideva@users.noreply.github.com>
+ag2s20150909 <19373730+ag2s20150909@users.noreply.github.com>
 agray3 <agray3@users.noreply.github.com>
 akawrykow <142945436+akawrykow@users.noreply.github.com>
+alek3y <44779186+alek3y@users.noreply.github.com>
 alexpinel <93524949+alexpinel@users.noreply.github.com>
 alonfaraj <alonfaraj@gmail.com>
 alwqx <kenan3015@gmail.com>
+amd-dwang <dong.wang@amd.com>
 amd-lalithnc <lalithnc@amd.com>
 amritahs-ibm <amritahs@linux.vnet.ibm.com>
 andrijdavid <david@geek.mg>
@@ -737,6 +837,7 @@ arch-btw <57669023+arch-btw@users.noreply.github.com>
 arcrank <arcrank@gmail.com>
 ardfork <134447697+ardfork@users.noreply.github.com>
 arlo-phoenix <140345165+arlo-phoenix@users.noreply.github.com>
+aryantandon01 <80969509+aryantandon01@users.noreply.github.com>
 at8u <129688334+at8u@users.noreply.github.com>
 automaticcat <daogiatuank54@gmail.com>
 awatuna <23447591+awatuna@users.noreply.github.com>
@@ -751,12 +852,16 @@ bryanSwk <93190252+bryanSwk@users.noreply.github.com>
 bsilvereagle <bsilvereagle@users.noreply.github.com>
 bssrdf <merlintiger@hotmail.com>
 byte-6174 <88070277+byte-6174@users.noreply.github.com>
+cduk <19917266+cduk@users.noreply.github.com>
 cebtenzzre <cebtenzzre@gmail.com>
 chaihahaha <chai836275709@gmail.com>
 chiranko <96988916+chiranko@users.noreply.github.com>
 clibdev <52199778+clibdev@users.noreply.github.com>
 clyang <clyang@clyang.net>
+cmdr2 <secondary.cmdr2@gmail.com>
+cmdr2 <shashank.shekhar.global@gmail.com>
 cocktailpeanut <121128867+cocktailpeanut@users.noreply.github.com>
+codezjx <code.zjx@gmail.com>
 coezbek <c.oezbek@gmail.com>
 comex <comexk@gmail.com>
 compilade <113953597+compilade@users.noreply.github.com>
@@ -774,20 +879,25 @@ deepdiffuser <112834445+deepdiffuser@users.noreply.github.com>
 devojony <61173062+devojony@users.noreply.github.com>
 ditsuke <ditsuke@protonmail.com>
 divinity76 <divinity76@gmail.com>
+dm4 <dm4@secondstate.io>
 dm4 <sunrisedm4@gmail.com>
 dotpy314 <33351922+dotpy314@users.noreply.github.com>
 drbh <david.richard.holtz@gmail.com>
 ds5t5 <145942675+ds5t5@users.noreply.github.com>
 dylan <canardleteer@users.noreply.github.com>
 eastriver <lee@eastriver.dev>
+ebraminio <ebrahim@gnu.org>
 ebraminio <ebraminio@gmail.com>
 eiery <19350831+eiery@users.noreply.github.com>
 eric8607242 <e0928021388@gmail.com>
 fairydreaming <166155368+fairydreaming@users.noreply.github.com>
 fengerhu1 <2748250768@qq.com>
+fj-y-saito <85871716+fj-y-saito@users.noreply.github.com>
 fraxy-v <65565042+fraxy-v@users.noreply.github.com>
+fxzjshm <11426482+fxzjshm@users.noreply.github.com>
 github-actions[bot] <github-actions[bot]@users.noreply.github.com>
 gliptic <gliptic@users.noreply.github.com>
+gn64 <yukikaze.jp@gmail.com>
 goerch <jhr.walter@t-online.de>
 grahameth <96447521+grahameth@users.noreply.github.com>
 gtygo <gtydoit@gmail.com>
@@ -809,13 +919,17 @@ hydai <z54981220@gmail.com>
 iSma <ismail.senhaji@gmail.com>
 iacore <74560659+iacore@users.noreply.github.com>
 icppWorld <124377669+icppWorld@users.noreply.github.com>
+igardev <49397134+igardev@users.noreply.github.com>
 igarnier <igarnier@protonmail.com>
 intelmatt <61025942+intelmatt@users.noreply.github.com>
 iohub <rickyang.pro@gmail.com>
+issixx <46835150+issixx@users.noreply.github.com>
 jacobi petrucciani <8117202+jpetrucciani@users.noreply.github.com>
 jaime-m-p <167997752+jaime-m-p@users.noreply.github.com>
 jameswu2014 <545426914@qq.com>
+jason_w <jason.wang@126.com>
 jdomke <28772296+jdomke@users.noreply.github.com>
+jiahao su <damow890@gmail.com>
 jiez <373447296@qq.com>
 jneem <joeneeman@gmail.com>
 joecryptotoo <80373433+joecryptotoo@users.noreply.github.com>
@@ -825,9 +939,11 @@ jon-chuang <9093549+jon-chuang@users.noreply.github.com>
 jp-x-g <jpxg-dev@protonmail.com>
 jukofyork <69222624+jukofyork@users.noreply.github.com>
 junchao-loongson <68935141+junchao-loongson@users.noreply.github.com>
+junchao-zhao <68935141+junchao-loongson@users.noreply.github.com>
 jwj7140 <32943891+jwj7140@users.noreply.github.com>
 k.h.lai <adrian.k.h.lai@outlook.com>
 kaizau <kaizau@users.noreply.github.com>
+kallewoof <kalle.alm@gmail.com>
 kalomaze <66376113+kalomaze@users.noreply.github.com>
 kang <tpdns9032100@gmail.com>
 katsu560 <118887472+katsu560@users.noreply.github.com>
@@ -835,6 +951,7 @@ kchro3 <62481661+kchro3@users.noreply.github.com>
 khimaros <me@khimaros.com>
 kiltyj <kiltyj@gmail.com>
 klosax <131523366+klosax@users.noreply.github.com>
+krystiancha <krystian@krystianch.com>
 kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com>
 kunnis <kunnis@users.noreply.github.com>
 kuronekosaiko <EvanChanJ@163.com>
@@ -847,6 +964,8 @@ ldwang <ftgreat@163.com>
 le.chang <cljs118@126.com>
 leejet <leejet714@gmail.com>
 leo-pony <nengjunma@outlook.com>
+lexasub <lexakopp2212@gmail.com>
+lhez <quic_lih@quicinc.com>
 limitedAtonement <limitedAtonement@users.noreply.github.com>
 liuwei-git <14815172+liuwei-git@users.noreply.github.com>
 lon <114724657+longregen@users.noreply.github.com>
@@ -855,19 +974,25 @@ ltoniazzi <61414566+ltoniazzi@users.noreply.github.com>
 luoyu-intel <yu.luo@intel.com>
 m3ndax <adrian.goessl@outlook.com>
 maddes8cht <55592906+maddes8cht@users.noreply.github.com>
+magicse <magicse@users.noreply.github.com>
+mahorozte <41834471+mahorozte@users.noreply.github.com>
 makomk <makosoft@googlemail.com>
 manikbhandari <mbbhandarimanik2@gmail.com>
 maor-ps <154728172+maor-ps@users.noreply.github.com>
+mashdragon <122402293+mashdragon@users.noreply.github.com>
 matiaslin <45382001+matiaslin@users.noreply.github.com>
+matt23654 <matthew.webber@protonmail.com>
 matteo <matteogeniaccio@yahoo.it>
 mdrokz <mohammadmunshi@gmail.com>
 mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com>
+midnight <midnightmagic@users.noreply.github.com>
 minarchist <minarchist@users.noreply.github.com>
 mj-shifu <77107165+mj-shifu@users.noreply.github.com>
 mmyjona <jonathan.gonse@gmail.com>
 momonga <115213907+mmnga@users.noreply.github.com>
 momonga <146910567+mmngays@users.noreply.github.com>
 moritzbrantner <31051084+moritzbrantner@users.noreply.github.com>
+musoles <135031143+musoles@users.noreply.github.com>
 mzcu <milos.cubrilo@gmail.com>
 nanahi <130121847+na-na-hi@users.noreply.github.com>
 ngc92 <7938269+ngc92@users.noreply.github.com>
@@ -884,18 +1009,23 @@ omahs <73983677+omahs@users.noreply.github.com>
 oobabooga <112222186+oobabooga@users.noreply.github.com>
 opparco <parco.opaai@gmail.com>
 ostix360 <55257054+ostix360@users.noreply.github.com>
+pascal-lc <49066376+pascal-lc@users.noreply.github.com>
 pculliton <phillipculliton@gmail.com>
+peidaqi <peidaqi@gmail.com>
 pengxin99 <pengxin.yuan@intel.com>
 perserk <perserk@gmail.com>
+petterreinholdtsen <pere-github@hungry.com>
 piDack <104877312+piDack@users.noreply.github.com>
 pmysl <piotr.myslinski@outlook.com>
 postmasters <namnguyen@google.com>
 pudepiedj <pudepiedj@gmail.com>
 qingfengfenga <41416092+qingfengfenga@users.noreply.github.com>
+qingy1337 <qxli2@students.everettcc.edu>
 qouoq <qouoq@fastmail.com>
 qunash <anzoria@gmail.com>
 rabidcopy <rabidcopy@yahoo.com>
 rankaiyx <rankaiyx@rankaiyx.com>
+redbeard <bharrington@alticon.net>
 rhjdvsgsgks <26178113+rhjdvsgsgks@users.noreply.github.com>
 rhuddleston <ryan.huddleston@percona.com>
 rimoliga <53384203+rimoliga@users.noreply.github.com>
@@ -906,12 +1036,14 @@ semidark <me@semidark.net>
 serhii-nakon <57632032+serhii-nakon@users.noreply.github.com>
 sharpHL <132747147+sharpHL@users.noreply.github.com>
 shibe2 <shibe@tuta.io>
+simon886212 <37953122+simon886212@users.noreply.github.com>
 singularity <12184989+singularity-s0@users.noreply.github.com>
 sjinzh <sjinzh@gmail.com>
 sjxx <63994076+ylsdamxssjxxdd@users.noreply.github.com>
 slaren <2141330+slaren@users.noreply.github.com>
 slaren <slarengh@gmail.com>
 snadampal <87143774+snadampal@users.noreply.github.com>
+someone13574 <81528246+someone13574@users.noreply.github.com>
 standby24x7 <standby24x7@gmail.com>
 staviq <staviq@gmail.com>
 stduhpf <stephduh@live.fr>
@@ -922,19 +1054,23 @@ tarcey <cey.tarik@gmail.com>
 tc-mb <157115220+tc-mb@users.noreply.github.com>
 texmex76 <40733439+texmex76@users.noreply.github.com>
 thement <40525767+thement@users.noreply.github.com>
+theraininsky <76763719+theraininsky@users.noreply.github.com>
 thewh1teagle <61390950+thewh1teagle@users.noreply.github.com>
 tjohnman <tjohnman@users.noreply.github.com>
 toyer <2042519524@qq.com>
 tslmy <tslmy@users.noreply.github.com>
+tv1wnd <55383215+tv1wnd@users.noreply.github.com>
 ubik2 <ubik2@users.noreply.github.com>
 uint256_t <konndennsa@gmail.com>
 uint256_t <maekawatoshiki1017@gmail.com>
 unbounded <haakon@likedan.net>
 uvos <devnull@uvos.xyz>
+uvos <philipp@uvos.xyz>
 valiray <133289098+valiray@users.noreply.github.com>
 vb <vaibhavs10@gmail.com>
 vik <vikhyatk@gmail.com>
 viric <viric@viric.name>
+vmobilis <75476228+vmobilis@users.noreply.github.com>
 vodkaslime <646329483@qq.com>
 vvhg1 <94630311+vvhg1@users.noreply.github.com>
 vxiiduu <73044267+vxiiduu@users.noreply.github.com>
@@ -949,8 +1085,11 @@ wzy <32936898+Freed-Wu@users.noreply.github.com>
 xaedes <xaedes@gmail.com>
 xaedes <xaedes@googlemail.com>
 xctan <axunlei@gmail.com>
+xiaobing318 <71554036+xiaobing318@users.noreply.github.com>
+xiaofei <hbuxiaofei@gmail.com>
 xloem <0xloem@gmail.com>
 yangli2 <yangli2@gmail.com>
+ymcki <84055651+ymcki@users.noreply.github.com>
 yuiseki <yuiseki@gmail.com>
 yuri@FreeBSD <yurivict@users.noreply.github.com>
 zakkor <edward.partenie@gmail.com>
@@ -963,4 +1102,5 @@ zrm <trustiosity.zrm@gmail.com>
 杨朱 · Kiki <baofa.fan@daocloud.io>
 源文雨 <41315874+fumiama@users.noreply.github.com>
 蕭澧邦 <45505768+shou692199@users.noreply.github.com>
+谢乃闻 <sienaiwun@users.noreply.github.com>
 Нияз Гарифзянов <112617865+garrnizon@users.noreply.github.com>
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -16,6 +16,7 @@ endif()
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")

 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)

 if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
    set(LLAMA_STANDALONE ON)
@@ -49,6 +50,8 @@ endif()
 if (MSVC)
    add_compile_options("$<$<COMPILE_LANGUAGE:C>:/utf-8>")
    add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/utf-8>")
+    add_compile_options("$<$<COMPILE_LANGUAGE:C>:/bigobj>")
+    add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/bigobj>")
 endif()

 #
@@ -77,17 +80,15 @@ option(LLAMA_BUILD_SERVER   "llama: build server example" ${LLAMA_STANDALONE})

 # 3rd party libs
 option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
+option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF)

 # Required for relocatable CMake package
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake)

 # override ggml options
-set(GGML_SANITIZE_THREAD    ${LLAMA_SANITIZE_THREAD})
-set(GGML_SANITIZE_ADDRESS   ${LLAMA_SANITIZE_ADDRESS})
-set(GGML_SANITIZE_UNDEFINED ${LLAMA_SANITIZE_UNDEFINED})
-set(GGML_ALL_WARNINGS       ${LLAMA_ALL_WARNINGS})
-set(GGML_FATAL_WARNINGS     ${LLAMA_FATAL_WARNINGS})
+set(GGML_ALL_WARNINGS   ${LLAMA_ALL_WARNINGS})
+set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})

 # change the default for these ggml options
 if (NOT DEFINED GGML_LLAMAFILE)
@@ -117,16 +118,62 @@ llama_option_depr(WARNING     LLAMA_SYCL                GGML_SYCL)
 llama_option_depr(WARNING     LLAMA_SYCL_F16            GGML_SYCL_F16)
 llama_option_depr(WARNING     LLAMA_CANN                GGML_CANN)

+if (NOT MSVC)
+    if (LLAMA_SANITIZE_THREAD)
+        message(STATUS "Using -fsanitize=thread")
+
+        add_compile_options(-fsanitize=thread)
+        link_libraries     (-fsanitize=thread)
+    endif()
+
+    if (LLAMA_SANITIZE_ADDRESS)
+        message(STATUS "Using -fsanitize=address")
+
+        add_compile_options(-fsanitize=address -fno-omit-frame-pointer)
+        link_libraries     (-fsanitize=address)
+    endif()
+
+    if (LLAMA_SANITIZE_UNDEFINED)
+        message(STATUS "Using -fsanitize=undefined")
+
+        add_compile_options(-fsanitize=undefined)
+        link_libraries     (-fsanitize=undefined)
+    endif()
+endif()
+
 #
-# build the library
+# 3rd-party
 #

 if (NOT TARGET ggml)
    add_subdirectory(ggml)
    # ... otherwise assume ggml is added by a parent CMakeLists.txt
 endif()
+
+#
+# build the library
+#
+
 add_subdirectory(src)

+#
+# utils, programs, examples and tests
+#
+
+if (LLAMA_BUILD_COMMON)
+    add_subdirectory(common)
+endif()
+
+if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
+    include(CTest)
+    add_subdirectory(tests)
+endif()
+
+if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES)
+    add_subdirectory(examples)
+    add_subdirectory(pocs)
+endif()
+
 #
 # install
 #
@@ -142,27 +189,14 @@ set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location o
 set(LLAMA_LIB_INSTALL_DIR     ${CMAKE_INSTALL_LIBDIR}     CACHE PATH "Location of library files")
 set(LLAMA_BIN_INSTALL_DIR     ${CMAKE_INSTALL_BINDIR}     CACHE PATH "Location of binary  files")

-# At the moment some compile definitions are placed within the ggml/src
-# directory but not exported on the `ggml` target. This could be improved by
-# determining _precisely_ which defines are necessary for the llama-config
-# package.
-#
-set(GGML_TRANSIENT_DEFINES)
-get_target_property(GGML_DIRECTORY ggml SOURCE_DIR)
-get_directory_property(GGML_DIR_DEFINES DIRECTORY ${GGML_DIRECTORY} COMPILE_DEFINITIONS)
-if (GGML_DIR_DEFINES)
-    list(APPEND GGML_TRANSIENT_DEFINES ${GGML_DIR_DEFINES})
-endif()
-get_target_property(GGML_TARGET_DEFINES ggml COMPILE_DEFINITIONS)
-if (GGML_TARGET_DEFINES)
-    list(APPEND GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES})
-endif()
-get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)
-# all public headers
 set(LLAMA_PUBLIC_HEADERS
    ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h
    ${CMAKE_CURRENT_SOURCE_DIR}/include/llama-cpp.h)
-set_target_properties(llama PROPERTIES PUBLIC_HEADER "${LLAMA_PUBLIC_HEADERS}")
+
+set_target_properties(llama
+    PROPERTIES
+        PUBLIC_HEADER "${LLAMA_PUBLIC_HEADERS}")
+
 install(TARGETS llama LIBRARY PUBLIC_HEADER)

 configure_package_config_file(
@@ -199,22 +233,4 @@ configure_file(cmake/llama.pc.in
        @ONLY)

 install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
-        DESTINATION lib/pkgconfig)
-
-#
-# utils, programs, examples and tests
-#
-
-if (LLAMA_BUILD_COMMON)
-    add_subdirectory(common)
-endif()
-
-if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
-    include(CTest)
-    add_subdirectory(tests)
-endif()
-
-if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES)
-    add_subdirectory(examples)
-    add_subdirectory(pocs)
-endif()
+        DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,10 +1,12 @@
 # Pull requests (for contributors)

+- llama.cpp uses the ggml tensor library for model evaluation. If you are unfamiliar with ggml, consider taking a look at the [examples in the ggml repository](https://github.com/ggml-org/ggml/tree/master/examples/). [simple](https://github.com/ggml-org/ggml/tree/master/examples/simple) shows the bare minimum for using ggml. [gpt-2](https://github.com/ggml-org/ggml/tree/master/examples/gpt-2) has minimal implementations for language model inference using GPT-2. [mnist](https://github.com/ggml-org/ggml/tree/master/examples/mnist) demonstrates how to train and evaluate a simple image classifier
 - Test your changes:
    - Execute [the full CI locally on your machine](ci/README.md) before publishing
    - Verify that the perplexity and the performance are not affected negatively by your changes (use `llama-perplexity` and `llama-bench`)
    - If you modified the `ggml` source, run the `test-backend-ops` tool to check whether different backend implementations of the `ggml` operators produce consistent results (this requires access to at least two different `ggml` backends)
    - If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops`
+- Create separate PRs for each feature or fix. Avoid combining unrelated changes in a single PR
 - Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
 - If your PR becomes stale, don't hesitate to ping the maintainers in the comments

@@ -12,7 +14,7 @@

 - Squash-merge PRs
 - Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
- Optionally pick a `<module>` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules
+- Optionally pick a `<module>` from here: https://github.com/ggml-org/llama.cpp/wiki/Modules
 - Consider adding yourself to [CODEOWNERS](CODEOWNERS)

 # Coding guidelines
@@ -37,17 +39,17 @@

    _(NOTE: this guideline is yet to be applied to the `llama.cpp` codebase. New code should follow this guideline.)_

- Try to follow the existing patterns in the code (indentation, spaces, etc.). In case of doubt use `clang-format` to format the added code
+- Try to follow the existing patterns in the code (indentation, spaces, etc.). In case of doubt use `clang-format` (from clang-tools v15+) to format the added code
 - For anything not covered in the current guidelines, refer to the [C++ Core Guidelines](https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines)
 - Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$
+- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggml-org/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$

 ![matmul](media/matmul.png)

 # Naming guidelines

 - Use `snake_case` for function, variable and type names
- Naming usually optimizes for longest common prefix (see https://github.com/ggerganov/ggml/pull/302#discussion_r1243240963)
+- Naming usually optimizes for longest common prefix (see https://github.com/ggml-org/ggml/pull/302#discussion_r1243240963)

    ```cpp
    // not OK
@@ -122,4 +124,4 @@

 The Github issues, PRs and discussions contain a lot of information that can be useful to get familiar with the codebase. For convenience, some of the more important information is referenced from Github projects:

-https://github.com/ggerganov/llama.cpp/projects
+https://github.com/ggml-org/llama.cpp/projects
--- a/37
+++ b/37
@@ -1,5 +1,5 @@
 ifndef LLAMA_MAKEFILE
-$(error The Makefile build is deprecated. Use the CMake build instead. For more details, see https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
+$(error The Makefile build is deprecated. Use the CMake build instead. For more details, see https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md)
 endif

 # Define the default target now so that it is always the first target
@@ -52,6 +52,7 @@ TEST_TARGETS = \
 	tests/test-arg-parser \
 	tests/test-autorelease \
 	tests/test-backend-ops \
+	tests/test-chat \
 	tests/test-chat-template \
 	tests/test-double-float \
 	tests/test-grammar-integration \
@@ -462,7 +463,7 @@ endif
 ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))'
 	# The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves.
 	# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
-	# https://github.com/ggerganov/llama.cpp/issues/2922
+	# https://github.com/ggml-org/llama.cpp/issues/2922
 	MK_CFLAGS   += -Xassembler -muse-unaligned-vector-move
 	MK_CXXFLAGS += -Xassembler -muse-unaligned-vector-move

@@ -595,7 +596,7 @@ ifdef GGML_RPC
 	OBJ_GGML_EXT += ggml/src/ggml-rpc.o
 endif # GGML_RPC

-OBJ_CUDA_TMPL      = $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-wmma*.cu))
+OBJ_CUDA_TMPL      = $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-mma*.cu))
 OBJ_CUDA_TMPL     += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/mmq*.cu))

 ifdef GGML_CUDA_FA_ALL_QUANTS
@@ -679,6 +680,10 @@ ifdef GGML_CUDA_CCBIN
 	MK_NVCCFLAGS += -ccbin $(GGML_CUDA_CCBIN)
 endif # GGML_CUDA_CCBIN

+ifdef GGML_CUDA_NO_FA
+	MK_NVCCFLAGS += -DGGML_CUDA_NO_FA
+endif # GGML_CUDA_NO_FA
+
 ifdef GGML_CUDA_FA_ALL_QUANTS
 	MK_NVCCFLAGS += -DGGML_CUDA_FA_ALL_QUANTS
 endif # GGML_CUDA_FA_ALL_QUANTS
@@ -799,6 +804,10 @@ ifdef GGML_CUDA_NO_PEER_COPY
 	HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY
 endif # GGML_CUDA_NO_PEER_COPY

+ifdef GGML_CUDA_NO_FA
+	HIPFLAGS += -DGGML_CUDA_NO_FA
+endif # GGML_CUDA_NO_FA
+
 	OBJ_GGML_EXT += ggml/src/ggml-cuda/ggml-cuda.o
 	OBJ_GGML_EXT += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
 	OBJ_GGML_EXT += $(OBJ_CUDA_TMPL)
@@ -827,7 +836,7 @@ ifdef GGML_MUSA
 	else
 		MUSA_PATH ?= /opt/musa
 	endif
-	MUSA_ARCHITECTURES ?= 21;22
+	MUSA_ARCHITECTURES ?= 21;22;31

 	MK_CPPFLAGS += -DGGML_USE_MUSA -DGGML_USE_CUDA
 	MK_LDFLAGS += -L$(MUSA_PATH)/lib -Wl,-rpath=$(MUSA_PATH)/lib
@@ -846,7 +855,7 @@ ifdef GGML_MUSA
 	CXX := $(MUSA_PATH)/bin/clang++
 	MCC := $(CCACHE) $(MUSA_PATH)/bin/mcc

-	MUSAFLAGS  = -x musa -mtgpu
+	MUSAFLAGS  = -fsigned-char -x musa -mtgpu
 	MUSAFLAGS += $(foreach arch,$(subst ;, ,$(MUSA_ARCHITECTURES)),--cuda-gpu-arch=mp_$(arch))

 ifdef GGML_CUDA_FORCE_MMQ
@@ -875,6 +884,10 @@ ifdef GGML_CUDA_NO_PEER_COPY
 	MUSAFLAGS += -DGGML_CUDA_NO_PEER_COPY
 endif # GGML_CUDA_NO_PEER_COPY

+ifdef GGML_CUDA_NO_FA
+	MUSAFLAGS += -DGGML_CUDA_NO_FA
+endif # GGML_CUDA_NO_FA
+
 ifdef GGML_CUDA_FA_ALL_QUANTS
 	MUSAFLAGS += -DGGML_CUDA_FA_ALL_QUANTS
 endif # GGML_CUDA_FA_ALL_QUANTS
@@ -983,6 +996,7 @@ OBJ_COMMON = \
 	$(DIR_COMMON)/ngram-cache.o \
 	$(DIR_COMMON)/sampling.o \
 	$(DIR_COMMON)/speculative.o \
+	$(DIR_COMMON)/chat.o \
 	$(DIR_COMMON)/build-info.o \
 	$(DIR_COMMON)/json-schema-to-grammar.o

@@ -1076,8 +1090,8 @@ endif
 ifdef REMOVE_WARNING
 $(info !!! REMOVAL WARNING !!!)
 $(info The following LLAMA_ options have been removed and are no longer supported)
-$(info   - LLAMA_DISABLE_LOGS   (https://github.com/ggerganov/llama.cpp/pull/9418))
-$(info   - LLAMA_SERVER_VERBOSE (https://github.com/ggerganov/llama.cpp/pull/9418))
+$(info   - LLAMA_DISABLE_LOGS   (https://github.com/ggml-org/llama.cpp/pull/9418))
+$(info   - LLAMA_SERVER_VERBOSE (https://github.com/ggml-org/llama.cpp/pull/9418))
 $(info )
 endif

@@ -1361,7 +1375,11 @@ llama-server: \
 	examples/server/httplib.h \
 	examples/server/index.html.hpp \
 	examples/server/loading.html.hpp \
+	common/chat.cpp \
+	common/chat.h \
+	common/chat-template.hpp \
 	common/json.hpp \
+	common/minja.hpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
@@ -1469,6 +1487,11 @@ tests/test-json-schema-to-grammar: tests/test-json-schema-to-grammar.cpp \
 	$(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+tests/test-chat: tests/test-chat.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 tests/test-opt: tests/test-opt.cpp \
 	$(OBJ_GGML)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
--- a/Package.swift
+++ b/Package.swift
@@ -1,19 +0,0 @@
-// swift-tools-version:5.5
-
-import PackageDescription
-
-let package = Package(
-    name: "llama",
-    platforms: [
-        .macOS(.v12),
-        .iOS(.v14),
-        .watchOS(.v4),
-        .tvOS(.v14)
-    ],
-    products: [
-        .library(name: "llama", targets: ["llama"]),
-    ],
-    targets: [
-        .systemLibrary(name: "llama", pkgConfig: "llama"),
-    ]
-)
--- a/README.md
+++ b/README.md
@@ -3,22 +3,33 @@
 ![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)

 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
-[![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
+[![Server](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml)

-[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
+[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggml-org/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml)

 Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++

+> [!IMPORTANT]
+> New `llama.cpp` package location: [ggml-org/llama.cpp](https://github.com/ggml-org/llama.cpp/pkgs/container/llama.cpp)
+>
+> Update your container URLs to: `ghcr.io/ggml-org/llama.cpp`
+>
+> More info: https://github.com/ggml-org/llama.cpp/discussions/11801
+
 ## Recent API changes

- [Changelog for `libllama` API](https://github.com/ggerganov/llama.cpp/issues/9289)
- [Changelog for `llama-server` REST API](https://github.com/ggerganov/llama.cpp/issues/9291)
+- [Changelog for `libllama` API](https://github.com/ggml-org/llama.cpp/issues/9289)
+- [Changelog for `llama-server` REST API](https://github.com/ggml-org/llama.cpp/issues/9291)

 ## Hot topics

- **Introducing GGUF-my-LoRA** https://github.com/ggerganov/llama.cpp/discussions/10123
- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669
- Hugging Face GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
+- **How to use [MTLResidencySet](https://developer.apple.com/documentation/metal/mtlresidencyset?language=objc) to keep the GPU memory active?** https://github.com/ggml-org/llama.cpp/pull/11427
+- **VS Code extension for FIM completions:** https://github.com/ggml-org/llama.vscode
+- Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639
+- Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
+- Introducing GGUF-my-LoRA https://github.com/ggml-org/llama.cpp/discussions/10123
+- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggml-org/llama.cpp/discussions/9669
+- Hugging Face GGUF editor: [discussion](https://github.com/ggml-org/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)

 ----

@@ -35,7 +46,7 @@ range of hardware - locally and in the cloud.
 - Vulkan and SYCL backend support
 - CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity

-The `llama.cpp` project is the main playground for developing new features for the [ggml](https://github.com/ggerganov/ggml) library.
+The `llama.cpp` project is the main playground for developing new features for the [ggml](https://github.com/ggml-org/ggml) library.

 <details>
 <summary>Models</summary>
@@ -55,23 +66,23 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon)
 - [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
 - [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
- [X] [BERT](https://github.com/ggerganov/llama.cpp/pull/5423)
+- [X] [BERT](https://github.com/ggml-org/llama.cpp/pull/5423)
 - [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
 - [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft)
 - [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila)
- [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187)
+- [X] [Starcoder models](https://github.com/ggml-org/llama.cpp/pull/3187)
 - [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim)
- [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417)
- [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553)
+- [X] [MPT](https://github.com/ggml-org/llama.cpp/pull/3417)
+- [X] [Bloom](https://github.com/ggml-org/llama.cpp/pull/3553)
 - [x] [Yi models](https://huggingface.co/models?search=01-ai/Yi)
 - [X] [StableLM models](https://huggingface.co/stabilityai)
 - [x] [Deepseek models](https://huggingface.co/models?search=deepseek-ai/deepseek)
 - [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen)
- [x] [PLaMo-13B](https://github.com/ggerganov/llama.cpp/pull/3557)
+- [x] [PLaMo-13B](https://github.com/ggml-org/llama.cpp/pull/3557)
 - [x] [Phi models](https://huggingface.co/models?search=microsoft/phi)
- [x] [PhiMoE](https://github.com/ggerganov/llama.cpp/pull/11003)
+- [x] [PhiMoE](https://github.com/ggml-org/llama.cpp/pull/11003)
 - [x] [GPT-2](https://huggingface.co/gpt2)
- [x] [Orion 14B](https://github.com/ggerganov/llama.cpp/pull/5118)
+- [x] [Orion 14B](https://github.com/ggml-org/llama.cpp/pull/5118)
 - [x] [InternLM2](https://huggingface.co/models?search=internlm2)
 - [x] [CodeShell](https://github.com/WisdomShell/codeshell)
 - [x] [Gemma](https://ai.google.dev/gemma)
@@ -92,7 +103,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [x] [Bitnet b1.58 models](https://huggingface.co/1bitLLM)
 - [x] [Flan T5](https://huggingface.co/models?search=flan-t5)
 - [x] [Open Elm models](https://huggingface.co/collections/apple/openelm-instruct-models-6619ad295d7ae9f868b759ca)
- [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b)
+- [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b) + [GLMEdge-1.5b](https://huggingface.co/THUDM/glm-edge-1.5b-chat) + [GLMEdge-4b](https://huggingface.co/THUDM/glm-edge-4b-chat)
 - [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)
 - [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct)
 - [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a)
@@ -113,6 +124,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
 - [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
 - [x] [Bunny](https://github.com/BAAI-DCAI/Bunny)
+- [x] [GLM-EDGE](https://huggingface.co/models?search=glm-edge)
 - [x] [Qwen2-VL](https://huggingface.co/collections/Qwen/qwen2-vl-66cee7455501d7126940800d)

 </details>
@@ -131,6 +143,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - Rust (more features): [edgenai/llama_cpp-rs](https://github.com/edgenai/llama_cpp-rs)
 - Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
 - Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs)
+- Rust (automated build from crates.io): [ShelbyJenkins/llm_client](https://github.com/ShelbyJenkins/llm_client)
 - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
 - C#/VB.NET (more features - community license): [LM-Kit.NET](https://docs.lm-kit.com/lm-kit-net/index.html)
 - Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
@@ -140,10 +153,11 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
 - Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
 - Flutter: [xuegao-tzx/Fllama](https://github.com/xuegao-tzx/Fllama)
- PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326)
+- PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggml-org/llama.cpp/pull/6326)
 - Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp)
 - Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift)
 - Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama)
+- Delphi [Embarcadero/llama-cpp-delphi](https://github.com/Embarcadero/llama-cpp-delphi)

 </details>

@@ -158,6 +172,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
 - [iohub/collama](https://github.com/iohub/coLLaMA) (Apache-2.0)
 - [janhq/jan](https://github.com/janhq/jan) (AGPL)
+- [johnbean393/Sidekick](https://github.com/johnbean393/Sidekick) (MIT)
 - [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file) (Apache-2.0)
 - [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
 - [llama.vim](https://github.com/ggml-org/llama.vim) (MIT)
@@ -183,6 +198,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [ramalama](https://github.com/containers/ramalama) (MIT)
 - [semperai/amica](https://github.com/semperai/amica) (MIT)
 - [withcatai/catai](https://github.com/withcatai/catai) (MIT)
+- [Autopen](https://github.com/blackhole89/autopen) (GPL)

 </details>

@@ -204,7 +220,8 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs
 - [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly
 - [llama-swap](https://github.com/mostlygeek/llama-swap) - transparent proxy that adds automatic model switching with llama-server
-
+- [Kalavai](https://github.com/kalavai-net/kalavai-client) - Crowdsource end to end LLM deployment at any scale
+- [llmaz](https://github.com/InftyAI/llmaz) - ☸️ Easy, advanced inference platform for large language models on Kubernetes.
 </details>

 <details>
@@ -227,6 +244,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [HIP](docs/build.md#hip) | AMD GPU |
 | [Vulkan](docs/build.md#vulkan) | GPU |
 | [CANN](docs/build.md#cann) | Ascend NPU |
+| [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |

 ## Building the project

@@ -236,7 +254,7 @@ The project also includes many example programs and tools using the `llama` libr
 - Clone this repository and build locally, see [how to build](docs/build.md)
 - On MacOS or Linux, install `llama.cpp` via [brew, flox or nix](docs/install.md)
 - Use a Docker image, see [documentation for Docker](docs/docker.md)
- Download pre-built binaries from [releases](https://github.com/ggerganov/llama.cpp/releases)
+- Download pre-built binaries from [releases](https://github.com/ggml-org/llama.cpp/releases)

 ## Obtaining and quantizing models

@@ -249,14 +267,14 @@ You can either manually download the GGUF file or directly use any `llama.cpp`-c

 After downloading a model, use the CLI tools to run it locally - see below.

-`llama.cpp` requires the model to be stored in the [GGUF](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) file format. Models in other data formats can be converted to GGUF using the `convert_*.py` Python scripts in this repo.
+`llama.cpp` requires the model to be stored in the [GGUF](https://github.com/ggml-org/ggml/blob/master/docs/gguf.md) file format. Models in other data formats can be converted to GGUF using the `convert_*.py` Python scripts in this repo.

 The Hugging Face platform provides a variety of online tools for converting, quantizing and hosting models with `llama.cpp`:

 - Use the [GGUF-my-repo space](https://huggingface.co/spaces/ggml-org/gguf-my-repo) to convert to GGUF format and quantize model weights to smaller sizes
- Use the [GGUF-my-LoRA space](https://huggingface.co/spaces/ggml-org/gguf-my-lora) to convert LoRA adapters to GGUF format (more info: https://github.com/ggerganov/llama.cpp/discussions/10123)
- Use the [GGUF-editor space](https://huggingface.co/spaces/CISCai/gguf-editor) to edit GGUF meta data in the browser (more info: https://github.com/ggerganov/llama.cpp/discussions/9268)
- Use the [Inference Endpoints](https://ui.endpoints.huggingface.co/) to directly host `llama.cpp` in the cloud (more info: https://github.com/ggerganov/llama.cpp/discussions/9669)
+- Use the [GGUF-my-LoRA space](https://huggingface.co/spaces/ggml-org/gguf-my-lora) to convert LoRA adapters to GGUF format (more info: https://github.com/ggml-org/llama.cpp/discussions/10123)
+- Use the [GGUF-editor space](https://huggingface.co/spaces/CISCai/gguf-editor) to edit GGUF meta data in the browser (more info: https://github.com/ggml-org/llama.cpp/discussions/9268)
+- Use the [Inference Endpoints](https://ui.endpoints.huggingface.co/) to directly host `llama.cpp` in the cloud (more info: https://github.com/ggml-org/llama.cpp/discussions/9669)

 To learn more about model quantization, [read this documentation](examples/quantize/README.md)

@@ -418,7 +436,7 @@ To learn more about model quantization, [read this documentation](examples/quant

    </details>

-[^1]: [examples/perplexity/README.md](examples/perplexity/README.md)
+[^1]: [examples/perplexity/README.md](./examples/perplexity/README.md)
 [^2]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity)

 ## [`llama-bench`](examples/llama-bench)
@@ -479,9 +497,9 @@ To learn more about model quantization, [read this documentation](examples/quant
 - Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch
 - Collaborators will be invited based on contributions
 - Any help with managing issues, PRs and projects is very appreciated!
- See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
+- See [good first issues](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
 - Read the [CONTRIBUTING.md](CONTRIBUTING.md) for more information
- Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205)
+- Make sure to read this: [Inference at the edge](https://github.com/ggml-org/llama.cpp/discussions/205)
 - A bit of backstory for those who are interested: [Changelog podcast](https://changelog.com/podcast/532)

 ## Other documentation
@@ -496,7 +514,7 @@ To learn more about model quantization, [read this documentation](examples/quant
 - [Running on Docker](docs/docker.md)
 - [Build on Android](docs/android.md)
 - [Performance troubleshooting](docs/development/token_generation_performance_tips.md)
- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)
+- [GGML tips & tricks](https://github.com/ggml-org/llama.cpp/wiki/GGML-Tips-&-Tricks)

 #### Seminal papers and background on the models

@@ -510,5 +528,18 @@ If your issue is with model generation quality, then please at least scan the fo
    - [Aligning language models to follow instructions](https://openai.com/research/instruction-following)
    - [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155)

-#### References
+## Completions
+Command-line completion is available for some environments.

+#### Bash Completion
+```bash
+$ build/bin/llama-cli --completion-bash > ~/.llama-completion.bash
+$ source ~/.llama-completion.bash
+```
+Optionally this can be added to your `.bashrc` or `.bash_profile` to load it
+automatically. For example:
+```console
+$ echo "source ~/.llama-completion.bash" >> ~/.bashrc
+```
+
+## References
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -62,6 +62,6 @@ Beware that none of the topics under [Using llama.cpp securely](#using-llamacpp-
 <!-- normal version -->
 However, If you have discovered a security vulnerability in this project, please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released.

-Please disclose it as a private [security advisory](https://github.com/ggerganov/llama.cpp/security/advisories/new).
+Please disclose it as a private [security advisory](https://github.com/ggml-org/llama.cpp/security/advisories/new).

 A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure.
--- a/Sources/llama/llama.h
+++ b/Sources/llama/llama.h
@@ -1,4 +0,0 @@
-#pragma once
-
-#include <llama.h>
-
--- a/Sources/llama/module.modulemap
+++ b/Sources/llama/module.modulemap
@@ -1,5 +0,0 @@
-module llama [system] {
-    header "llama.h"
-    link "llama"
-    export *
-}
--- a/build-xcframework.sh
+++ b/build-xcframework.sh
@@ -0,0 +1,519 @@
+#!/bin/bash
+#
+# Options
+IOS_MIN_OS_VERSION=16.4
+MACOS_MIN_OS_VERSION=13.3
+VISIONOS_MIN_OS_VERSION=1.0
+TVOS_MIN_OS_VERSION=16.4
+
+BUILD_SHARED_LIBS=OFF
+LLAMA_BUILD_EXAMPLES=OFF
+LLAMA_BUILD_TESTS=OFF
+LLAMA_BUILD_SERVER=OFF
+GGML_METAL=ON
+GGML_METAL_EMBED_LIBRARY=ON
+GGML_BLAS_DEFAULT=ON
+GGML_METAL_USE_BF16=ON
+GGML_OPENMP=OFF
+
+COMMON_C_FLAGS="-Wno-macro-redefined -Wno-shorten-64-to-32 -Wno-unused-command-line-argument -g"
+COMMON_CXX_FLAGS="-Wno-macro-redefined -Wno-shorten-64-to-32 -Wno-unused-command-line-argument -g"
+
+# Common options for all builds
+COMMON_CMAKE_ARGS=(
+    -DCMAKE_XCODE_ATTRIBUTE_CODE_SIGNING_REQUIRED=NO
+    -DCMAKE_XCODE_ATTRIBUTE_CODE_SIGN_IDENTITY=""
+    -DCMAKE_XCODE_ATTRIBUTE_CODE_SIGNING_ALLOWED=NO
+    -DCMAKE_XCODE_ATTRIBUTE_DEBUG_INFORMATION_FORMAT="dwarf-with-dsym"
+    -DCMAKE_XCODE_ATTRIBUTE_GCC_GENERATE_DEBUGGING_SYMBOLS=YES
+    -DCMAKE_XCODE_ATTRIBUTE_COPY_PHASE_STRIP=NO
+    -DCMAKE_XCODE_ATTRIBUTE_STRIP_INSTALLED_PRODUCT=NO
+    -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
+    -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS}
+    -DLLAMA_BUILD_EXAMPLES=${LLAMA_BUILD_EXAMPLES}
+    -DLLAMA_BUILD_TESTS=${LLAMA_BUILD_TESTS}
+    -DLLAMA_BUILD_SERVER=${LLAMA_BUILD_SERVER}
+    -DGGML_METAL_EMBED_LIBRARY=${GGML_METAL_EMBED_LIBRARY}
+    -DGGML_BLAS_DEFAULT=${GGML_BLAS_DEFAULT}
+    -DGGML_METAL=${GGML_METAL}
+    -DGGML_METAL_USE_BF16=${GGML_METAL_USE_BF16}
+    -DGGML_NATIVE=OFF
+    -DGGML_OPENMP=${GGML_OPENMP}
+)
+
+check_required_tool() {
+    local tool=$1
+    local install_message=$2
+
+    if ! command -v $tool &> /dev/null; then
+        echo "Error: $tool is required but not found."
+        echo "$install_message"
+        exit 1
+    fi
+}
+echo "Checking for required tools..."
+check_required_tool "cmake" "Please install CMake 3.28.0 or later (brew install cmake)"
+check_required_tool "xcodebuild" "Please install Xcode and Xcode Command Line Tools (xcode-select --install)"
+check_required_tool "libtool" "Please install libtool which should be available with Xcode Command Line Tools (CLT). Make sure Xcode CLT is installed (xcode-select --install)"
+check_required_tool "dsymutil" "Please install Xcode and Xcode Command Line Tools (xcode-select --install)"
+
+set -e
+
+## Clean up previous builds
+rm -rf build-apple
+rm -rf build-ios-sim
+rm -rf build-ios-device
+rm -rf build-macos
+rm -rf build-visionos
+rm -rf build-visionos-sim
+rm -rf build-tvos-sim
+rm -rf build-tvos-device
+
+# Setup the xcframework build directory structure
+setup_framework_structure() {
+    local build_dir=$1
+    local min_os_version=$2
+    local platform=$3  # "ios", "macos", "visionos", or "tvos"
+    local framework_name="llama"
+
+    echo "Creating ${platform}-style framework structure for ${build_dir}"
+
+    if [[ "$platform" == "macos" ]]; then
+        # macOS versioned structure uses versioned directories
+        mkdir -p ${build_dir}/framework/${framework_name}.framework/Versions/A/Headers
+        mkdir -p ${build_dir}/framework/${framework_name}.framework/Versions/A/Modules
+        mkdir -p ${build_dir}/framework/${framework_name}.framework/Versions/A/Resources
+
+        # Create symbolic links
+        ln -sf A ${build_dir}/framework/${framework_name}.framework/Versions/Current
+        ln -sf Versions/Current/Headers ${build_dir}/framework/${framework_name}.framework/Headers
+        ln -sf Versions/Current/Modules ${build_dir}/framework/${framework_name}.framework/Modules
+        ln -sf Versions/Current/Resources ${build_dir}/framework/${framework_name}.framework/Resources
+        ln -sf Versions/Current/${framework_name} ${build_dir}/framework/${framework_name}.framework/${framework_name}
+
+        # Set header and module paths
+        local header_path=${build_dir}/framework/${framework_name}.framework/Versions/A/Headers/
+        local module_path=${build_dir}/framework/${framework_name}.framework/Versions/A/Modules/
+    else
+        # iOS/VisionOS/tvOS use a flat structure
+        mkdir -p ${build_dir}/framework/${framework_name}.framework/Headers
+        mkdir -p ${build_dir}/framework/${framework_name}.framework/Modules
+
+        # Remove any existing structure to ensure clean build
+        rm -rf ${build_dir}/framework/${framework_name}.framework/Versions
+
+        # Set header and module paths
+        local header_path=${build_dir}/framework/${framework_name}.framework/Headers/
+        local module_path=${build_dir}/framework/${framework_name}.framework/Modules/
+    fi
+
+    # Copy all required headers (common for all platforms)
+    cp include/llama.h             ${header_path}
+    cp ggml/include/ggml.h         ${header_path}
+    cp ggml/include/ggml-alloc.h   ${header_path}
+    cp ggml/include/ggml-backend.h ${header_path}
+    cp ggml/include/ggml-metal.h   ${header_path}
+    cp ggml/include/ggml-cpu.h     ${header_path}
+    cp ggml/include/ggml-blas.h    ${header_path}
+    cp ggml/include/gguf.h         ${header_path}
+
+    # Create module map (common for all platforms)
+    cat > ${module_path}module.modulemap << EOF
+framework module llama {
+    header "llama.h"
+    header "ggml.h"
+    header "ggml-alloc.h"
+    header "ggml-backend.h"
+    header "ggml-metal.h"
+    header "ggml-cpu.h"
+    header "ggml-blas.h"
+    header "gguf.h"
+
+    link "c++"
+    link framework "Accelerate"
+    link framework "Metal"
+    link framework "Foundation"
+
+    export *
+}
+EOF
+
+    # Platform-specific settings for Info.plist
+    local platform_name=""
+    local sdk_name=""
+    local supported_platform=""
+
+    case "$platform" in
+        "ios")
+            platform_name="iphoneos"
+            sdk_name="iphoneos${min_os_version}"
+            supported_platform="iPhoneOS"
+            local plist_path="${build_dir}/framework/${framework_name}.framework/Info.plist"
+            local device_family='    <key>UIDeviceFamily</key>
+    <array>
+        <integer>1</integer>
+        <integer>2</integer>
+    </array>'
+            ;;
+        "macos")
+            platform_name="macosx"
+            sdk_name="macosx${min_os_version}"
+            supported_platform="MacOSX"
+            local plist_path="${build_dir}/framework/${framework_name}.framework/Versions/A/Resources/Info.plist"
+            local device_family=""
+            ;;
+        "visionos")
+            platform_name="xros"
+            sdk_name="xros${min_os_version}"
+            supported_platform="XRPlatform"
+            local plist_path="${build_dir}/framework/${framework_name}.framework/Info.plist"
+            local device_family=""
+            ;;
+        "tvos")
+            platform_name="appletvos"
+            sdk_name="appletvos${min_os_version}"
+            supported_platform="AppleTVOS"
+            local plist_path="${build_dir}/framework/${framework_name}.framework/Info.plist"
+            local device_family='    <key>UIDeviceFamily</key>
+    <array>
+        <integer>3</integer>
+    </array>'
+            ;;
+    esac
+
+    # Create Info.plist
+    cat > ${plist_path} << EOF
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+    <key>CFBundleDevelopmentRegion</key>
+    <string>en</string>
+    <key>CFBundleExecutable</key>
+    <string>llama</string>
+    <key>CFBundleIdentifier</key>
+    <string>org.ggml.llama</string>
+    <key>CFBundleInfoDictionaryVersion</key>
+    <string>6.0</string>
+    <key>CFBundleName</key>
+    <string>llama</string>
+    <key>CFBundlePackageType</key>
+    <string>FMWK</string>
+    <key>CFBundleShortVersionString</key>
+    <string>1.0</string>
+    <key>CFBundleVersion</key>
+    <string>1</string>
+    <key>MinimumOSVersion</key>
+    <string>${min_os_version}</string>
+    <key>CFBundleSupportedPlatforms</key>
+    <array>
+        <string>${supported_platform}</string>
+    </array>${device_family}
+    <key>DTPlatformName</key>
+    <string>${platform_name}</string>
+    <key>DTSDKName</key>
+    <string>${sdk_name}</string>
+</dict>
+</plist>
+EOF
+}
+
+# Create dynamic libraries from static libraries.
+combine_static_libraries() {
+    local build_dir="$1"
+    local release_dir="$2"
+    local platform="$3"  # "ios", "macos", "visionos", or "tvos"
+    local is_simulator="$4"
+    local base_dir="$(pwd)"
+    local framework_name="llama"
+
+    # Determine output path based on platform
+    local output_lib=""
+    if [[ "$platform" == "macos" ]]; then
+        # macOS uses versioned structure
+        output_lib="${build_dir}/framework/${framework_name}.framework/Versions/A/${framework_name}"
+    else
+        # iOS, visionOS, and tvOS use a directory flat structure
+        output_lib="${build_dir}/framework/${framework_name}.framework/${framework_name}"
+    fi
+
+    local libs=(
+        "${base_dir}/${build_dir}/src/${release_dir}/libllama.a"
+        "${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml.a"
+        "${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml-base.a"
+        "${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml-cpu.a"
+        "${base_dir}/${build_dir}/ggml/src/ggml-metal/${release_dir}/libggml-metal.a"
+        "${base_dir}/${build_dir}/ggml/src/ggml-blas/${release_dir}/libggml-blas.a"
+    )
+
+    # Create temporary directory for processing
+    local temp_dir="${base_dir}/${build_dir}/temp"
+    mkdir -p "${temp_dir}"
+
+    # Since we have multiple architectures libtool will find object files that do not
+    # match the target architecture. We suppress these warnings.
+    libtool -static -o "${temp_dir}/combined.a" "${libs[@]}" 2> /dev/null
+
+    # Determine SDK, architectures, and install_name based on platform and simulator flag.
+    local sdk=""
+    local archs=""
+    local min_version_flag=""
+    local install_name=""
+
+    case "$platform" in
+        "ios")
+            if [[ "$is_simulator" == "true" ]]; then
+                sdk="iphonesimulator"
+                archs="arm64 x86_64"
+                min_version_flag="-mios-simulator-version-min=${IOS_MIN_OS_VERSION}"
+            else
+                sdk="iphoneos"
+                archs="arm64"
+                min_version_flag="-mios-version-min=${IOS_MIN_OS_VERSION}"
+            fi
+            install_name="@rpath/llama.framework/llama"
+            ;;
+        "macos")
+            sdk="macosx"
+            archs="arm64 x86_64"
+            min_version_flag="-mmacosx-version-min=${MACOS_MIN_OS_VERSION}"
+            install_name="@rpath/llama.framework/Versions/Current/llama"
+            ;;
+        "visionos")
+            if [[ "$is_simulator" == "true" ]]; then
+                sdk="xrsimulator"
+                archs="arm64 x86_64"
+                min_version_flag="-mtargetos=xros${VISIONOS_MIN_OS_VERSION}-simulator"
+            else
+                sdk="xros"
+                archs="arm64"
+                min_version_flag="-mtargetos=xros${VISIONOS_MIN_OS_VERSION}"
+            fi
+            # Use flat structure for visionOS, same as iOS
+            install_name="@rpath/llama.framework/llama"
+            ;;
+        "tvos")
+            if [[ "$is_simulator" == "true" ]]; then
+                sdk="appletvsimulator"
+                archs="arm64 x86_64"
+                min_version_flag="-mtvos-simulator-version-min=${TVOS_MIN_OS_VERSION}"
+            else
+                sdk="appletvos"
+                archs="arm64"
+                min_version_flag="-mtvos-version-min=${TVOS_MIN_OS_VERSION}"
+            fi
+            install_name="@rpath/llama.framework/llama"
+            ;;
+    esac
+
+    # Build architecture flags
+    local arch_flags=""
+    for arch in $archs; do
+        arch_flags+=" -arch $arch"
+    done
+
+    # Create dynamic library
+    echo "Creating dynamic library for ${platform}."
+    xcrun -sdk $sdk clang++ -dynamiclib \
+        -isysroot $(xcrun --sdk $sdk --show-sdk-path) \
+        $arch_flags \
+        $min_version_flag \
+        -Wl,-force_load,"${temp_dir}/combined.a" \
+        -framework Foundation -framework Metal -framework Accelerate \
+        -install_name "$install_name" \
+        -o "${base_dir}/${output_lib}"
+
+    # Platform-specific post-processing for device builds
+    if [[ "$is_simulator" == "false" ]]; then
+        if command -v vtool &>/dev/null; then
+            case "$platform" in
+                "ios")
+                    echo "Marking binary as a framework binary for iOS..."
+                    vtool -set-build-version ios ${IOS_MIN_OS_VERSION} ${IOS_MIN_OS_VERSION} -replace \
+                        -output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}"
+                    ;;
+                "visionos")
+                    echo "Marking binary as a framework binary for visionOS..."
+                    vtool -set-build-version xros ${VISIONOS_MIN_OS_VERSION} ${VISIONOS_MIN_OS_VERSION} -replace \
+                        -output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}"
+                    ;;
+                "tvos")
+                    echo "Marking binary as a framework binary for tvOS..."
+                    vtool -set-build-version tvos ${TVOS_MIN_OS_VERSION} ${TVOS_MIN_OS_VERSION} -replace \
+                        -output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}"
+                    ;;
+            esac
+        else
+            echo "Warning: vtool not found. Binary may not pass App Store validation."
+        fi
+    fi
+
+    echo "Creating properly formatted dSYM..."
+    # Create a separate directory for dSYMs for all platforms
+    mkdir -p "${base_dir}/${build_dir}/dSYMs"
+
+    # iOS and visionOS style dSYM (flat structure)
+    if [[ "$platform" == "ios" || "$platform" == "visionos" || "$platform" == "tvos" ]]; then
+        # Generate dSYM in the dSYMs directory
+        xcrun dsymutil "${base_dir}/${output_lib}" -o "${base_dir}/${build_dir}/dSYMs/llama.dSYM"
+
+        # Create a copy of the binary that will be stripped
+        cp "${base_dir}/${output_lib}" "${temp_dir}/binary_to_strip"
+
+        # Strip debug symbols from the copy
+        xcrun strip -S "${temp_dir}/binary_to_strip" -o "${temp_dir}/stripped_lib"
+
+        # Replace the original with the stripped version
+        mv "${temp_dir}/stripped_lib" "${base_dir}/${output_lib}"
+    else
+        # macOS style dSYM
+        # First strip debug info to a separate file
+        xcrun strip -S "${base_dir}/${output_lib}" -o "${temp_dir}/stripped_lib"
+
+        # Generate dSYM in the dSYMs directory
+        xcrun dsymutil "${base_dir}/${output_lib}" -o "${base_dir}/${build_dir}/dSYMs/llama.dSYM"
+
+        # Replace original binary with stripped version
+        mv "${temp_dir}/stripped_lib" "${base_dir}/${output_lib}"
+    fi
+
+    # Remove any automatically generated dSYM files in the framework structure as they will
+    # otherwise case Invalid Bundle Structure validation errors.
+    if [ -d "${base_dir}/${output_lib}.dSYM" ]; then
+        echo "Removing generated dSYM file in framework structure: ${base_dir}/${output_lib}.dSYM"
+        rm -rf "${base_dir}/${output_lib}.dSYM"
+    fi
+
+    # Clean up
+    rm -rf "${temp_dir}"
+}
+
+echo "Building for iOS simulator..."
+cmake -B build-ios-sim -G Xcode \
+    "${COMMON_CMAKE_ARGS[@]}" \
+    -DCMAKE_OSX_DEPLOYMENT_TARGET=${IOS_MIN_OS_VERSION} \
+    -DIOS=ON \
+    -DCMAKE_SYSTEM_NAME=iOS \
+    -DCMAKE_OSX_SYSROOT=iphonesimulator \
+    -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \
+    -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphonesimulator \
+    -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
+    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
+    -S .
+cmake --build build-ios-sim --config Release -- -quiet
+
+echo "Building for iOS devices..."
+cmake -B build-ios-device -G Xcode \
+    "${COMMON_CMAKE_ARGS[@]}" \
+    -DCMAKE_OSX_DEPLOYMENT_TARGET=${IOS_MIN_OS_VERSION} \
+    -DCMAKE_OSX_SYSROOT=iphoneos \
+    -DCMAKE_OSX_ARCHITECTURES="arm64" \
+    -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphoneos \
+    -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
+    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
+    -S .
+cmake --build build-ios-device --config Release -- -quiet
+
+echo "Building for macOS..."
+cmake -B build-macos -G Xcode \
+    "${COMMON_CMAKE_ARGS[@]}" \
+    -DCMAKE_OSX_DEPLOYMENT_TARGET=${MACOS_MIN_OS_VERSION} \
+    -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \
+    -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
+    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
+    -S .
+cmake --build build-macos --config Release -- -quiet
+
+echo "Building for visionOS..."
+cmake -B build-visionos -G Xcode \
+    "${COMMON_CMAKE_ARGS[@]}" \
+    -DCMAKE_OSX_DEPLOYMENT_TARGET=${VISIONOS_MIN_OS_VERSION} \
+    -DCMAKE_OSX_ARCHITECTURES="arm64" \
+    -DCMAKE_SYSTEM_NAME=visionOS \
+    -DCMAKE_OSX_SYSROOT=xros \
+    -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xros \
+    -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_C_FLAGS}" \
+    -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_CXX_FLAGS}" \
+    -S .
+cmake --build build-visionos --config Release -- -quiet
+
+echo "Building for visionOS simulator..."
+cmake -B build-visionos-sim -G Xcode \
+    "${COMMON_CMAKE_ARGS[@]}" \
+    -DCMAKE_OSX_DEPLOYMENT_TARGET=${VISIONOS_MIN_OS_VERSION} \
+    -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \
+    -DCMAKE_SYSTEM_NAME=visionOS \
+    -DCMAKE_OSX_SYSROOT=xrsimulator \
+    -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xrsimulator \
+    -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_C_FLAGS}" \
+    -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_CXX_FLAGS}" \
+    -S .
+cmake --build build-visionos-sim --config Release -- -quiet
+
+# Add tvOS builds (might need the same u_int definitions as watchOS and visionOS)
+echo "Building for tvOS simulator..."
+cmake -B build-tvos-sim -G Xcode \
+    "${COMMON_CMAKE_ARGS[@]}" \
+    -DCMAKE_OSX_DEPLOYMENT_TARGET=${TVOS_MIN_OS_VERSION} \
+    -DCMAKE_SYSTEM_NAME=tvOS \
+    -DCMAKE_OSX_SYSROOT=appletvsimulator \
+    -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \
+    -DGGML_METAL=ON \
+    -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=appletvsimulator \
+    -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
+    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
+    -S .
+cmake --build build-tvos-sim --config Release -- -quiet
+
+echo "Building for tvOS devices..."
+cmake -B build-tvos-device -G Xcode \
+    "${COMMON_CMAKE_ARGS[@]}" \
+    -DCMAKE_OSX_DEPLOYMENT_TARGET=${TVOS_MIN_OS_VERSION} \
+    -DCMAKE_SYSTEM_NAME=tvOS \
+    -DCMAKE_OSX_SYSROOT=appletvos \
+    -DCMAKE_OSX_ARCHITECTURES="arm64" \
+    -DGGML_METAL=ON \
+    -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=appletvos \
+    -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
+    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
+    -S .
+cmake --build build-tvos-device --config Release -- -quiet
+
+# Setup frameworks and copy binaries and headers
+echo "Setting up framework structures..."
+setup_framework_structure "build-ios-sim" ${IOS_MIN_OS_VERSION} "ios"
+setup_framework_structure "build-ios-device" ${IOS_MIN_OS_VERSION} "ios"
+setup_framework_structure "build-macos" ${MACOS_MIN_OS_VERSION} "macos"
+setup_framework_structure "build-visionos" ${VISIONOS_MIN_OS_VERSION} "visionos"
+setup_framework_structure "build-visionos-sim" ${VISIONOS_MIN_OS_VERSION} "visionos"
+setup_framework_structure "build-tvos-sim" ${TVOS_MIN_OS_VERSION} "tvos"
+setup_framework_structure "build-tvos-device" ${TVOS_MIN_OS_VERSION} "tvos"
+
+# Create dynamic libraries from static libraries
+echo "Creating dynamic libraries from static libraries..."
+combine_static_libraries "build-ios-sim" "Release-iphonesimulator" "ios" "true"
+combine_static_libraries "build-ios-device" "Release-iphoneos" "ios" "false"
+combine_static_libraries "build-macos" "Release" "macos" "false"
+combine_static_libraries "build-visionos" "Release-xros" "visionos" "false"
+combine_static_libraries "build-visionos-sim" "Release-xrsimulator" "visionos" "true"
+combine_static_libraries "build-tvos-sim" "Release-appletvsimulator" "tvos" "true"
+combine_static_libraries "build-tvos-device" "Release-appletvos" "tvos" "false"
+
+# Create XCFramework with correct debug symbols paths
+echo "Creating XCFramework..."
+xcodebuild -create-xcframework \
+    -framework $(pwd)/build-ios-sim/framework/llama.framework \
+    -debug-symbols $(pwd)/build-ios-sim/dSYMs/llama.dSYM \
+    -framework $(pwd)/build-ios-device/framework/llama.framework \
+    -debug-symbols $(pwd)/build-ios-device/dSYMs/llama.dSYM \
+    -framework $(pwd)/build-macos/framework/llama.framework \
+    -debug-symbols $(pwd)/build-macos/dSYMS/llama.dSYM \
+    -framework $(pwd)/build-visionos/framework/llama.framework \
+    -debug-symbols $(pwd)/build-visionos/dSYMs/llama.dSYM \
+    -framework $(pwd)/build-visionos-sim/framework/llama.framework \
+    -debug-symbols $(pwd)/build-visionos-sim/dSYMs/llama.dSYM \
+    -framework $(pwd)/build-tvos-device/framework/llama.framework \
+    -debug-symbols $(pwd)/build-tvos-device/dSYMs/llama.dSYM \
+    -framework $(pwd)/build-tvos-sim/framework/llama.framework \
+    -debug-symbols $(pwd)/build-tvos-sim/dSYMs/llama.dSYM \
+    -output $(pwd)/build-apple/llama.xcframework
--- a/ci/README.md
+++ b/ci/README.md
@@ -1,11 +1,11 @@
 # CI

-In addition to [Github Actions](https://github.com/ggerganov/llama.cpp/actions) `llama.cpp` uses a custom CI framework:
+In addition to [Github Actions](https://github.com/ggml-org/llama.cpp/actions) `llama.cpp` uses a custom CI framework:

 https://github.com/ggml-org/ci

 It monitors the `master` branch for new commits and runs the
-[ci/run.sh](https://github.com/ggerganov/llama.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us
+[ci/run.sh](https://github.com/ggml-org/llama.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us
 to execute heavier workloads compared to just using Github Actions. Also with time, the cloud instances will be scaled
 to cover various hardware architectures, including GPU and Apple Silicon instances.

--- a/ci/run.sh
+++ b/ci/run.sh
@@ -299,7 +299,7 @@ function gg_run_open_llama_7b_v2 {
    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log

-    python ../examples/convert_legacy_llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
+    python3 ../examples/convert_legacy_llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

    model_f16="${path_models}/ggml-model-f16.gguf"
    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@@ -352,10 +352,10 @@ function gg_run_open_llama_7b_v2 {

    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

-    (time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 10 -c 0     ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 99 -c 0     ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0     ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0     ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log

    function check_ppl {
        qnt="$1"
@@ -433,7 +433,7 @@ function gg_run_pythia_1_4b {
    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log

-    python ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
+    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

    model_f16="${path_models}/ggml-model-f16.gguf"
    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@@ -564,7 +564,7 @@ function gg_run_pythia_2_8b {
    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log

-    python ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
+    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

    model_f16="${path_models}/ggml-model-f16.gguf"
    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@@ -699,7 +699,7 @@ function gg_run_embd_bge_small {
    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log

-    python ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
+    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

    model_f16="${path_models}/ggml-model-f16.gguf"
    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@@ -747,7 +747,7 @@ function gg_run_rerank_tiny {
    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log

-    python ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
+    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

    model_f16="${path_models}/ggml-model-f16.gguf"

@@ -814,8 +814,8 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
    mkdir -p ${mnt_models}
    ln -sfn ${mnt_models} ${SRC}/models-mnt

-    # Create a fresh python venv and enter it
-    if ! python -m venv "$MNT/venv"; then
+    # Create a fresh python3 venv and enter it
+    if ! python3 -m venv "$MNT/venv"; then
        echo "Error: Failed to create Python virtual environment at $MNT/venv."
        exit 1
    fi
--- a/cmake/build-info.cmake
+++ b/cmake/build-info.cmake
@@ -44,7 +44,7 @@ if(MSVC)
    set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME})
 else()
    execute_process(
-        COMMAND sh -c "$@ --version | head -1" _ ${CMAKE_C_COMPILER}
+        COMMAND sh -c "\"$@\" --version | head -1" _ ${CMAKE_C_COMPILER}
        OUTPUT_VARIABLE OUT
        OUTPUT_STRIP_TRAILING_WHITESPACE
    )
--- a/cmake/llama-config.cmake.in
+++ b/cmake/llama-config.cmake.in
@@ -3,159 +3,13 @@ set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
 set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
 set(LLAMA_SHARED_LIB   @BUILD_SHARED_LIBS@)

-set(GGML_STATIC @GGML_STATIC@)
-set(GGML_NATIVE @GGML_NATIVE@)
-set(GGML_LTO    @GGML_LTO@)
-set(GGML_CCACHE @GGML_CCACHE@)
-set(GGML_AVX    @GGML_AVX@)
-set(GGML_AVX2   @GGML_AVX2@)
-set(GGML_AVX512 @GGML_AVX512@)
-set(GGML_AVX512_VBMI @GGML_AVX512_VBMI@)
-set(GGML_AVX512_VNNI @GGML_AVX512_VNNI@)
-set(GGML_AVX512_BF16 @GGML_AVX512_BF16@)
-set(GGML_AMX_TILE @GGML_AMX_TILE@)
-set(GGML_AMX_INT8 @GGML_AMX_INT8@)
-set(GGML_AMX_BF16 @GGML_AMX_BF16@)
-set(GGML_FMA  @GGML_FMA@)
-set(GGML_LASX @GGML_LASX@)
-set(GGML_LSX  @GGML_LSX@)
-set(GGML_RVV  @GGML_RVV@)
-set(GGML_SVE  @GGML_SVE@)
-
-set(GGML_ACCELERATE @GGML_ACCELERATE@)
-set(GGML_OPENMP  @GGML_OPENMP@)
-set(GGML_CPU_HBM @GGML_CPU_HBM@)
-set(GGML_BLAS_VENDOR @GGML_BLAS_VENDOR@)
-
-set(GGML_CUDA_FORCE_MMQ    @GGML_CUDA_FORCE_MMQ@)
-set(GGML_CUDA_FORCE_CUBLAS @GGML_CUDA_FORCE_CUBLAS@)
-set(GGML_CUDA_F16          @GGML_CUDA_F16@)
-set(GGML_CUDA_PEER_MAX_BATCH_SIZE @GGML_CUDA_PEER_MAX_BATCH_SIZE@)
-set(GGML_CUDA_NO_PEER_COPY  @GGML_CUDA_NO_PEER_COPY@)
-set(GGML_CUDA_NO_VMM        @GGML_CUDA_NO_VMM@)
-set(GGML_CUDA_FA_ALL_QUANTS @GGML_CUDA_FA_ALL_QUANTS@)
-set(GGML_CUDA_GRAPHS        @GGML_CUDA_GRAPHS@)
-
-set(GGML_HIP_UMA @GGML_HIP_UMA@)
-
-set(GGML_VULKAN_CHECK_RESULTS @GGML_VULKAN_CHECK_RESULTS@)
-set(GGML_VULKAN_DEBUG         @GGML_VULKAN_DEBUG@)
-set(GGML_VULKAN_MEMORY_DEBUG  @GGML_VULKAN_MEMORY_DEBUG@)
-set(GGML_VULKAN_SHADER_DEBUG_INFO @GGML_VULKAN_SHADER_DEBUG_INFO@)
-set(GGML_VULKAN_PERF      @GGML_VULKAN_PERF@)
-set(GGML_VULKAN_VALIDATE  @GGML_VULKAN_VALIDATE@)
-set(GGML_VULKAN_RUN_TESTS @GGML_VULKAN_RUN_TESTS@)
-
-set(GGML_METAL_USE_BF16 @GGML_METAL_USE_BF16@)
-set(GGML_METAL_NDEBUG   @GGML_METAL_NDEBUG@)
-set(GGML_METAL_SHADER_DEBUG  @GGML_METAL_SHADER_DEBUG@)
-set(GGML_METAL_EMBED_LIBRARY @GGML_METAL_EMBED_LIBRARY@)
-set(GGML_METAL_MACOSX_VERSION_MIN @GGML_METAL_MACOSX_VERSION_MIN@)
-set(GGML_METAL_STD @GGML_METAL_STD@)
-
-set(GGML_SYCL_F16    @GGML_SYCL_F16@)
-set(GGML_SYCL_TARGET @GGML_SYCL_TARGET@)
-set(GGML_SYCL_DEVICE_ARCH @GGML_SYCL_DEVICE_ARCH@)
-
-
@PACKAGE_INIT@

 set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
 set_and_check(LLAMA_LIB_DIR     "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
 set_and_check(LLAMA_BIN_DIR     "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")

-find_package(Threads REQUIRED)
-
-set(_llama_transient_defines "@GGML_TRANSIENT_DEFINES@")
-set(_llama_link_deps "")
-set(_llama_link_opts "")
-foreach(_ggml_lib ggml ggml-base)
-    string(REPLACE "-" "_" _ggml_lib_var "${_ggml_lib}_LIBRARY")
-    find_library(${_ggml_lib_var} ${_ggml_lib}
-        REQUIRED
-        HINTS ${LLAMA_LIB_DIR}
-        NO_CMAKE_FIND_ROOT_PATH
-    )
-    list(APPEND _llama_link_deps "${${_ggml_lib_var}}")
-    message(STATUS "Found ${${_ggml_lib_var}}")
-endforeach()
-
-foreach(backend amx blas cann cpu cuda hip kompute metal musa rpc sycl vulkan)
-    string(TOUPPER "GGML_${backend}" backend_id)
-    set(_ggml_lib "ggml-${backend}")
-    string(REPLACE "-" "_" _ggml_lib_var "${_ggml_lib}_LIBRARY")
-
-    find_library(${_ggml_lib_var} ${_ggml_lib}
-        HINTS ${LLAMA_LIB_DIR}
-        NO_CMAKE_FIND_ROOT_PATH
-    )
-    if(${_ggml_lib_var})
-        list(APPEND _llama_link_deps "${${_ggml_lib_var}}")
-        set(${backend_id} ON)
-        message(STATUS "Found backend ${${_ggml_lib_var}}")
-    else()
-        set(${backend_id} OFF)
-    endif()
-endforeach()
-
-if (NOT LLAMA_SHARED_LIB)
-    if (APPLE AND GGML_ACCELERATE)
-        find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED)
-        list(APPEND _llama_link_deps ${ACCELERATE_FRAMEWORK})
-    endif()
-
-    if (GGML_OPENMP)
-        find_package(OpenMP REQUIRED)
-        list(APPEND _llama_link_deps OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
-    endif()
-
-    if (GGML_CPU_HBM)
-        find_library(memkind memkind REQUIRED)
-        list(APPEND _llama_link_deps memkind)
-    endif()
-
-    if (GGML_BLAS)
-        find_package(BLAS REQUIRED)
-        list(APPEND _llama_link_deps ${BLAS_LIBRARIES})
-        list(APPEND _llama_link_opts ${BLAS_LINKER_FLAGS})
-    endif()
-
-    if (GGML_CUDA)
-        find_package(CUDAToolkit REQUIRED)
-    endif()
-
-    if (GGML_METAL)
-        find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
-        find_library(METAL_FRAMEWORK    Metal REQUIRED)
-        find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
-        list(APPEND _llama_link_deps ${FOUNDATION_LIBRARY}
-                                     ${METAL_FRAMEWORK} ${METALKIT_FRAMEWORK})
-    endif()
-
-    if (GGML_VULKAN)
-        find_package(Vulkan REQUIRED)
-        list(APPEND _llama_link_deps Vulkan::Vulkan)
-    endif()
-
-    if (GGML_HIP)
-        find_package(hip     REQUIRED)
-        find_package(hipblas REQUIRED)
-        find_package(rocblas REQUIRED)
-        list(APPEND _llama_link_deps hip::host roc::rocblas roc::hipblas)
-    endif()
-
-    if (GGML_SYCL)
-        find_package(DNNL)
-        if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
-            list(APPEND _llama_link_deps DNNL::dnnl)
-        endif()
-        if (WIN32)
-            find_package(IntelSYCL REQUIRED)
-            find_package(MKL       REQUIRED)
-            list(APPEND _llama_link_deps IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
-        endif()
-    endif()
-endif()
+find_package(ggml REQUIRED HINTS ${LLAMA_LIB_DIR}/cmake)

 find_library(llama_LIBRARY llama
    REQUIRED
@@ -167,12 +21,10 @@ add_library(llama UNKNOWN IMPORTED)
 set_target_properties(llama
    PROPERTIES
        INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
-        INTERFACE_LINK_LIBRARIES "${_llama_link_deps}"
-        INTERFACE_LINK_OPTIONS   "${_llama_link_opts}"
-        INTERFACE_COMPILE_DEFINITIONS "${_llama_transient_defines}"
+        INTERFACE_LINK_LIBRARIES "ggml::ggml;ggml::ggml-base;"
        IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
        IMPORTED_LOCATION "${llama_LIBRARY}"
-        INTERFACE_COMPILE_FEATURES cxx_std_11
-        POSITION_INDEPENDENT_CODE ON )
+        INTERFACE_COMPILE_FEATURES c_std_90
+        POSITION_INDEPENDENT_CODE ON)

 check_required_components(Llama)
--- a/cmake/llama.pc.in
+++ b/cmake/llama.pc.in
@@ -1,10 +1,10 @@
 prefix=@CMAKE_INSTALL_PREFIX@
-exec_prefix=${prefix}
-libdir=${exec_prefix}/lib
-includedir=${prefix}/include
+exec_prefix=@CMAKE_INSTALL_PREFIX@
+libdir=@CMAKE_INSTALL_FULL_LIBDIR@
+includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@

 Name: llama
 Description: Port of Facebook's LLaMA model in C/C++
-Version: @PROJECT_VERSION@
-Libs: -L${libdir} -lggml  -lggml-base -lllama
+Version: @LLAMA_INSTALL_VERSION@
+Libs: -L${libdir} -lggml -lggml-base -lllama
 Cflags: -I${includedir}
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -56,14 +56,19 @@ add_library(${TARGET} STATIC
    arg.cpp
    arg.h
    base64.hpp
+    chat.cpp
+    chat.h
    common.cpp
    common.h
    console.cpp
    console.h
    json-schema-to-grammar.cpp
    json.hpp
+    llguidance.cpp
    log.cpp
    log.h
+    minja/chat-template.hpp
+    minja/minja.hpp
    ngram-cache.cpp
    ngram-cache.h
    sampling.cpp
@@ -87,6 +92,50 @@ if (LLAMA_CURL)
    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
 endif ()

+if (LLAMA_LLGUIDANCE)
+    include(ExternalProject)
+    set(LLGUIDANCE_SRC ${CMAKE_BINARY_DIR}/llguidance/source)
+    set(LLGUIDANCE_PATH ${LLGUIDANCE_SRC}/target/release)
+
+    # Set the correct library file extension based on platform
+    if (WIN32)
+        set(LLGUIDANCE_LIB_NAME "llguidance.lib")
+        # Add Windows-specific libraries
+        set(LLGUIDANCE_PLATFORM_LIBS
+            ws2_32    # Windows Sockets API
+            userenv   # For GetUserProfileDirectoryW
+            ntdll     # For NT functions
+            bcrypt    # For BCryptGenRandom
+        )
+    else()
+        set(LLGUIDANCE_LIB_NAME "libllguidance.a")
+        set(LLGUIDANCE_PLATFORM_LIBS "")
+    endif()
+
+    ExternalProject_Add(llguidance_ext
+        GIT_REPOSITORY https://github.com/guidance-ai/llguidance
+        # v0.6.12:
+        GIT_TAG ced1c9023d47ec194fa977932d35ce65c2ebfc09
+        PREFIX ${CMAKE_BINARY_DIR}/llguidance
+        SOURCE_DIR ${LLGUIDANCE_SRC}
+        BUILD_IN_SOURCE TRUE
+        CONFIGURE_COMMAND ""
+        BUILD_COMMAND cargo build --release
+        INSTALL_COMMAND ""
+        BUILD_BYPRODUCTS ${LLGUIDANCE_PATH}/${LLGUIDANCE_LIB_NAME} ${LLGUIDANCE_PATH}/llguidance.h
+        UPDATE_COMMAND ""
+    )
+    target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_LLGUIDANCE)
+
+    add_library(llguidance STATIC IMPORTED)
+    set_target_properties(llguidance PROPERTIES IMPORTED_LOCATION ${LLGUIDANCE_PATH}/${LLGUIDANCE_LIB_NAME})
+    add_dependencies(llguidance llguidance_ext)
+
+    target_include_directories(${TARGET} PRIVATE ${LLGUIDANCE_PATH})
+    # Add platform libraries to the main target
+    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance ${LLGUIDANCE_PLATFORM_LIBS})
+endif ()
+
 target_include_directories(${TARGET} PUBLIC .)
 target_compile_features   (${TARGET} PUBLIC cxx_std_17)
 target_link_libraries     (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2,6 +2,7 @@

 #include "log.h"
 #include "sampling.h"
+#include "chat.h"

 #include <algorithm>
 #include <climits>
@@ -133,7 +134,8 @@ static void common_params_handle_model_default(
        const std::string & model_url,
        std::string & hf_repo,
        std::string & hf_file,
-        const std::string & hf_token) {
+        const std::string & hf_token,
+        const std::string & model_default) {
    if (!hf_repo.empty()) {
        // short-hand to avoid specifying --hf-file -> default it to --model
        if (hf_file.empty()) {
@@ -163,7 +165,7 @@ static void common_params_handle_model_default(
            model = fs_get_cache_file(string_split<std::string>(f, '/').back());
        }
    } else if (model.empty()) {
-        model = DEFAULT_MODEL_PATH;
+        model = model_default;
    }
 }

@@ -299,8 +301,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
    }

    // TODO: refactor model params in a common struct
-    common_params_handle_model_default(params.model,         params.model_url,         params.hf_repo,         params.hf_file,         params.hf_token);
-    common_params_handle_model_default(params.vocoder.model, params.vocoder.model_url, params.vocoder.hf_repo, params.vocoder.hf_file, params.hf_token);
+    common_params_handle_model_default(params.model,             params.model_url,             params.hf_repo,             params.hf_file,             params.hf_token, DEFAULT_MODEL_PATH);
+    common_params_handle_model_default(params.speculative.model, params.speculative.model_url, params.speculative.hf_repo, params.speculative.hf_file, params.hf_token, "");
+    common_params_handle_model_default(params.vocoder.model,     params.vocoder.model_url,     params.vocoder.hf_repo,     params.vocoder.hf_file,     params.hf_token, "");

    if (params.escape) {
        string_process_escapes(params.prompt);
@@ -323,6 +326,14 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
        throw std::invalid_argument("error: either --embedding or --reranking can be specified, but not both");
    }

+    if (!params.chat_template.empty() && !common_chat_verify_template(params.chat_template, params.use_jinja)) {
+        throw std::runtime_error(string_format(
+            "error: the supplied chat template is not supported: %s%s\n",
+            params.chat_template.c_str(),
+            params.use_jinja ? "" : "\nnote: llama.cpp was started without --jinja, we only support commonly used templates"
+        ));
+    }
+
    return true;
 }

@@ -355,6 +366,112 @@ static void common_params_print_usage(common_params_context & ctx_arg) {
    print_options(specific_options);
 }

+static void common_params_print_completion(common_params_context & ctx_arg) {
+    std::vector<common_arg *> common_options;
+    std::vector<common_arg *> sparam_options;
+    std::vector<common_arg *> specific_options;
+
+    for (auto & opt : ctx_arg.options) {
+        if (opt.is_sparam) {
+            sparam_options.push_back(&opt);
+        } else if (opt.in_example(ctx_arg.ex)) {
+            specific_options.push_back(&opt);
+        } else {
+            common_options.push_back(&opt);
+        }
+    }
+
+    printf("_llama_completions() {\n");
+    printf("    local cur prev opts\n");
+    printf("    COMPREPLY=()\n");
+    printf("    cur=\"${COMP_WORDS[COMP_CWORD]}\"\n");
+    printf("    prev=\"${COMP_WORDS[COMP_CWORD-1]}\"\n\n");
+
+    printf("    opts=\"");
+    auto print_options = [](const std::vector<common_arg *> & options) {
+        for (const common_arg * opt : options) {
+            for (const char * arg : opt->args) {
+                printf("%s ", arg);
+            }
+        }
+    };
+
+    print_options(common_options);
+    print_options(sparam_options);
+    print_options(specific_options);
+    printf("\"\n\n");
+
+    printf("    case \"$prev\" in\n");
+    printf("        --model)\n");
+    printf("            COMPREPLY=( $(compgen -f -X '!*.gguf' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
+    printf("            return 0\n");
+    printf("            ;;\n");
+    printf("        --grammar-file)\n");
+    printf("            COMPREPLY=( $(compgen -f -X '!*.gbnf' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
+    printf("            return 0\n");
+    printf("            ;;\n");
+    printf("        --chat-template-file)\n");
+    printf("            COMPREPLY=( $(compgen -f -X '!*.jinja' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
+    printf("            return 0\n");
+    printf("            ;;\n");
+    printf("        *)\n");
+    printf("            COMPREPLY=( $(compgen -W \"${opts}\" -- \"$cur\") )\n");
+    printf("            return 0\n");
+    printf("            ;;\n");
+    printf("    esac\n");
+    printf("}\n\n");
+
+    std::set<std::string> executables = {
+        "llama-batched",
+        "llama-batched-bench",
+        "llama-bench",
+        "llama-cli",
+        "llama-convert-llama2c-to-ggml",
+        "llama-cvector-generator",
+        "llama-embedding",
+        "llama-eval-callback",
+        "llama-export-lora",
+        "llama-gbnf-validator",
+        "llama-gen-docs",
+        "llama-gguf",
+        "llama-gguf-hash",
+        "llama-gguf-split",
+        "llama-gritlm",
+        "llama-imatrix",
+        "llama-infill",
+        "llama-llava-cli",
+        "llama-llava-clip-quantize-cli",
+        "llama-lookahead",
+        "llama-lookup",
+        "llama-lookup-create",
+        "llama-lookup-merge",
+        "llama-lookup-stats",
+        "llama-minicpmv-cli",
+        "llama-parallel",
+        "llama-passkey",
+        "llama-perplexity",
+        "llama-q8dot",
+        "llama-quantize",
+        "llama-quantize-stats",
+        "llama-qwen2vl-cli",
+        "llama-retrieval",
+        "llama-run",
+        "llama-save-load-state",
+        "llama-server",
+        "llama-simple",
+        "llama-simple-chat",
+        "llama-speculative",
+        "llama-speculative-simple",
+        "llama-tokenize",
+        "llama-tts",
+        "llama-vdot"
+    };
+
+    for (const auto& exe : executables) {
+        printf("complete -F _llama_completions %s\n", exe.c_str());
+    }
+}
+
 static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & value) {
    std::vector<ggml_backend_dev_t> devices;
    auto dev_names = string_split<std::string>(value, ',');
@@ -376,6 +493,30 @@ static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & val
    return devices;
 }

+static void add_rpc_devices(std::string servers) {
+    auto rpc_servers = string_split<std::string>(servers, ',');
+    if (rpc_servers.empty()) {
+        throw std::invalid_argument("no RPC servers specified");
+    }
+    ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
+    if (!rpc_reg) {
+        throw std::invalid_argument("failed to find RPC backend");
+    }
+    typedef ggml_backend_dev_t (*ggml_backend_rpc_add_device_t)(const char * endpoint);
+    ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
+    if (!ggml_backend_rpc_add_device_fn) {
+        throw std::invalid_argument("failed to find RPC device add function");
+    }
+    for (const auto & server : rpc_servers) {
+        ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
+        if (dev) {
+            ggml_backend_device_register(dev);
+        } else {
+            throw std::invalid_argument("failed to register RPC device");
+        }
+    }
+}
+
 bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
    auto ctx_arg = common_params_parser_init(params, ex, print_usage);
    const common_params params_org = ctx_arg.params; // the example can modify the default params
@@ -392,6 +533,10 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
            }
            exit(0);
        }
+        if (ctx_arg.params.completion) {
+            common_params_print_completion(ctx_arg);
+            exit(0);
+        }
    } catch (const std::invalid_argument & ex) {
        fprintf(stderr, "%s\n", ex.what());
        ctx_arg.params = params_org;
@@ -460,6 +605,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            exit(0);
        }
    ));
+    add_opt(common_arg(
+        {"--completion-bash"},
+        "print source-able bash completion script for llama.cpp",
+        [](common_params & params) {
+            params.completion = true;
+        }
+    ));
    add_opt(common_arg(
        {"--verbose-prompt"},
        string_format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"),
@@ -640,7 +792,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ));
    add_opt(common_arg(
        {"--no-context-shift"},
-        string_format("disables context shift on inifinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
+        string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
        [](common_params & params) {
            params.ctx_shift = false;
        }
@@ -661,13 +813,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_env("LLAMA_ARG_FLASH_ATTN"));
    add_opt(common_arg(
        {"-p", "--prompt"}, "PROMPT",
-        ex == LLAMA_EXAMPLE_MAIN
-            ? "prompt to start generation with\nif -cnv is set, this will be used as system prompt"
-            : "prompt to start generation with",
+        "prompt to start generation with; for system message, use -sys",
        [](common_params & params, const std::string & value) {
            params.prompt = value;
        }
    ).set_excludes({LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"-sys", "--system-prompt"}, "PROMPT",
+        "system prompt to use with model (if applicable, depending on chat template)",
+        [](common_params & params, const std::string & value) {
+            params.system_prompt = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
    add_opt(common_arg(
        {"--no-perf"},
        string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
@@ -792,6 +949,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
        }
    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    add_opt(common_arg(
+        {"-st", "--single-turn"},
+        "run conversation for a single turn only, then exit when done\n"
+        "will not be interactive if first turn is predefined with --prompt\n"
+        "(default: false)",
+        [](common_params & params) {
+            params.single_turn = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
    add_opt(common_arg(
        {"-i", "--interactive"},
        string_format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"),
@@ -843,7 +1009,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params) {
            params.warmup = false;
        }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING}));
    add_opt(common_arg(
        {"--spm-infill"},
        string_format(
@@ -912,6 +1078,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.sampling.min_p = std::stof(value);
        }
    ).set_sparam());
+    add_opt(common_arg(
+        {"--top-nsigma"}, "N",
+        string_format("top-n-sigma sampling (default: %.1f, -1.0 = disabled)", params.sampling.top_n_sigma),
+        [](common_params & params, const std::string & value) {
+            params.sampling.top_n_sigma = std::stof(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}).set_sparam());
    add_opt(common_arg(
        {"--xtc-probability"}, "N",
        string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability),
@@ -1385,7 +1558,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            {"--rpc"}, "SERVERS",
            "comma separated list of RPC servers",
            [](common_params & params, const std::string & value) {
-                params.rpc_servers = value;
+                add_rpc_devices(value);
+                GGML_UNUSED(params);
            }
        ).set_env("LLAMA_ARG_RPC"));
    }
@@ -1410,7 +1584,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        "- isolate: only spawn threads on CPUs on the node that execution started on\n"
        "- numactl: use the CPU map provided by numactl\n"
        "if run without this previously, it is recommended to drop the system page cache before using this\n"
-        "see https://github.com/ggerganov/llama.cpp/issues/1437",
+        "see https://github.com/ggml-org/llama.cpp/issues/1437",
        [](common_params & params, const std::string & value) {
            /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
            else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
@@ -1430,15 +1604,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        {"--list-devices"},
        "print list of available devices and exit",
        [](common_params &) {
-            printf("Available devices:\n");
+            std::vector<ggml_backend_dev_t> rpc_devices;
+            std::vector<ggml_backend_dev_t> all_devices;
            for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
                auto * dev = ggml_backend_dev_get(i);
                if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
-                    size_t free, total;
-                    ggml_backend_dev_memory(dev, &free, &total);
-                    printf("  %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
+                    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
+                    if (ggml_backend_reg_name(reg) == std::string("RPC")) {
+                        rpc_devices.push_back(dev);
+                    } else {
+                        all_devices.push_back(dev);
+                    }
                }
            }
+            // insert RPC devices in front
+            all_devices.insert(all_devices.begin(), rpc_devices.begin(), rpc_devices.end());
+            printf("Available devices:\n");
+            for (size_t i = 0; i < all_devices.size(); ++i) {
+                auto * dev = all_devices[i];
+                size_t free, total;
+                ggml_backend_dev_memory(dev, &free, &total);
+                printf("  %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
+            }
            exit(0);
        }
    ));
@@ -1604,6 +1791,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.hf_repo = value;
        }
    ).set_env("LLAMA_ARG_HF_REPO"));
+    add_opt(common_arg(
+        {"-hfd", "-hfrd", "--hf-repo-draft"}, "<user>/<model>[:quant]",
+        "Same as --hf-repo, but for the draft model (default: unused)",
+        [](common_params & params, const std::string & value) {
+            params.speculative.hf_repo = value;
+        }
+    ).set_env("LLAMA_ARG_HFD_REPO"));
    add_opt(common_arg(
        {"-hff", "--hf-file"}, "FILE",
        "Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)",
@@ -1673,16 +1867,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_examples({LLAMA_EXAMPLE_PASSKEY}));
    add_opt(common_arg(
        {"-o", "--output", "--output-file"}, "FNAME",
-        string_format("output file (default: '%s')",
-            ex == LLAMA_EXAMPLE_EXPORT_LORA
-                ? params.lora_outfile.c_str()
-                : ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR
-                    ? params.cvector_outfile.c_str()
-                    : params.out_file.c_str()),
+        string_format("output file (default: '%s')", params.out_file.c_str()),
        [](common_params & params, const std::string & value) {
            params.out_file = value;
-            params.cvector_outfile = value;
-            params.lora_outfile = value;
        }
    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA}));
    add_opt(common_arg(
@@ -1913,24 +2100,55 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            }
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"--jinja"},
+        "use jinja template for chat (default: disabled)",
+        [](common_params & params) {
+            params.use_jinja = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
+    add_opt(common_arg(
+        {"--reasoning-format"}, "FORMAT",
+        "reasoning format (default: deepseek; allowed values: deepseek, none)\n"
+        "controls whether thought tags are extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only).\n"
+        "only supported for non-streamed responses",
+        [](common_params & params, const std::string & value) {
+            /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
+            else if (value == "none") {     params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
+            else { std::invalid_argument("invalid value"); }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
    add_opt(common_arg(
        {"--chat-template"}, "JINJA_TEMPLATE",
        string_format(
            "set custom jinja chat template (default: template taken from model's metadata)\n"
            "if suffix/prefix are specified, template will be disabled\n"
+            "only commonly used templates are accepted (unless --jinja is set before this flag):\n"
            "list of built-in templates:\n%s", list_builtin_chat_templates().c_str()
        ),
        [](common_params & params, const std::string & value) {
-            if (!common_chat_verify_template(value)) {
-                throw std::runtime_error(string_format(
-                    "error: the supplied chat template is not supported: %s\n"
-                    "note: llama.cpp does not use jinja parser, we only support commonly used templates\n",
-                    value.c_str()
-                ));
-            }
            params.chat_template = value;
        }
    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
+    add_opt(common_arg(
+        {"--chat-template-file"}, "JINJA_TEMPLATE_FILE",
+        string_format(
+            "set custom jinja chat template file (default: template taken from model's metadata)\n"
+            "if suffix/prefix are specified, template will be disabled\n"
+            "only commonly used templates are accepted (unless --jinja is set before this flag):\n"
+            "list of built-in templates:\n%s", list_builtin_chat_templates().c_str()
+        ),
+        [](common_params & params, const std::string & value) {
+            std::ifstream file(value);
+            if (!file) {
+                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
+            }
+            std::copy(
+                std::istreambuf_iterator<char>(file),
+                std::istreambuf_iterator<char>(),
+                std::back_inserter(params.chat_template));
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
    add_opt(common_arg(
        {"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
        string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
@@ -2037,7 +2255,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_env("LLAMA_LOG_VERBOSITY"));
    add_opt(common_arg(
        {"--log-prefix"},
-        "Enable prefx in log messages",
+        "Enable prefix in log messages",
        [](common_params &) {
            common_log_set_prefix(common_log_main(), true);
        }
@@ -2229,6 +2447,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.vocoder.model = value;
        }
    ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
+     add_opt(common_arg(
+        {"--tts-use-guide-tokens"},
+        "Use guide tokens to improve TTS word recall",
+        [](common_params & params) {
+            params.vocoder.use_guide_tokens = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"--tts-speaker-file"}, "FNAME",
+        "speaker file path for audio generation",
+        [](common_params & params, const std::string & value) {
+            params.vocoder.speaker_file = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_TTS}));

    // model-specific
    add_opt(common_arg(
@@ -2242,5 +2474,133 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_examples({LLAMA_EXAMPLE_TTS}));

+    add_opt(common_arg(
+        {"--embd-bge-small-en-default"},
+        string_format("use default bge-small-en-v1.5 model (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
+            params.hf_file = "bge-small-en-v1.5-q8_0.gguf";
+            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
+            params.embd_normalize = 2;
+            params.n_ctx = 512;
+            params.verbose_prompt = true;
+            params.embedding = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--embd-e5-small-en-default"},
+        string_format("use default e5-small-v2 model (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF";
+            params.hf_file = "e5-small-v2-q8_0.gguf";
+            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
+            params.embd_normalize = 2;
+            params.n_ctx = 512;
+            params.verbose_prompt = true;
+            params.embedding = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--embd-gte-small-default"},
+        string_format("use default gte-small model (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/gte-small-Q8_0-GGUF";
+            params.hf_file = "gte-small-q8_0.gguf";
+            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
+            params.embd_normalize = 2;
+            params.n_ctx = 512;
+            params.verbose_prompt = true;
+            params.embedding = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--fim-qwen-1.5b-default"},
+        string_format("use default Qwen 2.5 Coder 1.5B (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--fim-qwen-3b-default"},
+        string_format("use default Qwen 2.5 Coder 3B (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--fim-qwen-7b-default"},
+        string_format("use default Qwen 2.5 Coder 7B (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--fim-qwen-7b-spec"},
+        string_format("use Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
+            params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
+            params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
+            params.speculative.n_gpu_layers = 99;
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--fim-qwen-14b-spec"},
+        string_format("use Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
+            params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
+            params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
+            params.speculative.n_gpu_layers = 99;
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
    return ctx_arg;
 }
--- a/common/chat.cpp
+++ b/common/chat.cpp
--- a/common/chat.h
+++ b/common/chat.h
@@ -0,0 +1,135 @@
+// Chat support (incl. tool call grammar constraining & output parsing) w/ generic & custom template handlers.
+
+#pragma once
+
+#include "common.h"
+#include <string>
+#include <vector>
+
+struct common_chat_templates;
+
+struct common_chat_tool_call {
+    std::string name;
+    std::string arguments;
+    std::string id;
+};
+
+struct common_chat_msg_content_part {
+    std::string type;
+    std::string text;
+};
+
+struct common_chat_msg {
+    std::string role;
+    std::string content;
+    std::vector<common_chat_msg_content_part> content_parts = {};
+    std::vector<common_chat_tool_call> tool_calls = {};
+    std::string reasoning_content;
+    std::string tool_name;
+    std::string tool_call_id;
+};
+
+struct common_chat_tool {
+    std::string name;
+    std::string description;
+    std::string parameters;
+};
+
+enum common_chat_tool_choice {
+    COMMON_CHAT_TOOL_CHOICE_AUTO,
+    COMMON_CHAT_TOOL_CHOICE_REQUIRED,
+    COMMON_CHAT_TOOL_CHOICE_NONE,
+};
+
+enum common_chat_format {
+    COMMON_CHAT_FORMAT_CONTENT_ONLY,
+    COMMON_CHAT_FORMAT_GENERIC,
+    COMMON_CHAT_FORMAT_MISTRAL_NEMO,
+    COMMON_CHAT_FORMAT_LLAMA_3_X,
+    COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
+    COMMON_CHAT_FORMAT_DEEPSEEK_R1,
+    COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING,
+    COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
+    COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
+    COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
+    COMMON_CHAT_FORMAT_HERMES_2_PRO,
+    COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING,
+    COMMON_CHAT_FORMAT_COMMAND_R7B,
+    COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING,
+
+    COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
+};
+
+struct common_chat_templates_inputs {
+    std::vector<common_chat_msg> messages;
+    std::string grammar;
+    std::string json_schema;
+    bool add_generation_prompt = true;
+    bool use_jinja = true;
+    // Parameters below only supported when use_jinja is true
+    std::vector<common_chat_tool> tools;
+    common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
+    bool parallel_tool_calls = false;
+    bool extract_reasoning     = true;
+};
+
+struct common_chat_params {
+    common_chat_format                  format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
+    std::string                         prompt;
+    std::string                         grammar;
+    bool                                grammar_lazy = false;
+    std::vector<common_grammar_trigger> grammar_triggers;
+    std::vector<std::string>            preserved_tokens;
+    std::vector<std::string>            additional_stops;
+};
+
+// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
+bool common_chat_verify_template(const std::string & tmpl, bool use_jinja);
+
+void common_chat_templates_free(struct common_chat_templates * tmpls);
+
+struct common_chat_templates_deleter { void operator()(common_chat_templates * tmpls) { common_chat_templates_free(tmpls); } };
+
+typedef std::unique_ptr<struct common_chat_templates, common_chat_templates_deleter> common_chat_templates_ptr;
+
+common_chat_templates_ptr common_chat_templates_init(
+                                    const struct llama_model * model,
+                                           const std::string & chat_template_override,
+                                           const std::string & bos_token_override = "",
+                                           const std::string & eos_token_override = "");
+
+bool         common_chat_templates_was_explicit(const struct common_chat_templates * tmpls);
+const char * common_chat_templates_source(const struct common_chat_templates * tmpls, const char * variant = nullptr);
+
+
+struct common_chat_params      common_chat_templates_apply(
+    const struct common_chat_templates * tmpls,
+    const struct common_chat_templates_inputs & inputs);
+
+// Format single message, while taking into account the position of that message in chat history
+std::string common_chat_format_single(
+        const struct common_chat_templates * tmpls,
+        const std::vector<common_chat_msg> & past_msg,
+        const common_chat_msg & new_msg,
+        bool add_ass,
+        bool use_jinja);
+
+// Returns an example of formatted chat
+std::string common_chat_format_example(
+    const struct common_chat_templates * tmpls,
+    bool use_jinja);
+
+std::string               common_chat_format_name(common_chat_format format);
+common_chat_msg           common_chat_parse(      const std::string & input, common_chat_format format);
+
+common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
+
+// Parses a JSON array of messages in OpenAI's chat completion API format.
+// T can be std::string containing JSON or nlohmann::ordered_json
+template <class T> std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const T & messages);
+template <class T> T common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msgs, bool concat_typed_text = false);
+
+// Parses a JSON array of tools in OpenAI's chat completion tool call API format.
+// T can be std::string containing JSON or nlohmann::ordered_json
+template <class T> std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const T & tools);
+template <class T> T common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools);
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -10,7 +10,6 @@
 // Change JSON_ASSERT from assert() to GGML_ASSERT:
 #define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"
-#include "json-schema-to-grammar.h"
 #include "llama.h"

 #include <algorithm>
@@ -483,6 +482,53 @@ void string_replace_all(std::string & s, const std::string & search, const std::
    s = std::move(builder);
 }

+std::string regex_escape(const std::string & s) {
+    static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
+    return std::regex_replace(s, special_chars, "\\$0");
+}
+
+std::string string_join(const std::vector<std::string> & values, const std::string & separator) {
+    std::ostringstream result;
+    for (size_t i = 0; i < values.size(); ++i) {
+        if (i > 0) {
+            result << separator;
+        }
+        result << values[i];
+    }
+    return result.str();
+}
+
+std::vector<std::string> string_split(const std::string & str, const std::string & delimiter) {
+    std::vector<std::string> parts;
+    size_t start = 0;
+    size_t end = str.find(delimiter);
+
+    while (end != std::string::npos) {
+        parts.push_back(str.substr(start, end - start));
+        start = end + delimiter.length();
+        end = str.find(delimiter, start);
+    }
+
+    parts.push_back(str.substr(start));
+
+    return parts;
+}
+
+std::string string_repeat(const std::string & str, size_t n) {
+    if (n == 0) {
+        return "";
+    }
+
+    std::string result;
+    result.reserve(str.length() * n);
+
+    for (size_t i = 0; i < n; ++i) {
+        result += str;
+    }
+
+    return result;
+}
+
 std::string string_from(bool value) {
    return value ? "true" : "false";
 }
@@ -1043,7 +1089,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
    if (params.n_gpu_layers != -1) {
        mparams.n_gpu_layers = params.n_gpu_layers;
    }
-    mparams.rpc_servers     = params.rpc_servers.c_str();
    mparams.main_gpu        = params.main_gpu;
    mparams.split_mode      = params.split_mode;
    mparams.tensor_split    = params.tensor_split;
@@ -1725,95 +1770,6 @@ std::string common_detokenize(const struct llama_vocab * vocab, const std::vecto
    return text;
 }

-//
-// Chat template utils
-//
-
-std::string common_get_builtin_chat_template(const struct llama_model * model) {
-    const char * ptr_tmpl = llama_model_chat_template(model);
-    return ptr_tmpl == nullptr ? "" : ptr_tmpl;
-}
-
-bool common_chat_verify_template(const std::string & tmpl) {
-    llama_chat_message chat[] = {{"user", "test"}};
-    const int res = llama_chat_apply_template(tmpl.c_str(), chat, 1, true, nullptr, 0);
-    return res >= 0;
-}
-
-std::string common_chat_apply_template(const struct llama_model * model,
-        const std::string & tmpl,
-        const std::vector<common_chat_msg> & msgs,
-        bool add_ass) {
-    int alloc_size = 0;
-    bool fallback = false; // indicate if we must fallback to default chatml
-    std::vector<llama_chat_message> chat;
-    for (const auto & msg : msgs) {
-        chat.push_back({msg.role.c_str(), msg.content.c_str()});
-        alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
-    }
-
-    const char * ptr_tmpl = tmpl.empty() ? llama_model_chat_template(model) : tmpl.c_str();
-    std::vector<char> buf(alloc_size);
-
-    // run the first time to get the total output length
-    int32_t res = llama_chat_apply_template(ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
-
-    // error: chat template is not supported
-    if (res < 0) {
-        if (ptr_tmpl != nullptr) {
-            // if the custom "tmpl" is not supported, we throw an error
-            // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
-            throw std::runtime_error("this custom template is not supported");
-        }
-
-        // If the built-in template is not supported, we default to chatml
-        res = llama_chat_apply_template("chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
-        fallback = true;
-    }
-
-    // if it turns out that our buffer is too small, we resize it
-    if ((size_t) res > buf.size()) {
-        buf.resize(res);
-        res = llama_chat_apply_template(
-            fallback ? "chatml" : ptr_tmpl,
-            chat.data(), chat.size(), add_ass, buf.data(), buf.size());
-    }
-
-    std::string formatted_chat(buf.data(), res);
-    return formatted_chat;
-}
-
-std::string common_chat_format_single(const struct llama_model * model,
-        const std::string & tmpl,
-        const std::vector<common_chat_msg> & past_msg,
-        const common_chat_msg & new_msg,
-        bool add_ass) {
-    std::ostringstream ss;
-    auto fmt_past_msg = past_msg.empty() ? "" : common_chat_apply_template(model, tmpl, past_msg, false);
-    std::vector<common_chat_msg> chat_new(past_msg);
-    // if the past_msg ends with a newline, we must preserve it in the formatted version
-    if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
-        ss << "\n";
-    };
-    // format chat with new_msg
-    chat_new.push_back(new_msg);
-    auto fmt_new_msg = common_chat_apply_template(model, tmpl, chat_new, add_ass);
-    // get the diff part
-    ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
-    return ss.str();
-}
-
-std::string common_chat_format_example(const struct llama_model * model,
-        const std::string & tmpl) {
-    std::vector<common_chat_msg> msgs = {
-        {"system",    "You are a helpful assistant"},
-        {"user",      "Hello"},
-        {"assistant", "Hi there"},
-        {"user",      "How are you?"},
-    };
-    return common_chat_apply_template(model, tmpl, msgs, true);
-}
-
 //
 // KV cache utils
 //
@@ -2074,3 +2030,25 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
    return result;
 }

+template <>
+json common_grammar_trigger::to_json() const {
+    json out {
+        {"type", (int) type},
+        {"value", value},
+    };
+    if (type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
+        out["token"] = (int) token;
+    }
+    return out;
+}
+
+template <>
+common_grammar_trigger common_grammar_trigger::from_json(const json & in) {
+    common_grammar_trigger out;
+    out.type = (common_grammar_trigger_type) in.at("type").get<int>();
+    out.value = in.at("value").get<std::string>();
+    if (out.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
+        out.token = (llama_token) in.at("token").get<int>();
+    }
+    return out;
+}
--- a/common/common.h
+++ b/common/common.h
@@ -4,6 +4,7 @@

 #include "llama-cpp.h"

+#include <set>
 #include <string>
 #include <vector>
 #include <sstream>
@@ -109,6 +110,23 @@ enum common_conversation_mode {
    COMMON_CONVERSATION_MODE_AUTO     = 2,
 };

+enum common_grammar_trigger_type {
+    COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN,
+    COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
+    COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
+    COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
+};
+
+struct common_grammar_trigger {
+    common_grammar_trigger_type type;
+    std::string value;
+    llama_token token = LLAMA_TOKEN_NULL;
+
+    // T can only be nlohmann::ordered_json
+    template <class T> T to_json() const;
+    template <class T> static common_grammar_trigger from_json(const T & in);
+};
+
 // sampling parameters
 struct common_params_sampling {
    uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
@@ -134,6 +152,7 @@ struct common_params_sampling {
    int32_t dry_allowed_length = 2;     // tokens extending repetitions beyond this receive penalty
    int32_t dry_penalty_last_n = -1;    // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
    int32_t mirostat           = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    float   top_n_sigma        = -1.00f;// -1.0 = disabled
    float   mirostat_tau       = 5.00f; // target entropy
    float   mirostat_eta       = 0.10f; // learning rate
    bool    ignore_eos         = false;
@@ -154,7 +173,10 @@ struct common_params_sampling {
        COMMON_SAMPLER_TYPE_TEMPERATURE,
    };

-    std::string grammar; // optional BNF-like grammar to constrain sampling
+    std::string                         grammar; // optional BNF-like grammar to constrain sampling
+    bool                                grammar_lazy = false;
+    std::vector<common_grammar_trigger> grammar_triggers; // optional triggers (for lazy grammars)
+    std::set<llama_token>               preserved_tokens;

    std::vector<llama_logit_bias> logit_bias; // logit biases to apply

@@ -167,15 +189,19 @@ struct common_params_speculative {

    int32_t n_ctx        =     0; // draft context size
    int32_t n_max        =    16; // maximum number of tokens to draft during speculative decoding
-    int32_t n_min        =     5; // minimum number of draft tokens to use for speculative decoding
+    int32_t n_min        =     0; // minimum number of draft tokens to use for speculative decoding
    int32_t n_gpu_layers =    -1; // number of layers to store in VRAM for the draft model (-1 - use default)
    float   p_split      =  0.1f; // speculative decoding split probability
-    float   p_min        =  0.9f; // minimum speculative decoding probability (greedy)
+    float   p_min        = 0.75f; // minimum speculative decoding probability (greedy)

    struct cpu_params cpuparams;
    struct cpu_params cpuparams_batch;

-    std::string model = ""; // draft model for speculative decoding                          // NOLINT
+    std::string hf_repo = ""; // HF repo                                                     // NOLINT
+    std::string hf_file = ""; // HF file                                                     // NOLINT
+
+    std::string model = "";     // draft model for speculative decoding                      // NOLINT
+    std::string model_url = ""; // model url to download                                     // NOLINT
 };

 struct common_params_vocoder {
@@ -184,6 +210,15 @@ struct common_params_vocoder {

    std::string model     = ""; // model path                                                // NOLINT
    std::string model_url = ""; // model url to download                                     // NOLINT
+
+    std::string speaker_file = ""; // speaker file path                                      // NOLINT
+
+    bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy            // NOLINT
+};
+
+enum common_reasoning_format {
+    COMMON_REASONING_FORMAT_NONE,
+    COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`
 };

 struct common_params {
@@ -239,6 +274,7 @@ struct common_params {
    std::string hf_repo              = ""; // HF repo                                                       // NOLINT
    std::string hf_file              = ""; // HF file                                                       // NOLINT
    std::string prompt               = "";                                                                  // NOLINT
+    std::string system_prompt        = "";                                                                  // NOLINT
    std::string prompt_file          = ""; // store the external prompt file name                           // NOLINT
    std::string path_prompt_cache    = ""; // path to file for saving/loading prompt eval state             // NOLINT
    std::string input_prefix         = ""; // string to prefix user inputs with                             // NOLINT
@@ -246,7 +282,6 @@ struct common_params {
    std::string lookup_cache_static  = ""; // path of static ngram cache file for lookup decoding           // NOLINT
    std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding          // NOLINT
    std::string logits_file          = ""; // file for saving *all* logits                                  // NOLINT
-    std::string rpc_servers          = ""; // comma separated list of RPC servers                           // NOLINT

    std::vector<std::string> in_files;   // all input files
    std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
@@ -277,6 +312,7 @@ struct common_params {
    bool   kl_divergence    = false; // compute KL divergence

    bool usage             = false; // print usage
+    bool completion        = false; // print source-able completion script
    bool use_color         = false; // use color to distinguish generations and inputs
    bool special           = false; // enable special token output
    bool interactive       = false; // interactive mode
@@ -303,6 +339,8 @@ struct common_params {
    bool warmup            = true;  // warmup run
    bool check_tensors     = false; // validate tensor data

+    bool single_turn       = false; // single turn chat conversation
+
    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V

@@ -329,7 +367,9 @@ struct common_params {
    std::string hostname      = "127.0.0.1";
    std::string public_path   = "";                                                                         // NOLINT
    std::string chat_template = "";                                                                         // NOLINT
+    bool use_jinja = false;                                                                                 // NOLINT
    bool enable_chat_template = true;
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;

    std::vector<std::string> api_keys;

@@ -367,8 +407,6 @@ struct common_params {
    int32_t i_pos  = -1;  // position of the passkey in the junk text

    // imatrix params
-    std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file
-
    int32_t n_out_freq  = 10; // output the imatrix every n_out_freq iterations
    int32_t n_save_freq =  0; // save the imatrix every n_save_freq iterations
    int32_t i_chunk     =  0; // start processing from this chunk
@@ -380,16 +418,16 @@ struct common_params {
    int n_pca_batch = 100;
    int n_pca_iterations = 1000;
    dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
-    std::string cvector_outfile       = "control_vector.gguf";
    std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
    std::string cvector_negative_file = "examples/cvector-generator/negative.txt";

    bool spm_infill = false; // suffix/prefix/middle pattern for infill

-    std::string lora_outfile = "ggml-lora-merged-f16.gguf";
-
    // batched-bench params
    bool batched_bench_output_jsonl = false;
+
+    // common params
+    std::string out_file; // output filename for all example programs
 };

 // call once at the start of a program if it uses libcommon
@@ -408,13 +446,13 @@ bool set_process_priority(enum ggml_sched_priority prio);
 //

 #ifdef __GNUC__
-#ifdef __MINGW32__
-#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+#    if defined(__MINGW32__) && !defined(__clang__)
+#        define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+#    else
+#        define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+#    endif
 #else
-#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
-#endif
-#else
-#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
+#    define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
 #endif

 LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
@@ -423,8 +461,14 @@ std::string string_format(const char * fmt, ...);
 std::string string_strip(const std::string & str);
 std::string string_get_sortable_timestamp();

+std::string string_join(const std::vector<std::string> & values, const std::string & separator);
+std::vector<std::string> string_split(const std::string & str, const std::string & delimiter);
+std::string string_repeat(const std::string & str, size_t n);
+
 void string_replace_all(std::string & s, const std::string & search, const std::string & replace);

+std::string regex_escape(const std::string & s);
+
 template<class T>
 static std::vector<T> string_split(const std::string & str, char delim) {
    static_assert(!std::is_same<T, std::string>::value, "Please use the specialized version for std::string");
@@ -507,12 +551,14 @@ struct llama_model * common_load_model_from_url(
    const std::string & local_path,
    const std::string & hf_token,
    const struct llama_model_params & params);
+
 struct llama_model * common_load_model_from_hf(
    const std::string & repo,
    const std::string & remote_path,
    const std::string & local_path,
    const std::string & hf_token,
    const struct llama_model_params & params);
+
 std::pair<std::string, std::string> common_get_hf_file(
    const std::string & hf_repo_with_tag,
    const std::string & hf_token);
@@ -586,41 +632,6 @@ std::string common_detokenize(
        const std::vector<llama_token> & tokens,
                                  bool   special = true);

-//
-// Chat template utils
-//
-
-// same with llama_chat_message, but uses std::string
-struct common_chat_msg {
-    std::string role;
-    std::string content;
-};
-
-// Get the built-in chat template for the model. Return empty string if not present.
-std::string common_get_builtin_chat_template(const struct llama_model * model);
-
-// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
-bool common_chat_verify_template(const std::string & tmpl);
-
-// CPP wrapper for llama_chat_apply_template
-// If the built-in template is not supported, we default to chatml
-// If the custom "tmpl" is not supported, we throw an error
-std::string common_chat_apply_template(const struct llama_model * model,
-        const std::string & tmpl,
-        const std::vector<common_chat_msg> & chat,
-        bool add_ass);
-
-// Format single message, while taking into account the position of that message in chat history
-std::string common_chat_format_single(const struct llama_model * model,
-        const std::string & tmpl,
-        const std::vector<common_chat_msg> & past_msg,
-        const common_chat_msg & new_msg,
-        bool add_ass);
-
-// Returns an example of formatted chat
-std::string common_chat_format_example(const struct llama_model * model,
-        const std::string & tmpl);
-
 //
 // KV cache utils
 //
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -1,4 +1,6 @@
 #include "json-schema-to-grammar.h"
+#include "common.h"
+
 #include <algorithm>
 #include <fstream>
 #include <map>
@@ -11,11 +13,6 @@

 using json = nlohmann::ordered_json;

-template <typename Iterator>
-static std::string join(Iterator begin, Iterator end, const std::string & separator);
-
-static std::string repeat(const std::string & str, size_t n);
-
 static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") {
    auto has_max = max_items != std::numeric_limits<int>::max();

@@ -128,8 +125,8 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
                if (sub_len > 0) {
                    auto from_sub = from.substr(i + 1);
                    auto to_sub = to.substr(i + 1);
-                    auto sub_zeros = repeat("0", sub_len);
-                    auto sub_nines = repeat("9", sub_len);
+                    auto sub_zeros = string_repeat("0", sub_len);
+                    auto sub_nines = string_repeat("9", sub_len);

                    auto to_reached = false;
                    out << "(";
@@ -188,8 +185,8 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
        auto max_digits = max_s.length();

        for (auto digits = min_digits; digits < max_digits; digits++) {
-            uniform_range(min_s, repeat("9", digits));
-            min_s = "1" + repeat("0", digits);
+            uniform_range(min_s, string_repeat("9", digits));
+            min_s = "1" + string_repeat("0", digits);
            out << " | ";
        }
        uniform_range(min_s, max_s);
@@ -267,7 +264,7 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
    throw std::runtime_error("At least one of min_value or max_value must be set");
 }

-const std::string SPACE_RULE = "| \" \" | \"\\n\" [ \\t]{0,20}";
+const std::string SPACE_RULE = "| \" \" | \"\\n\"{1,2} [ \\t]{0,20}";

 struct BuiltinRule {
    std::string content;
@@ -318,49 +315,6 @@ std::unordered_map<char, std::string> GRAMMAR_LITERAL_ESCAPES = {
 std::unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
 std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'^', '$', '.', '[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};

-template <typename Iterator>
-std::string join(Iterator begin, Iterator end, const std::string & separator) {
-    std::ostringstream result;
-    if (begin != end) {
-        result << *begin;
-        for (Iterator it = begin + 1; it != end; ++it) {
-            result << separator << *it;
-        }
-    }
-    return result.str();
-}
-
-static std::vector<std::string> split(const std::string & str, const std::string & delimiter) {
-    std::vector<std::string> tokens;
-    size_t start = 0;
-    size_t end = str.find(delimiter);
-
-    while (end != std::string::npos) {
-        tokens.push_back(str.substr(start, end - start));
-        start = end + delimiter.length();
-        end = str.find(delimiter, start);
-    }
-
-    tokens.push_back(str.substr(start));
-
-    return tokens;
-}
-
-static std::string repeat(const std::string & str, size_t n) {
-    if (n == 0) {
-        return "";
-    }
-
-    std::string result;
-    result.reserve(str.length() * n);
-
-    for (size_t i = 0; i < n; ++i) {
-        result += str;
-    }
-
-    return result;
-}
-
 static std::string replacePattern(const std::string & input, const std::regex & regex, const std::function<std::string(const std::smatch  &)> & replacement) {
    std::smatch match;
    std::string result;
@@ -389,6 +343,7 @@ static std::string format_literal(const std::string & literal) {

 class SchemaConverter {
 private:
+    friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
    std::function<json(const std::string &)> _fetch_json;
    bool _dotall;
    std::map<std::string, std::string> _rules;
@@ -418,7 +373,7 @@ private:
        for (size_t i = 0; i < alt_schemas.size(); i++) {
            rules.push_back(visit(alt_schemas[i], name + (name.empty() ? "alternative-" : "-") + std::to_string(i)));
        }
-        return join(rules.begin(), rules.end(), " | ");
+        return string_join(rules, " | ");
    }

    std::string _visit_pattern(const std::string & pattern, const std::string & name) {
@@ -481,7 +436,7 @@ private:
                for (const auto & item : ret) {
                    results.push_back(to_rule(item));
                }
-                return std::make_pair(join(results.begin(), results.end(), " "), false);
+                return std::make_pair(string_join(results, " "), false);
            };

            while (i < length) {
@@ -539,7 +494,7 @@ private:
                    }
                    curly_brackets += '}';
                    i++;
-                    auto nums = split(curly_brackets.substr(1, curly_brackets.length() - 2), ",");
+                    auto nums = string_split(curly_brackets.substr(1, curly_brackets.length() - 2), ",");
                    int min_times = 0;
                    int max_times = std::numeric_limits<int>::max();
                    try {
@@ -854,7 +809,7 @@ public:
                            return;
                        }
                        std::string pointer = ref.substr(ref.find('#') + 1);
-                        std::vector<std::string> tokens = split(pointer, "/");
+                        std::vector<std::string> tokens = string_split(pointer, "/");
                        for (size_t i = 1; i < tokens.size(); ++i) {
                            std::string sel = tokens[i];
                            if (target.is_null() || !target.contains(sel)) {
@@ -905,7 +860,7 @@ public:
            for (const auto & v : schema["enum"]) {
                enum_values.push_back(_generate_constant_rule(v));
            }
-            return _add_rule(rule_name, "(" + join(enum_values.begin(), enum_values.end(), " | ") + ") space");
+            return _add_rule(rule_name, "(" + string_join(enum_values, " | ") + ") space");
        } else if ((schema_type.is_null() || schema_type == "object")
                && (schema.contains("properties") ||
                    (schema.contains("additionalProperties") && schema["additionalProperties"] != true))) {
@@ -1019,10 +974,10 @@ public:

    void check_errors() {
        if (!_errors.empty()) {
-            throw std::runtime_error("JSON schema conversion failed:\n" + join(_errors.begin(), _errors.end(), "\n"));
+            throw std::runtime_error("JSON schema conversion failed:\n" + string_join(_errors, "\n"));
        }
        if (!_warnings.empty()) {
-            fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n", join(_warnings.begin(), _warnings.end(), "; ").c_str());
+            fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n", string_join(_warnings, "; ").c_str());
        }
    }

@@ -1035,11 +990,35 @@ public:
    }
 };

-std::string json_schema_to_grammar(const json & schema) {
-    SchemaConverter converter([](const std::string &) { return json::object(); }, /* dotall= */ false);
-    auto copy = schema;
-    converter.resolve_refs(copy, "input");
-    converter.visit(copy, "");
+std::string json_schema_to_grammar(const json & schema, bool force_gbnf) {
+#ifdef LLAMA_USE_LLGUIDANCE
+    if (!force_gbnf) {
+        return "%llguidance {}\nstart: %json " + schema.dump();
+    }
+#else
+    (void)force_gbnf;
+#endif // LLAMA_USE_LLGUIDANCE
+    return build_grammar([&](const common_grammar_builder & callbacks) {
+        auto copy = schema;
+        callbacks.resolve_refs(copy);
+        callbacks.add_schema("", copy);
+    });
+}
+
+std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options) {
+    SchemaConverter converter([&](const std::string &) { return json(); }, options.dotall);
+    common_grammar_builder builder {
+        /* .add_rule = */ [&](const std::string & name, const std::string & rule) {
+            return converter._add_rule(name, rule);
+        },
+        /* .add_schema = */ [&](const std::string & name, const nlohmann::ordered_json & schema) {
+            return converter.visit(schema, name == "root" ? "" : name);
+        },
+        /* .resolve_refs = */ [&](nlohmann::ordered_json & schema) {
+            converter.resolve_refs(schema, "");
+        }
+    };
+    cb(builder);
    converter.check_errors();
    return converter.format_grammar();
 }
--- a/common/json-schema-to-grammar.h
+++ b/common/json-schema-to-grammar.h
@@ -5,4 +5,17 @@
 #define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"

-std::string json_schema_to_grammar(const nlohmann::ordered_json& schema);
+std::string json_schema_to_grammar(const nlohmann::ordered_json & schema,
+                                   bool force_gbnf = false);
+
+struct common_grammar_builder {
+    std::function<std::string(const std::string &, const std::string &)> add_rule;
+    std::function<std::string(const std::string &, const nlohmann::ordered_json &)> add_schema;
+    std::function<void(nlohmann::ordered_json &)> resolve_refs;
+};
+
+struct common_grammar_options {
+    bool dotall = false;
+};
+
+std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options = {});
--- a/common/llguidance.cpp
+++ b/common/llguidance.cpp
@@ -0,0 +1,270 @@
+#include "sampling.h"
+#include "log.h"
+
+#ifdef LLAMA_USE_LLGUIDANCE
+
+#    include "llguidance.h"
+#    include <cmath>
+
+struct llama_sampler_llg {
+    const llama_vocab * vocab;
+    std::string         grammar_kind;
+    std::string         grammar_data;
+    LlgTokenizer *      tokenizer;
+    LlgConstraint *     grammar;
+    LlgMaskResult       llg_res;
+    bool                has_llg_res;
+};
+
+static LlgConstraint * llama_sampler_llg_new(LlgTokenizer * tokenizer, const char * grammar_kind,
+                                             const char * grammar_data) {
+    LlgConstraintInit cinit;
+    llg_constraint_init_set_defaults(&cinit, tokenizer);
+    const char * log_level = getenv("LLGUIDANCE_LOG_LEVEL");
+    if (log_level && *log_level) {
+        cinit.log_stderr_level = atoi(log_level);
+    }
+    auto c = llg_new_constraint_any(&cinit, grammar_kind, grammar_data);
+    if (llg_get_error(c)) {
+        LOG_ERR("llg error: %s\n", llg_get_error(c));
+        llg_free_constraint(c);
+        return nullptr;
+    }
+    return c;
+}
+
+static const char * llama_sampler_llg_name(const llama_sampler * /*smpl*/) {
+    return "llguidance";
+}
+
+static void llama_sampler_llg_accept_impl(llama_sampler * smpl, llama_token token) {
+    auto * ctx = (llama_sampler_llg *) smpl->ctx;
+    if (ctx->grammar) {
+        LlgCommitResult res;
+        llg_commit_token(ctx->grammar, token, &res);
+        ctx->has_llg_res = false;
+    }
+}
+
+static void llama_sampler_llg_apply(llama_sampler * smpl, llama_token_data_array * cur_p) {
+    auto * ctx = (llama_sampler_llg *) smpl->ctx;
+    if (ctx->grammar) {
+        if (!ctx->has_llg_res) {
+            if (llg_compute_mask(ctx->grammar, &ctx->llg_res) == 0) {
+                ctx->has_llg_res = true;
+            } else {
+                LOG_ERR("llg error: %s\n", llg_get_error(ctx->grammar));
+                llg_free_constraint(ctx->grammar);
+                ctx->grammar = nullptr;
+            }
+        }
+        if (ctx->has_llg_res) {
+            if (ctx->llg_res.is_stop) {
+                for (size_t i = 0; i < cur_p->size; ++i) {
+                    if (!llama_vocab_is_eog(ctx->vocab, cur_p->data[i].id)) {
+                        cur_p->data[i].logit = -INFINITY;
+                    }
+                }
+            } else {
+                const uint32_t * mask = ctx->llg_res.sample_mask;
+                for (size_t i = 0; i < cur_p->size; ++i) {
+                    auto token = cur_p->data[i].id;
+                    if ((mask[token / 32] & (1 << (token % 32))) == 0) {
+                        cur_p->data[i].logit = -INFINITY;
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void llama_sampler_llg_reset(llama_sampler * smpl) {
+    auto * ctx = (llama_sampler_llg *) smpl->ctx;
+    if (!ctx->grammar) {
+        return;
+    }
+
+    auto * grammar_new = llama_sampler_llg_new(ctx->tokenizer, ctx->grammar_kind.c_str(), ctx->grammar_data.c_str());
+    llg_free_constraint(ctx->grammar);
+    ctx->grammar     = grammar_new;
+    ctx->has_llg_res = false;
+}
+
+static llama_sampler * llama_sampler_llg_clone(const llama_sampler * smpl) {
+    const auto * ctx = (const llama_sampler_llg *) smpl->ctx;
+
+    auto * result = llama_sampler_init_llg(ctx->vocab, nullptr, nullptr);
+
+    // copy the state
+    {
+        auto * result_ctx = (llama_sampler_llg *) result->ctx;
+
+        if (ctx->grammar) {
+            result_ctx->grammar_kind = ctx->grammar_kind;
+            result_ctx->grammar_data = ctx->grammar_data;
+            result_ctx->grammar      = llg_clone_constraint(ctx->grammar);
+            result_ctx->tokenizer    = llg_clone_tokenizer(ctx->tokenizer);
+        }
+    }
+
+    return result;
+}
+
+static void llama_sampler_llg_free(llama_sampler * smpl) {
+    const auto * ctx = (llama_sampler_llg *) smpl->ctx;
+
+    if (ctx->grammar) {
+        llg_free_constraint(ctx->grammar);
+        llg_free_tokenizer(ctx->tokenizer);
+    }
+
+    delete ctx;
+}
+
+static llama_sampler_i llama_sampler_llg_i = {
+    /* .name   = */ llama_sampler_llg_name,
+    /* .accept = */ llama_sampler_llg_accept_impl,
+    /* .apply  = */ llama_sampler_llg_apply,
+    /* .reset  = */ llama_sampler_llg_reset,
+    /* .clone  = */ llama_sampler_llg_clone,
+    /* .free   = */ llama_sampler_llg_free,
+};
+
+static size_t llama_sampler_llg_tokenize_fn(const void * user_data, const uint8_t * bytes, size_t bytes_len,
+                                            uint32_t * output_tokens, size_t output_tokens_len) {
+    const llama_vocab * vocab = (const llama_vocab *) user_data;
+    int                 r     = 0;
+    try {
+        r = llama_tokenize(vocab, (const char *) bytes, bytes_len, (int32_t *) output_tokens, output_tokens_len, false,
+                           true);
+    } catch (const std::exception & e) {
+        GGML_ABORT("llama_tokenize failed: %s\n", e.what());
+    }
+    if (r < 0) {
+        return -r;
+    }
+    return r;
+}
+
+static LlgTokenizer * llama_sampler_llg_new_tokenizer(const llama_vocab * vocab) {
+    // TODO store the tokenizer in the vocab somehow
+    static const llama_vocab * vocab_cache;
+    static LlgTokenizer *      tokenizer_cache;
+
+    if (vocab_cache == vocab) {
+        return llg_clone_tokenizer(tokenizer_cache);
+    }
+
+    auto tok_eos = llama_vocab_eot(vocab);
+    if (tok_eos == LLAMA_TOKEN_NULL) {
+        tok_eos = llama_vocab_eos(vocab);
+    }
+
+    size_t vocab_size = llama_vocab_n_tokens(vocab);
+
+    auto token_lens       = new uint32_t[vocab_size];
+    // we typically have ~7 bytes per token; let's go on the safe side here
+    auto token_bytes_size = vocab_size * 16 + 1024 * 1024;
+    auto token_bytes      = new uint8_t[token_bytes_size];
+
+    size_t offset = 0;
+    for (size_t i = 0; i < vocab_size; i++) {
+        size_t max_token = 1024;
+        if (token_bytes_size - offset < max_token) {
+            GGML_ABORT("token_bytes buffer too small\n");
+        }
+
+        llama_token token = i;
+        auto        dp    = (char *) token_bytes + offset;
+        auto        size  = llama_detokenize(vocab, &token, 1, dp, max_token, false, false);
+        if (size < 0) {
+            GGML_ABORT("llama_detokenize failed\n");
+        }
+        if (size == 0) {
+            size = llama_detokenize(vocab, &token, 1, dp + 1, max_token - 1, false, true);
+            if (size < 0) {
+                GGML_ABORT("llama_detokenize failed\n");
+            }
+            if (size != 0) {
+                *dp = '\xff';  // special token prefix marker
+                size += 1;
+            }
+        }
+
+        token_lens[i] = size;
+        offset += size;
+    }
+
+    LlgTokenizerInit tinit = {
+        /* .vocab_size                         = */ (uint32_t) vocab_size,
+        /* .tok_eos                            = */ (uint32_t) tok_eos,
+        /* .token_lens                         = */ token_lens,
+        /* .token_bytes                        = */ token_bytes,
+        /* .tokenizer_json                     = */ nullptr,
+        /* .tokenize_assumes_string            = */ true,
+        /* .tokenize_fn                        = */ llama_sampler_llg_tokenize_fn,
+        /* .use_approximate_greedy_tokenize_fn = */ false,
+        /* .tokenize_user_data                 = */ vocab,
+    };
+
+    char           error_buffer[1024];
+    LlgTokenizer * tokenizer = llg_new_tokenizer(&tinit, error_buffer, sizeof(error_buffer));
+
+    delete[] token_bytes;
+    delete[] token_lens;
+
+    if (tokenizer == nullptr) {
+        LOG_ERR("llg tokenizer error: %s\n", error_buffer);
+        return tokenizer;
+    }
+
+    if (tokenizer_cache) {
+        llg_free_tokenizer(tokenizer_cache);
+    }
+    vocab_cache     = vocab;
+    tokenizer_cache = tokenizer;
+
+    return llg_clone_tokenizer(tokenizer_cache);
+}
+
+llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab, const char * grammar_kind,
+                                       const char * grammar_data) {
+    auto * ctx = new llama_sampler_llg;
+
+    if (grammar_kind != nullptr && grammar_kind[0] != '\0') {
+        auto tokenizer = llama_sampler_llg_new_tokenizer(vocab);
+        *ctx           = {
+            /* .vocab        = */ vocab,
+            /* .grammar_kind = */ grammar_kind,
+            /* .grammar_data = */ grammar_data,
+            /* .tokenizer    = */ tokenizer,
+            /* .grammar      = */ llama_sampler_llg_new(tokenizer, grammar_kind, grammar_data),
+            /* .llg_res      = */ {},
+            /* .has_llg_res  = */ false,
+        };
+    } else {
+        *ctx = {
+            /* .vocab        = */ vocab,
+            /* .grammar_kind = */ {},
+            /* .grammar_data = */ {},
+            /* .tokenizer    = */ nullptr,
+            /* .grammar      = */ nullptr,
+            /* .llg_res      = */ {},
+            /* .has_llg_res  = */ false,
+        };
+    }
+
+    return llama_sampler_init(
+        /* .iface = */ &llama_sampler_llg_i,
+        /* .ctx   = */ ctx
+    );
+}
+
+#else
+
+llama_sampler * llama_sampler_init_llg(const llama_vocab *, const char *, const char *) {
+    LOG_WRN("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
+    return nullptr;
+}
+
+#endif  // LLAMA_USE_LLGUIDANCE
--- a/common/log.cpp
+++ b/common/log.cpp
@@ -1,5 +1,6 @@
 #include "log.h"

+#include <chrono>
 #include <condition_variable>
 #include <cstdarg>
 #include <cstdio>
@@ -14,16 +15,6 @@ void common_log_set_verbosity_thold(int verbosity) {
    common_log_verbosity_thold = verbosity;
 }

-#define LOG_COL_DEFAULT "\033[0m"
-#define LOG_COL_BOLD    "\033[1m"
-#define LOG_COL_RED     "\033[31m"
-#define LOG_COL_GREEN   "\033[32m"
-#define LOG_COL_YELLOW  "\033[33m"
-#define LOG_COL_BLUE    "\033[34m"
-#define LOG_COL_MAGENTA "\033[35m"
-#define LOG_COL_CYAN    "\033[36m"
-#define LOG_COL_WHITE   "\033[37m"
-
 static int64_t t_us() {
    return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
 }
@@ -206,6 +197,7 @@ public:
                vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args_copy);
            }
 #endif
+            va_end(args_copy);
        }

        entry.level = level;
--- a/common/log.h
+++ b/common/log.h
@@ -2,9 +2,20 @@

 #include "ggml.h" // for ggml_log_level

+#define LOG_CLR_TO_EOL  "\033[K\r"
+#define LOG_COL_DEFAULT "\033[0m"
+#define LOG_COL_BOLD    "\033[1m"
+#define LOG_COL_RED     "\033[31m"
+#define LOG_COL_GREEN   "\033[32m"
+#define LOG_COL_YELLOW  "\033[33m"
+#define LOG_COL_BLUE    "\033[34m"
+#define LOG_COL_MAGENTA "\033[35m"
+#define LOG_COL_CYAN    "\033[36m"
+#define LOG_COL_WHITE   "\033[37m"
+
 #ifndef __GNUC__
 #    define LOG_ATTRIBUTE_FORMAT(...)
-#elif defined(__MINGW32__)
+#elif defined(__MINGW32__) && !defined(__clang__)
 #    define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
 #else
 #    define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
--- a/common/minja/chat-template.hpp
+++ b/common/minja/chat-template.hpp
@@ -0,0 +1,529 @@
+/*
+    Copyright 2024 Google LLC
+
+    Use of this source code is governed by an MIT-style
+    license that can be found in the LICENSE file or at
+    https://opensource.org/licenses/MIT.
+*/
+// SPDX-License-Identifier: MIT
+#pragma once
+
+#include "minja.hpp"
+#include <json.hpp>
+#include <string>
+#include <vector>
+
+using json = nlohmann::ordered_json;
+
+namespace minja {
+
+struct chat_template_caps {
+    bool supports_tools = false;
+    bool supports_tool_calls = false;
+    bool supports_tool_responses = false;
+    bool supports_system_role = false;
+    bool supports_parallel_tool_calls = false;
+    bool supports_tool_call_id = false;
+    // meta-llama/Llama-3.1-8B-Instruct expects arguments to be an object.
+    // Most other templates (and OpenAI's API) expect the arguments object to be stringified.
+    bool requires_object_arguments = false;
+    // CohereForAI/c4ai-command-r-plus simple variant
+    bool requires_non_null_content = false;
+    // MiniMaxAI/MiniMax-Text-01 special
+    bool requires_typed_content = false;
+};
+
+struct chat_template_inputs {
+    nlohmann::ordered_json messages;
+    nlohmann::ordered_json tools;
+    bool add_generation_prompt = true;
+    nlohmann::ordered_json extra_context;
+    std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
+};
+
+struct chat_template_options {
+    bool apply_polyfills = true;
+    bool use_bos_token = true;
+    bool use_eos_token = true;
+    bool define_strftime_now = true;
+
+    bool polyfill_tools = true;
+    bool polyfill_tool_call_examples = true;
+    bool polyfill_tool_calls = true;
+    bool polyfill_tool_responses = true;
+    bool polyfill_system_role = true;
+    bool polyfill_object_arguments = true;
+    bool polyfill_typed_content = true;
+};
+
+class chat_template {
+
+  private:
+    chat_template_caps caps_;
+    std::string source_;
+    std::string bos_token_;
+    std::string eos_token_;
+    std::shared_ptr<minja::TemplateNode> template_root_;
+    std::string tool_call_example_;
+
+    std::string try_raw_render(
+        const nlohmann::ordered_json & messages,
+        const nlohmann::ordered_json & tools,
+        bool add_generation_prompt,
+        const nlohmann::ordered_json & extra_context = nlohmann::ordered_json()) const
+    {
+        try {
+            chat_template_inputs inputs;
+            inputs.messages = messages;
+            inputs.tools = tools;
+            inputs.add_generation_prompt = add_generation_prompt;
+            inputs.extra_context = extra_context;
+            // Use fixed date for tests
+            inputs.now = std::chrono::system_clock::from_time_t(0);
+
+            chat_template_options opts;
+            opts.apply_polyfills = false;
+
+            auto prompt = apply(inputs, opts);
+            // fprintf(stderr, "try_raw_render: %s\n", prompt.c_str());
+            return prompt;
+        } catch (const std::exception & e) {
+            // fprintf(stderr, "try_raw_render error: %s\n", e.what());
+            return "";
+        }
+    }
+
+  public:
+
+    chat_template(const std::string & source, const std::string & bos_token, const std::string & eos_token)
+        : source_(source), bos_token_(bos_token), eos_token_(eos_token)
+    {
+        template_root_ = minja::Parser::parse(source_, {
+            /* .trim_blocks = */ true,
+            /* .lstrip_blocks = */ true,
+            /* .keep_trailing_newline = */ false,
+        });
+
+        auto contains = [](const std::string & haystack, const std::string & needle) {
+            return haystack.find(needle) != std::string::npos;
+        };
+
+        const std::string user_needle = "<User Needle>";
+        const std::string sys_needle = "<System Needle>";
+        const json dummy_str_user_msg = {{"role", "user"}, {"content", user_needle}};
+        const json dummy_typed_user_msg = {{"role", "user"}, {"content", json::array({{{"type", "text"}, {"text", user_needle}}})}};
+
+        caps_.requires_typed_content =
+            !contains(try_raw_render(json::array({dummy_str_user_msg}), {}, false), user_needle)
+            && contains(try_raw_render(json::array({dummy_typed_user_msg}), {}, false), user_needle);
+
+        const auto dummy_user_msg = caps_.requires_typed_content
+            ? dummy_typed_user_msg
+            : dummy_str_user_msg;
+        const json needle_system_msg = {
+            {"role", "system"},
+            {"content", caps_.requires_typed_content ? json::array({{{"type", "text"}, {"text", sys_needle}}}) : json(sys_needle)},
+        };
+
+        caps_.supports_system_role = contains(try_raw_render({needle_system_msg, dummy_user_msg,}, {}, false), sys_needle);
+
+        auto out = try_raw_render(json::array({
+            dummy_user_msg
+        }), json::array({
+            {
+                {"name", "some_tool"},
+                {"type", "function"},
+                {"function", {
+                    {"name", "some_tool"},
+                    {"description", "Some tool."},
+                    {"parameters", {
+                        {"type", "object"},
+                        {"properties", {
+                            {"arg", {
+                                {"type", "string"},
+                                {"description", "Some argument."},
+                            }},
+                        }},
+                        {"required", json::array({ "arg" })},
+                    }},
+                }},
+            },
+        }), false);
+        caps_.supports_tools = contains(out, "some_tool");
+
+        auto make_tool_calls_msg = [&](const json & tool_calls) {
+            return json {
+                {"role", "assistant"},
+                {"content", nullptr},
+                {"tool_calls", tool_calls},
+            };
+        };
+        auto make_tool_call = [](const std::string & tool_name, const json & arguments) {
+            return json {
+                {"id", "call_1___"},
+                {"type", "function"},
+                {"function", {
+                    {"arguments", arguments},
+                    {"name", tool_name},
+                }},
+            };
+        };
+        const json dummy_args_obj {{"argument_needle", "print('Hello, World!')"}};
+
+        // Note: the arguments are rendered in both cases, but may be double-escaped, which we don't want.
+        out = try_raw_render(json::array({
+            dummy_user_msg,
+            make_tool_calls_msg(json::array({make_tool_call("ipython", dummy_args_obj.dump())})),
+        }), {}, false);
+        auto tool_call_renders_str_arguments = contains(out, "\"argument_needle\":") || contains(out, "'argument_needle':");
+        out = try_raw_render(json::array({
+            dummy_user_msg,
+            make_tool_calls_msg(json::array({make_tool_call("ipython", dummy_args_obj)})),
+        }), {}, false);
+        auto tool_call_renders_obj_arguments = contains(out, "\"argument_needle\":") || contains(out, "'argument_needle':");
+
+        caps_.supports_tool_calls = tool_call_renders_str_arguments || tool_call_renders_obj_arguments;
+        caps_.requires_object_arguments = !tool_call_renders_str_arguments && tool_call_renders_obj_arguments;
+        auto out_empty = try_raw_render(json::array({dummy_user_msg, {{"role", "assistant"}, {"content", ""}}}), {}, false);
+        auto out_null = try_raw_render(json::array({dummy_user_msg, {{"role", "assistant"}, {"content", nullptr}}}), {}, false);
+        caps_.requires_non_null_content = contains(out_empty, user_needle) && !contains(out_null, user_needle);
+
+        if (caps_.supports_tool_calls) {
+            auto dummy_args = caps_.requires_object_arguments ? dummy_args_obj : json(dummy_args_obj.dump());
+            auto tc1 = make_tool_call("test_tool1", dummy_args);
+            auto tc2 = make_tool_call("test_tool2", dummy_args);
+            auto out = try_raw_render(json::array({
+                dummy_user_msg,
+                make_tool_calls_msg(json::array({tc1, tc2})),
+            }), {}, false);
+            caps_.supports_parallel_tool_calls = contains(out, "test_tool1") && contains(out, "test_tool2");
+
+            out = try_raw_render(json::array({
+                dummy_user_msg,
+                make_tool_calls_msg(json::array({tc1})),
+                {
+                    {"role", "tool"},
+                    {"name", "test_tool1"},
+                    {"content", "Some response!"},
+                    {"tool_call_id", "call_911_"},
+                }
+            }), {}, false);
+            caps_.supports_tool_responses = contains(out, "Some response!");
+            caps_.supports_tool_call_id = contains(out, "call_911_");
+        }
+
+        try {
+            if (!caps_.supports_tools) {
+                const json user_msg {
+                    {"role", "user"},
+                    {"content", "Hey"},
+                };
+                const json args {
+                    {"arg1", "some_value"},
+                };
+                const json tool_call_msg {
+                    {"role", "assistant"},
+                    {"content", nullptr},
+                    {"tool_calls", json::array({
+                        {
+                            // TODO: detect if requires numerical id or fixed length == 6 like Nemo
+                            {"id", "call_1___"},
+                            {"type", "function"},
+                            {"function", {
+                                {"name", "tool_name"},
+                                {"arguments", (caps_.requires_object_arguments ? args : json(minja::Value(args).dump(-1, /* to_json= */ true)))},
+                            }},
+                        },
+                    })},
+                };
+                std::string prefix, full;
+                {
+                    chat_template_inputs inputs;
+                    inputs.messages = json::array({user_msg});
+                    inputs.add_generation_prompt = true;
+                    prefix = apply(inputs);
+                }
+                {
+                    chat_template_inputs inputs;
+                    inputs.messages = json::array({user_msg, tool_call_msg});
+                    inputs.add_generation_prompt = false;
+                    full = apply(inputs);
+                }
+                auto eos_pos_last = full.rfind(eos_token_);
+                if (eos_pos_last == prefix.size() - eos_token_.size() ||
+                      (full[full.size() - 1] == '\n' && (eos_pos_last == full.size() - eos_token_.size() - 1))) {
+                    full = full.substr(0, eos_pos_last);
+                }
+                size_t common_prefix_length = 0;
+                for (size_t i = 0; i < prefix.size() && i < full.size(); ++i) {
+                    if (prefix[i] != full[i]) {
+                        break;
+                    }
+                    if (prefix[i] == '<') {
+                        // DeepSeek R1's template (as of 20250209) adds a trailing <think> if add_generation_prompt,
+                        // but it removes thinking tags for past messages.
+                        // The prefix and full strings diverge at <think> vs. <｜tool▁calls▁begin｜>, we avoid consuming the leading <.
+                        continue;
+                    }
+                    common_prefix_length = i + 1;
+                }
+                auto example = full.substr(common_prefix_length);
+                if (example.find("tool_name") == std::string::npos && example.find("some_value") == std::string::npos) {
+                    fprintf(stderr, "Failed to infer a tool call example (possible template bug)\n");
+                } else {
+                    tool_call_example_ = example;
+                }
+            }
+        } catch (const std::exception & e) {
+            fprintf(stderr, "Failed to generate tool call example: %s\n", e.what());
+        }
+    }
+
+    const std::string & source() const { return source_; }
+    const std::string & bos_token() const { return bos_token_; }
+    const std::string & eos_token() const { return eos_token_; }
+    const chat_template_caps & original_caps() const { return caps_; }
+
+    // Deprecated, please use the form with chat_template_inputs and chat_template_options
+    std::string apply(
+        const nlohmann::ordered_json & messages,
+        const nlohmann::ordered_json & tools,
+        bool add_generation_prompt,
+        const nlohmann::ordered_json & extra_context = nlohmann::ordered_json(),
+        bool apply_polyfills = true)
+    {
+        fprintf(stderr, "[%s] Deprecated!\n", __func__);
+        chat_template_inputs inputs;
+        inputs.messages = messages;
+        inputs.tools = tools;
+        inputs.add_generation_prompt = add_generation_prompt;
+        inputs.extra_context = extra_context;
+        inputs.now = std::chrono::system_clock::now();
+
+        chat_template_options opts;
+        opts.apply_polyfills = apply_polyfills;
+
+        return apply(inputs, opts);
+    }
+
+    std::string apply(
+        const chat_template_inputs & inputs,
+        const chat_template_options & opts = chat_template_options()) const
+    {
+        json actual_messages;
+
+        auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
+        auto has_tool_calls = false;
+        auto has_tool_responses = false;
+        auto has_string_content = false;
+        for (const auto & message : inputs.messages) {
+            if (message.contains("tool_calls") && !message["tool_calls"].is_null()) {
+                has_tool_calls = true;
+            }
+            if (message.contains("role") && message["role"] == "tool") {
+                has_tool_responses = true;
+            }
+            if (message.contains("content") && message["content"].is_string()) {
+                has_string_content = true;
+            }
+        }
+
+        auto polyfill_system_role = opts.polyfill_system_role && !caps_.supports_system_role;
+        auto polyfill_tools = opts.polyfill_tools && has_tools && !caps_.supports_tools;
+        auto polyfill_tool_call_example = polyfill_tools && opts.polyfill_tool_call_examples;
+        auto polyfill_tool_calls = opts.polyfill_tool_calls && has_tool_calls && !caps_.supports_tool_calls;
+        auto polyfill_tool_responses = opts.polyfill_tool_responses && has_tool_responses && !caps_.supports_tool_responses;
+        auto polyfill_object_arguments = opts.polyfill_object_arguments && has_tool_calls && caps_.requires_object_arguments;
+        auto polyfill_typed_content = opts.polyfill_typed_content && has_string_content && caps_.requires_typed_content;
+
+        auto needs_polyfills = opts.apply_polyfills && (false
+            || polyfill_system_role
+            || polyfill_tools
+            || polyfill_tool_calls
+            || polyfill_tool_responses
+            || polyfill_object_arguments
+            || polyfill_typed_content
+        );
+
+        if (needs_polyfills) {
+            actual_messages = json::array();
+
+            auto add_message = [&](const json & msg) {
+                if (polyfill_typed_content && msg.contains("content") && !msg.at("content").is_null() && msg.at("content").is_string()) {
+                    actual_messages.push_back({
+                        {"role", msg.at("role")},
+                        {"content", {{
+                            {"type", "text"},
+                            {"text", msg.at("content")},
+                        }}},
+                    });
+                } else {
+                    actual_messages.push_back(msg);
+                }
+            };
+
+            std::string pending_system;
+            auto flush_sys = [&]() {
+                if (!pending_system.empty()) {
+                    add_message({
+                        {"role", "user"},
+                        {"content", pending_system},
+                    });
+                    pending_system.clear();
+                }
+            };
+
+            json adjusted_messages;
+            if (polyfill_tools) {
+                adjusted_messages = add_system(inputs.messages,
+                    "You can call any of the following tools to satisfy the user's requests: " + minja::Value(inputs.tools).dump(2, /* to_json= */ true) +
+                    (!polyfill_tool_call_example || tool_call_example_.empty() ? "" : "\n\nExample tool call syntax:\n\n" + tool_call_example_ + "\n\n"));
+            } else {
+                adjusted_messages = inputs.messages;
+            }
+
+            for (const auto & message_ : adjusted_messages) {
+                auto message = message_;
+                if (!message.contains("role") || !message.contains("content")) {
+                    throw std::runtime_error("message must have 'role' and 'content' fields: " + message.dump());
+                }
+                std::string role = message.at("role");
+
+                if (message.contains("tool_calls")) {
+                    if (polyfill_object_arguments || polyfill_tool_calls) {
+                        for (auto & tool_call : message.at("tool_calls")) {
+                            if (tool_call["type"] == "function") {
+                                auto & function = tool_call.at("function");
+                                auto & arguments = function.at("arguments");
+                                if (arguments.is_string()) {
+                                    try {
+                                        arguments = json::parse(arguments.get<std::string>());
+                                    } catch (const std::exception & ecvt) {
+                                        fprintf(stderr, "Failed to parse arguments: %s\n", ecvt.what());
+                                    }
+                                }
+                            }
+                        }
+                    }
+                    if (polyfill_tool_calls) {
+                        auto content = message.at("content");
+                        auto tool_calls = json::array();
+                        for (const auto & tool_call : message.at("tool_calls")) {
+                            if (tool_call.at("type") != "function") {
+                                continue;
+                            }
+                            const auto & function = tool_call.at("function");
+                            auto tc = json {
+                                {"name", function.at("name")},
+                                {"arguments", function.at("arguments")},
+                            };
+                            if (tool_call.contains("id")) {
+                                tc["id"] = tool_call["id"];
+                            }
+                            tool_calls.push_back(tc);
+                        }
+                        auto obj = json {
+                            {"tool_calls", tool_calls},
+                        };
+                        if (!content.is_null() && content != "") {
+                            obj["content"] = content;
+                        }
+                        message["content"] = obj.dump(2);
+                        message.erase("tool_calls");
+                    }
+                }
+                if (polyfill_tool_responses && role == "tool") {
+                    message["role"] = "user";
+                    auto obj = json {
+                        {"tool_response", {
+                            {"content", message.at("content")},
+                        }},
+                    };
+                    if (message.contains("name")) {
+                        obj["tool_response"]["name"] = message.at("name");
+                    }
+                    if (message.contains("tool_call_id")) {
+                        obj["tool_response"]["tool_call_id"] = message.at("tool_call_id");
+                    }
+                    message["content"] = obj.dump(2);
+                    message.erase("name");
+                }
+
+                if (!message["content"].is_null() && polyfill_system_role) {
+                    std::string content = message.at("content");
+                    if (role == "system") {
+                        if (!pending_system.empty()) pending_system += "\n";
+                        pending_system += content;
+                        continue;
+                    } else {
+                        if (role == "user") {
+                            if (!pending_system.empty()) {
+                                message["content"] = pending_system + (content.empty() ? "" : "\n" + content);
+                                pending_system.clear();
+                            }
+                        } else {
+                            flush_sys();
+                        }
+                    }
+                }
+                add_message(message);
+            }
+            flush_sys();
+        } else {
+            actual_messages = inputs.messages;
+        }
+
+        auto context = minja::Context::make(json({
+            {"messages", actual_messages},
+            {"add_generation_prompt", inputs.add_generation_prompt},
+        }));
+        context->set("bos_token", opts.use_bos_token ? bos_token_ : "");
+        context->set("eos_token", opts.use_eos_token ? eos_token_ : "");
+        if (opts.define_strftime_now) {
+            auto now = inputs.now;
+            context->set("strftime_now", Value::callable([now](const std::shared_ptr<minja::Context> &, minja::ArgumentsValue & args) {
+                args.expectArgs("strftime_now", {1, 1}, {0, 0});
+                auto format = args.args[0].get<std::string>();
+
+                auto time = std::chrono::system_clock::to_time_t(now);
+                auto local_time = *std::localtime(&time);
+                std::ostringstream ss;
+                ss << std::put_time(&local_time, format.c_str());
+                return ss.str();
+            }));
+        }
+        if (!inputs.tools.is_null()) {
+            context->set("tools", minja::Value(inputs.tools));
+        }
+        if (!inputs.extra_context.is_null()) {
+            for (auto & kv : inputs.extra_context.items()) {
+                context->set(kv.key(), minja::Value(kv.value()));
+            }
+        }
+
+        auto ret = template_root_->render(context);
+        // fprintf(stderr, "actual_messages: %s\n", actual_messages.dump(2).c_str());
+        // fprintf(stderr, "apply: %s\n\n", ret.c_str());
+        return ret;
+    }
+
+    static nlohmann::ordered_json add_system(const nlohmann::ordered_json & messages, const std::string & system_prompt) {
+        json messages_with_system = messages;
+
+        if (messages_with_system.size() > 0 && messages_with_system[0].at("role") == "system") {
+            std::string existing_system = messages_with_system.at(0).at("content");
+            messages_with_system[0] = json {
+                {"role", "system"},
+                {"content", existing_system + "\n\n" + system_prompt},
+            };
+        } else {
+            messages_with_system.insert(messages_with_system.begin(), json {
+                {"role", "system"},
+                {"content", system_prompt},
+            });
+        }
+        return messages_with_system;
+    }
+};
+
+}  // namespace minja
--- a/common/minja/minja.hpp
+++ b/common/minja/minja.hpp
--- a/common/ngram-cache.cpp
+++ b/common/ngram-cache.cpp
@@ -7,6 +7,7 @@
 #include <cstdio>
 #include <fstream>
 #include <thread>
+#include <algorithm>

 void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
                              std::vector<llama_token> & inp, int nnew, bool print_progress) {
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -4,6 +4,7 @@

 #include <cmath>
 #include <unordered_map>
+#include <algorithm>

 // the ring buffer works similarly to std::deque, but with a fixed capacity
 // TODO: deduplicate with llama-impl.h
@@ -134,11 +135,11 @@ std::string common_params_sampling::print() const {
    snprintf(result, sizeof(result),
            "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
            "\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n"
-            "\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, temp = %.3f\n"
+            "\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, top_n_sigma = %.3f, temp = %.3f\n"
            "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
            penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
            dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n,
-            top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, temp,
+            top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, top_n_sigma, temp,
            mirostat, mirostat_eta, mirostat_tau);

    return std::string(result);
@@ -151,9 +152,67 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co

    lparams.no_perf = params.no_perf;

+    struct llama_sampler * grmr;
+    if (params.grammar.compare(0, 11, "%llguidance") == 0) {
+#ifdef LLAMA_USE_LLGUIDANCE
+        grmr = llama_sampler_init_llg(vocab, "lark", params.grammar.c_str());
+#else
+        GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
+#endif // LLAMA_USE_LLGUIDANCE
+    } else {
+        std::vector<std::string> patterns_at_start;
+        std::vector<std::string> patterns_anywhere;
+        std::vector<llama_token> trigger_tokens;
+        for (const auto & trigger : params.grammar_triggers) {
+            switch (trigger.type) {
+                case COMMON_GRAMMAR_TRIGGER_TYPE_WORD:
+                {
+                    const auto & word = trigger.value;
+                    patterns_anywhere.push_back(regex_escape(word));
+                    break;
+                }
+                case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN:
+                case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START:
+                {
+                    const auto & pattern = trigger.value;
+                    (trigger.type == COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START ? patterns_at_start : patterns_anywhere).push_back(pattern);
+                    break;
+                }
+                case COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN:
+                {
+                    const auto token = trigger.token;
+                    trigger_tokens.push_back(token);
+                    break;
+                }
+                default:
+                    GGML_ASSERT(false && "unknown trigger type");
+            }
+        }
+
+        std::vector<std::string> trigger_patterns;
+        if (!patterns_at_start.empty()) {
+            trigger_patterns.push_back("^(" + string_join(patterns_at_start, "|") + ")[\\s\\S]*");
+        }
+        if (!patterns_anywhere.empty()) {
+            trigger_patterns.push_back("^[\\s\\S]*?(" + string_join(patterns_anywhere, "|") + ")[\\s\\S]*");
+        }
+
+        std::vector<const char *> trigger_patterns_c;
+        trigger_patterns_c.reserve(trigger_patterns.size());
+        for (const auto & regex : trigger_patterns) {
+            trigger_patterns_c.push_back(regex.c_str());
+        }
+
+        grmr = params.grammar_lazy
+             ? llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
+                                                        trigger_patterns_c.data(), trigger_patterns_c.size(),
+                                                        trigger_tokens.data(), trigger_tokens.size())
+             :      llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
+    }
+
    auto * result = new common_sampler {
        /* .params = */ params,
-        /* .grmr   = */ llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root"),
+        /* .grmr   = */ grmr,
        /* .chain  = */ llama_sampler_chain_init(lparams),
        /* .prev   = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
        /* .cur    = */ {},
@@ -167,45 +226,51 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                params.logit_bias.data()));

    if (params.mirostat == 0) {
-        for (const auto & cnstr : params.samplers) {
-            switch (cnstr) {
-                case COMMON_SAMPLER_TYPE_DRY:
-                    {
-                        std::vector<const char *> c_breakers;
-                        c_breakers.reserve(params.dry_sequence_breakers.size());
-                        for (const auto & str : params.dry_sequence_breakers) {
-                            c_breakers.push_back(str.c_str());
-                        }
+        if (params.top_n_sigma >= 0) {
+            llama_sampler_chain_add(result->chain, llama_sampler_init_top_k        (params.top_k));
+            llama_sampler_chain_add(result->chain, llama_sampler_init_temp         (params.temp));
+            llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma  (params.top_n_sigma));
+        } else {
+            for (const auto & cnstr : params.samplers) {
+                switch (cnstr) {
+                    case COMMON_SAMPLER_TYPE_DRY:
+                        {
+                            std::vector<const char *> c_breakers;
+                            c_breakers.reserve(params.dry_sequence_breakers.size());
+                            for (const auto & str : params.dry_sequence_breakers) {
+                                c_breakers.push_back(str.c_str());
+                            }

-                        llama_sampler_chain_add(result->chain, llama_sampler_init_dry      (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
-                    }
-                    break;
-                case COMMON_SAMPLER_TYPE_TOP_K:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_k    (params.top_k));
-                    break;
-                case COMMON_SAMPLER_TYPE_TOP_P:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_p    (params.top_p, params.min_keep));
-                    break;
-                case COMMON_SAMPLER_TYPE_MIN_P:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_min_p    (params.min_p, params.min_keep));
-                    break;
-                case COMMON_SAMPLER_TYPE_XTC:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_xtc      (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
-                    break;
-                case COMMON_SAMPLER_TYPE_TYPICAL_P:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_typical  (params.typ_p, params.min_keep));
-                    break;
-                case COMMON_SAMPLER_TYPE_TEMPERATURE:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
-                    break;
-                case COMMON_SAMPLER_TYPE_INFILL:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_infill   (vocab));
-                    break;
-                case COMMON_SAMPLER_TYPE_PENALTIES:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
-                    break;
-                default:
-                    GGML_ASSERT(false && "unknown sampler type");
+                            llama_sampler_chain_add(result->chain, llama_sampler_init_dry      (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
+                        }
+                        break;
+                    case COMMON_SAMPLER_TYPE_TOP_K:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_k    (params.top_k));
+                        break;
+                    case COMMON_SAMPLER_TYPE_TOP_P:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_p    (params.top_p, params.min_keep));
+                        break;
+                    case COMMON_SAMPLER_TYPE_MIN_P:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_min_p    (params.min_p, params.min_keep));
+                        break;
+                    case COMMON_SAMPLER_TYPE_XTC:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_xtc      (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
+                        break;
+                    case COMMON_SAMPLER_TYPE_TYPICAL_P:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_typical  (params.typ_p, params.min_keep));
+                        break;
+                    case COMMON_SAMPLER_TYPE_TEMPERATURE:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
+                        break;
+                    case COMMON_SAMPLER_TYPE_INFILL:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_infill   (vocab));
+                        break;
+                    case COMMON_SAMPLER_TYPE_PENALTIES:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
+                        break;
+                    default:
+                        GGML_ASSERT(false && "unknown sampler type");
+                }
            }
        }
        llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -102,3 +102,6 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr);

 std::vector<enum common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
 std::vector<enum common_sampler_type> common_sampler_types_from_chars(const std::string & chars);
+
+llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab,
+                const char * grammar_kind, const char * grammar_data);
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -5,6 +5,7 @@
 #include "sampling.h"

 #include <cstring>
+#include <algorithm>

 #define SPEC_VOCAB_MAX_SIZE_DIFFERENCE  128
 #define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
@@ -252,11 +253,6 @@ llama_tokens common_speculative_gen_draft(
        // add drafted token for each sequence
        const llama_token id = cur_p->data[0].id;

-        // only collect very high-confidence draft tokens
-        if (cur_p->data[0].p < params.p_min) {
-            break;
-        }
-
        common_sampler_accept(smpl, id, true);

        result.push_back(id);
@@ -265,6 +261,11 @@ llama_tokens common_speculative_gen_draft(
            break;
        }

+        // only collect very high-confidence draft tokens
+        if (cur_p->data[0].p < params.p_min) {
+            break;
+        }
+
        common_batch_add(batch, id, n_past + i + 1, { 0 }, true);

        // evaluate the drafted tokens on the draft model
--- a/common/speculative.h
+++ b/common/speculative.h
@@ -9,7 +9,7 @@ struct common_speculative_params {
    int n_draft = 16;  // max drafted tokens
    int n_reuse = 256;

-    float p_min = 0.9f; // min probabiliy required to accept a token in the draft
+    float p_min = 0.75f; // min probability required to accept a token in the draft
 };

 struct common_speculative * common_speculative_init(struct llama_context * ctx_dft);
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -558,7 +558,7 @@ class Model:

    # NOTE: this function is generated by convert_hf_to_gguf_update.py
    #       do not modify it manually!
-    # ref:  https://github.com/ggerganov/llama.cpp/pull/6920
+    # ref:  https://github.com/ggml-org/llama.cpp/pull/6920
    # Marker: Start get_vocab_base_pre
    def get_vocab_base_pre(self, tokenizer) -> str:
        # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
@@ -648,7 +648,7 @@ class Model:
        if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
            res = "jina-v2-code"
-        if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
+        if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b" or chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516":
            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
            res = "chatglm-bpe"
        if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
@@ -696,6 +696,12 @@ class Model:
        if chkhsh == "877081d19cf6996e2c4ff0e1236341e9b7bde288f5311a56a937f0afbbb3aeb5":
            # ref: https://huggingface.co/deepseek-ai/DeepSeek-V3
            res = "deepseek-v3"
+        if chkhsh == "b3f499bb4255f8ca19fccd664443283318f2fd2414d5e0b040fbdd0cc195d6c5":
+            # ref: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+            res = "deepseek-r1-qwen"
+        if chkhsh == "ccc2ef013c104be7bae2965776d611e1d7a8a2a9c547dd93a682c9a9fc80352e":
+            # ref: https://huggingface.co/Xenova/gpt-4o
+            res = "gpt-4o"

        if res is None:
            logger.warning("\n")
@@ -705,7 +711,7 @@ class Model:
            logger.warning("**          - the model has not been added to convert_hf_to_gguf_update.py yet")
            logger.warning("**          - the pre-tokenization config has changed upstream")
            logger.warning("**          Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
-            logger.warning("** ref:     https://github.com/ggerganov/llama.cpp/pull/6920")
+            logger.warning("** ref:     https://github.com/ggml-org/llama.cpp/pull/6920")
            logger.warning("**")
            logger.warning(f"** chkhsh:  {chkhsh}")
            logger.warning("**************************************************************************************")
@@ -855,6 +861,9 @@ class Model:
                for token_id, token_data in added_tokens_decoder.items():
                    token_id = int(token_id)
                    token: str = token_data["content"]
+                    if token_id >= vocab_size:
+                        logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+                        continue
                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
                        if tokens[token_id] != token.encode("utf-8"):
                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}')
@@ -2509,7 +2518,8 @@ class Phi3MiniModel(Model):
        rms_eps = self.find_hparam(["rms_norm_eps"])
        max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
        orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
-        rope_dims = n_embd // n_head
+        rot_pct = self.hparams.get("partial_rotary_factor", 1.0)
+        rope_dims = int(rot_pct * n_embd) // n_head

        self.gguf_writer.add_context_length(max_pos_embds)
        self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds)
@@ -2533,7 +2543,8 @@ class Phi3MiniModel(Model):
        n_head = self.find_hparam(["num_attention_heads", "n_head"])
        max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
        orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
-        rope_dims = n_embd // n_head
+        rot_pct = self.hparams.get("partial_rotary_factor", 1.0)
+        rope_dims = int(rot_pct * n_embd) // n_head

        # write rope scaling for long context (128k) model
        rope_scaling = self.find_hparam(['rope_scaling'], True)
@@ -2562,7 +2573,7 @@ class Phi3MiniModel(Model):
            raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')

        if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
-            raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
+            raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}. long_factors = {len(long_factors)}, short_factors = {len(short_factors)}.')

        yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
        yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
@@ -2832,7 +2843,7 @@ class InternLM2Model(Model):
        if chat_eos_token_id is not None:
            # For the chat model, we replace the eos with '<|im_end|>'.
            # TODO: this is a hack, should be fixed
-            #       https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048
+            #       https://github.com/ggml-org/llama.cpp/pull/6745#issuecomment-2067687048
            special_vocab.special_token_ids["eos"] = chat_eos_token_id
            logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}"
                           " in chat mode so that the conversation can end normally.")
@@ -2882,6 +2893,66 @@ class InternLM2Model(Model):
            return [(self.map_tensor_name(name), data_torch)]


+@Model.register("InternLM3ForCausalLM")
+class InternLM3Model(Model):
+    model_arch = gguf.MODEL_ARCH.LLAMA
+
+    def set_vocab(self):
+        tokens, scores, toktypes = self._create_vocab_sentencepiece()
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                if "add_prefix_space" in tokenizer_config_json:
+                    self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
+
+                if "added_tokens_decoder" in tokenizer_config_json:
+                    for token_id, token_data in tokenizer_config_json["added_tokens_decoder"].items():
+                        if token_data.get("special"):
+                            token_id = int(token_id)
+                            token = token_data["content"]
+                            special_vocab._set_special_token(token, token_id)
+                            # update eos token
+                            if token == '<|im_end|>' and "eos" in special_vocab.special_token_ids:
+                                special_vocab.special_token_ids["eos"] = token_id
+
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+
+        if "head_dim" in hparams:
+            rope_dim = hparams["head_dim"]
+        else:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+        self.gguf_writer.add_rope_dimension_count(rope_dim)
+
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "linear" or self.hparams["rope_scaling"].get("rope_type") == "linear":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = self.hparams.get("num_key_value_heads")
+        if name.endswith(("q_proj.weight", "q_proj.bias")):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
+        if name.endswith(("k_proj.weight", "k_proj.bias")):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
+        return [(self.map_tensor_name(name), data_torch)]
+
+
@Model.register("BertModel", "BertForMaskedLM", "CamembertModel")
 class BertModel(Model):
    model_arch = gguf.MODEL_ARCH.BERT
@@ -3254,6 +3325,83 @@ class Gemma2Model(Model):
        return [(self.map_tensor_name(name), data_torch)]


+@Model.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration")
+class Gemma3Model(Model):
+    model_arch = gguf.MODEL_ARCH.GEMMA3
+    has_vision: bool = False
+
+    # we need to merge the text_config into the root level of hparams
+    def __init__(self, *args, **kwargs):
+        hparams = Model.load_hparams(kwargs["dir_model"])
+        if "text_config" in hparams:
+            hparams = {**hparams, **hparams["text_config"]}
+            kwargs["hparams"] = hparams
+        super().__init__(*args, **kwargs)
+        if "vision_config" in hparams:
+            logger.info("Has vision encoder, but it will be ignored")
+            self.has_vision = True
+
+    def write(self):
+        super().write()
+        if self.has_vision:
+            logger.info("NOTE: this script only convert the language model to GGUF")
+            logger.info("      for the vision model, please use gemma3_convert_encoder_to_gguf.py")
+
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+
+        self.gguf_writer.add_add_space_prefix(False)
+
+    def set_gguf_parameters(self):
+        hparams = self.hparams
+        block_count = hparams["num_hidden_layers"]
+
+        # some default values are not specified in the hparams
+        self.gguf_writer.add_context_length(hparams.get("max_position_embeddings", 131072))
+        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+        self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 8))
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-6))
+        self.gguf_writer.add_key_length(hparams.get("head_dim", 256))
+        self.gguf_writer.add_value_length(hparams.get("head_dim", 256))
+        self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1_000_000.0)) # for global layers
+        # both attn_logit_softcapping and final_logit_softcapping are removed in Gemma3
+        assert hparams.get("attn_logit_softcapping") is None
+        assert hparams.get("final_logit_softcapping") is None
+        self.gguf_writer.add_sliding_window(hparams["sliding_window"])
+        self.gguf_writer.add_head_count_kv(hparams.get("num_key_value_heads", 4))
+        if hparams.get("rope_scaling") is not None:
+            assert hparams["rope_scaling"]["rope_type"] == "linear"
+            # important: this rope_scaling is only applied for global layers, and not used by 1B model
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+            self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        if name.startswith("language_model."):
+            name = name.replace("language_model.", "")
+        elif name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \
+                or name.startswith("multimodal_projector.") or name.startswith("vision_model."): # this is for old HF model, should be removed later
+            # ignore vision tensors
+            return []
+
+        # remove OOV (out-of-vocabulary) rows in token_embd
+        if "embed_tokens.weight" in name:
+            vocab = self._create_vocab_sentencepiece()
+            tokens = vocab[0]
+            data_torch = data_torch[:len(tokens)]
+
+        # ref code in Gemma3RMSNorm
+        # output = output * (1.0 + self.weight.float())
+        if name.endswith("norm.weight"):
+            data_torch = data_torch + 1
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
@Model.register("Starcoder2ForCausalLM")
 class StarCoder2Model(Model):
    model_arch = gguf.MODEL_ARCH.STARCODER2
@@ -4450,7 +4598,7 @@ class JaisModel(Model):
        self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)


-@Model.register("ChatGLMModel", "ChatGLMForConditionalGeneration")
+@Model.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration")
 class ChatGLMModel(Model):
    model_arch = gguf.MODEL_ARCH.CHATGLM

@@ -4556,47 +4704,15 @@ class ChatGLMModel(Model):

        from transformers import AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
-        vocab_size = hparams["padded_vocab_size"]
+        vocab_size = hparams.get("padded_vocab_size",hparams["vocab_size"])
        assert max(tokenizer.get_vocab().values()) < vocab_size

-        tokpre = self.get_vocab_base_pre(tokenizer)
-
-        merges = []
-        vocab = {}
-        mergeable_ranks = tokenizer.mergeable_ranks
-        for token, rank in mergeable_ranks.items():
-            vocab[ChatGLMModel.token_bytes_to_string(token)] = rank
-            if len(token) == 1:
-                continue
-            merged = ChatGLMModel.bpe(mergeable_ranks, token, max_rank=rank)
-            assert len(merged) >= 2 and len(merged) <= 7
-            merges.append(' '.join(map(ChatGLMModel.token_bytes_to_string, merged)))
-
-        # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
-        added_vocab = tokenizer.get_added_vocab()
-        reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()}
-
-        for i in range(vocab_size):
-            if i not in reverse_vocab:
-                tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.UNUSED)
-            elif reverse_vocab[i] in added_vocab:
-                tokens.append(reverse_vocab[i])
-                if tokenizer.added_tokens_decoder[i].special:
-                    toktypes.append(gguf.TokenType.CONTROL)
-                else:
-                    toktypes.append(gguf.TokenType.USER_DEFINED)
-            else:
-                tokens.append(reverse_vocab[i])
-                toktypes.append(gguf.TokenType.NORMAL)
-
+        tokens, toktypes, tokpre = self.get_vocab_base()
        self.gguf_writer.add_tokenizer_model("gpt2")
        self.gguf_writer.add_tokenizer_pre(tokpre)
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_types(toktypes)
-
-        special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
-        special_vocab.merges = merges
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
        # only add special tokens when they were not already loaded from config.json
        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
@@ -4607,16 +4723,20 @@ class ChatGLMModel(Model):
    def set_gguf_parameters(self):
        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
-        n_head_kv = self.hparams.get("multi_query_group_num", n_head)
+        n_head_kv = self.hparams.get("multi_query_group_num", self.hparams.get("num_key_value_heads", n_head))
        self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
        self.gguf_writer.add_embedding_length(n_embed)
-        self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", 4 * n_embed))
-        self.gguf_writer.add_block_count(self.hparams["num_layers"])
+        self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", self.hparams.get("intermediate_size", 4 * n_embed)))
+        self.gguf_writer.add_block_count(self.hparams.get("num_layers", self.hparams["num_hidden_layers"]))
        self.gguf_writer.add_head_count(n_head)
        self.gguf_writer.add_head_count_kv(n_head_kv)
-        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layernorm_epsilon"])
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon",1e-5))
        self.gguf_writer.add_file_type(self.ftype)
-        self.gguf_writer.add_rope_dimension_count(64)
+        if "attention_dim" in self.hparams:
+            rope_dim = self.hparams["attention_dim"]
+        else:
+            rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
        self.gguf_writer.add_add_bos_token(False)
        rope_freq = 10000
        if "rope_ratio" in self.hparams:
@@ -4626,7 +4746,7 @@ class ChatGLMModel(Model):
    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        del bid  # unused

-        if name.endswith(".rotary_pos_emb.inv_freq"):
+        if name.endswith(".rotary_pos_emb.inv_freq") or name.startswith("model.vision."):
            return []

        name = name.removeprefix("transformer.")
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -8,7 +8,7 @@
 # provide the necessary information to llama.cpp via the GGUF header in order to implement
 # the same pre-tokenizer.
 #
-# ref: https://github.com/ggerganov/llama.cpp/pull/6920
+# ref: https://github.com/ggml-org/llama.cpp/pull/6920
 #
 # Instructions:
 #
@@ -65,49 +65,51 @@ else:

 # TODO: add models here, base models preferred
 models = [
-    {"name": "llama-spm",      "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
-    {"name": "llama-bpe",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
-    {"name": "phi-3",          "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
-    {"name": "deepseek-llm",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
-    {"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
-    {"name": "falcon",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
-    {"name": "bert-bge",       "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
-    {"name": "falcon3",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon3-7B-Base", },
-    {"name": "bert-bge-large", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/BAAI/bge-large-zh-v1.5", },
-    {"name": "mpt",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
-    {"name": "starcoder",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
-    {"name": "gpt-2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
-    {"name": "stablelm2",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", },
-    {"name": "refact",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
-    {"name": "command-r",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
-    {"name": "qwen2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
-    {"name": "olmo",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
-    {"name": "dbrx",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
-    {"name": "jina-v1-en",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-reranker-v1-tiny-en", },
-    {"name": "jina-v2-en",     "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
-    {"name": "jina-v2-es",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
-    {"name": "jina-v2-de",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
-    {"name": "smaug-bpe",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
-    {"name": "poro-chat",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
-    {"name": "jina-v2-code",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
-    {"name": "viking",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B
-    {"name": "gemma",          "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2b", },
-    {"name": "gemma-2",        "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", },
-    {"name": "jais",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", },
-    {"name": "t5",             "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", },
-    {"name": "codeshell",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", },
-    {"name": "tekken",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },
-    {"name": "smollm",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", },
-    {'name': "bloom",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom", },
-    {'name': "gpt3-finnish",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", },
-    {"name": "exaone",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
-    {"name": "phi-2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
-    {"name": "chameleon",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", },
-    {"name": "minerva-7b",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", },
-    {"name": "roberta-bpe",    "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sentence-transformers/stsb-roberta-base"},
-    {"name": "gigachat",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct"},
-    {"name": "megrez",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Infinigence/Megrez-3B-Instruct"},
-    {"name": "deepseek-v3",    "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-V3"},
+    {"name": "llama-spm",        "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
+    {"name": "llama-bpe",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
+    {"name": "phi-3",            "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
+    {"name": "deepseek-llm",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
+    {"name": "deepseek-coder",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
+    {"name": "falcon",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
+    {"name": "bert-bge",         "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
+    {"name": "falcon3",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon3-7B-Base", },
+    {"name": "bert-bge-large",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/BAAI/bge-large-zh-v1.5", },
+    {"name": "mpt",              "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
+    {"name": "starcoder",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
+    {"name": "gpt-2",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
+    {"name": "stablelm2",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", },
+    {"name": "refact",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
+    {"name": "command-r",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
+    {"name": "qwen2",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
+    {"name": "olmo",             "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
+    {"name": "dbrx",             "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
+    {"name": "jina-v1-en",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-reranker-v1-tiny-en", },
+    {"name": "jina-v2-en",       "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
+    {"name": "jina-v2-es",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
+    {"name": "jina-v2-de",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
+    {"name": "smaug-bpe",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
+    {"name": "poro-chat",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
+    {"name": "jina-v2-code",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
+    {"name": "viking",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B
+    {"name": "gemma",            "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2b", },
+    {"name": "gemma-2",          "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", },
+    {"name": "jais",             "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", },
+    {"name": "t5",               "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", },
+    {"name": "codeshell",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", },
+    {"name": "tekken",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },
+    {"name": "smollm",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", },
+    {'name': "bloom",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom", },
+    {'name': "gpt3-finnish",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", },
+    {"name": "exaone",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
+    {"name": "phi-2",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
+    {"name": "chameleon",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", },
+    {"name": "minerva-7b",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", },
+    {"name": "roberta-bpe",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sentence-transformers/stsb-roberta-base"},
+    {"name": "gigachat",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct"},
+    {"name": "megrez",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Infinigence/Megrez-3B-Instruct"},
+    {"name": "deepseek-v3",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-V3"},
+    {"name": "deepseek-r1-qwen", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"},
+    {"name": "gpt-4o",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Xenova/gpt-4o", },
 ]


@@ -130,6 +132,10 @@ def download_model(model):

    files = ["config.json", "tokenizer.json", "tokenizer_config.json"]

+    if name == "gpt-4o":
+        # Xenova/gpt-4o is tokenizer-only, it does not contain config.json
+        files = ["tokenizer.json", "tokenizer_config.json"]
+
    if tokt == TOKENIZER_TYPE.SPM:
        files.append("tokenizer.model")

@@ -245,7 +251,7 @@ src_func = f"""
            logger.warning("**          - the model has not been added to convert_hf_to_gguf_update.py yet")
            logger.warning("**          - the pre-tokenization config has changed upstream")
            logger.warning("**          Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
-            logger.warning("** ref:     https://github.com/ggerganov/llama.cpp/pull/6920")
+            logger.warning("** ref:     https://github.com/ggml-org/llama.cpp/pull/6920")
            logger.warning("**")
            logger.warning(f"** chkhsh:  {{chkhsh}}")
            logger.warning("**************************************************************************************")
--- a/convert_lora_to_gguf.py
+++ b/convert_lora_to_gguf.py
@@ -395,7 +395,7 @@ if __name__ == '__main__':
                        logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
                        if ".embed_tokens.weight" in name or ".lm_head.weight" in name:
                            logger.error("Embeddings is present in the adapter. This can be due to new tokens added during fine tuning")
-                            logger.error("Please refer to https://github.com/ggerganov/llama.cpp/pull/9948")
+                            logger.error("Please refer to https://github.com/ggml-org/llama.cpp/pull/9948")
                        sys.exit(1)

                    if base_name in tensor_map:
@@ -419,7 +419,7 @@ if __name__ == '__main__':
                # some archs may have the same tensor for lm_head and output (tie word embeddings)
                # in this case, adapters targeting lm_head will fail when using llama-export-lora
                # therefore, we ignore them for now
-                # see: https://github.com/ggerganov/llama.cpp/issues/9065
+                # see: https://github.com/ggml-org/llama.cpp/issues/9065
                if name == "lm_head.weight" and len(dest) == 0:
                    raise ValueError("lm_head is present in adapter, but is ignored in base model")
                for dest_name, dest_data in dest:
--- a/docs/android.md
+++ b/docs/android.md
@@ -12,7 +12,7 @@ $ apt update && apt upgrade -y
 $ apt install git cmake
 ```

-Then, follow the [build instructions](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md), specifically for CMake.
+Then, follow the [build instructions](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md), specifically for CMake.

 Once the binaries are built, download your model of choice (e.g., from Hugging Face). It's recommended to place it in the `~/` directory for best performance:

--- a/docs/backend/OPENCL.md
+++ b/docs/backend/OPENCL.md
@@ -0,0 +1,205 @@
+# llama.cpp for OpenCL
+
+- [Background](#background)
+- [OS](#os)
+- [Hardware](#hardware)
+- [DataType Supports](#datatype-supports)
+- [Model Preparation](#model-preparation)
+- [CMake Options](#cmake-options)
+- [Android](#android)
+- [Windows 11 Arm64](#windows-11-arm64)
+- [Known Issue](#known-issues)
+- [TODO](#todo)
+
+## Background
+
+OpenCL (Open Computing Language) is an open, royalty-free standard for cross-platform, parallel programming of diverse accelerators found in supercomputers, cloud servers, personal computers, mobile devices and embedded platforms. OpenCL specifies a programming language (based on C99) for programming these devices and application programming interfaces (APIs) to control the platform and execute programs on the compute devices. Similar to CUDA, OpenCL has been widely used to program GPUs and is supported by most GPU vendors.
+
+### Llama.cpp + OpenCL
+
+The llama.cpp OpenCL backend is designed to enable llama.cpp on **Qualcomm Adreno GPU** firstly via OpenCL. Thanks to the portabilty of OpenCL, the OpenCL backend can also run on certain Intel GPUs although the performance is not optimal.
+
+## OS
+
+| OS      | Status  | Verified                                       |
+|---------|---------|------------------------------------------------|
+| Android | Support | Snapdragon 8 Gen 3, Snapdragon 8 Elite         |
+| Windows | Support | Windows 11 Arm64 with Snapdragon X Elite       |
+| Linux   | Support | Ubuntu 22.04 WSL2 with Intel 12700H            |
+
+## Hardware
+
+### Adreno GPU
+
+**Verified devices**
+
+| Adreno GPU                           | Status  |
+|:------------------------------------:|:-------:|
+| Adreno 750 (Snapdragon 8 Gen 3)      | Support |
+| Adreno 830 (Snapdragon 8 Elite)      | Support |
+| Adreno X85 (Snapdragon X Elite)      | Support |
+
+## DataType Supports
+
+| DataType               | Status                     |
+|:----------------------:|:--------------------------:|
+| Q4_0                   | Support                    |
+| Q6_K                   | Support, but not optimized |
+
+## Model Preparation
+
+You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model prepration.
+
+Currently we support `Q4_0` quantization and have optimize for it. To achieve best performance on Adreno GPU, add `--pure` to `llama-quantize`. For example,
+
+```sh
+./llama-quantize --pure ggml-model-qwen2.5-3b-f16.gguf ggml-model-qwen-3b-Q4_0.gguf Q4_0
+```
+
+Since `Q6_K` is also supported, `Q4_0` quantization without `--pure` will also work. However, the performance will be worse compared to pure `Q4_0` quantization.
+
+## CMake Options
+
+The OpenCL backend has the following CMake options that control the behavior of the backend.
+
+| CMake options                     | Default value  | Description                               |
+|:---------------------------------:|:--------------:|:------------------------------------------|
+| `GGML_OPENCL_EMBED_KERNELS`       | `ON`           | Embed OpenCL kernels into the executable. |
+| `GGML_OPENCL_USE_ADRENO_KERNELS`  | `ON`           | Use kernels optimized for Adreno.         |
+
+## Android
+
+Ubuntu 22.04 is used for targeting Android. Make sure the following tools are accessible from command line,
+
+* Git
+* CMake 3.29
+* Ninja
+* Python3
+
+### I. Setup Environment
+
+1. **Install NDK**
+
+```sh
+cd ~
+wget https://dl.google.com/android/repository/commandlinetools-linux-8512546_latest.zip && \
+unzip commandlinetools-linux-8512546_latest.zip && \
+mkdir -p ~/android-sdk/cmdline-tools && \
+mv cmdline-tools latest && \
+mv latest ~/android-sdk/cmdline-tools/ && \
+rm -rf commandlinetools-linux-8512546_latest.zip
+
+yes | ~/android-sdk/cmdline-tools/latest/bin/sdkmanager "ndk;26.3.11579264"
+```
+
+2. **Install OpenCL Headers and Library**
+
+```sh
+mkdir -p ~/dev/llm
+cd ~/dev/llm
+
+git clone https://github.com/KhronosGroup/OpenCL-Headers && \
+cd OpenCL-Headers && \
+cp -r CL ~/android-sdk/ndk/26.3.11579264/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
+
+cd ~/dev/llm
+
+git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader && \
+cd OpenCL-ICD-Loader && \
+mkdir build_ndk26 && cd build_ndk26 && \
+cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release \
+  -DCMAKE_TOOLCHAIN_FILE=$HOME/android-sdk/ndk/26.3.11579264/build/cmake/android.toolchain.cmake \
+  -DOPENCL_ICD_LOADER_HEADERS_DIR=$HOME/android-sdk/ndk/26.3.11579264/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include \
+  -DANDROID_ABI=arm64-v8a \
+  -DANDROID_PLATFORM=24 \
+  -DANDROID_STL=c++_shared && \
+ninja && \
+cp libOpenCL.so ~/android-sdk/ndk/26.3.11579264/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
+```
+
+### II. Build llama.cpp
+
+```sh
+cd ~/dev/llm
+
+git clone https://github.com/ggml-org/llama.cpp && \
+cd llama.cpp && \
+mkdir build-android && cd build-android
+
+cmake .. -G Ninja \
+  -DCMAKE_TOOLCHAIN_FILE=$HOME/android-sdk/ndk/26.3.11579264/build/cmake/android.toolchain.cmake \
+  -DANDROID_ABI=arm64-v8a \
+  -DANDROID_PLATFORM=android-28 \
+  -DBUILD_SHARED_LIBS=OFF \
+  -DGGML_OPENCL=ON
+
+ninja
+```
+
+## Windows 11 Arm64
+
+A Snapdragon X Elite device with Windows 11 Arm64 is used. Make sure the following tools are accessible from command line,
+
+* Git
+* CMake 3.29
+* Clang 19
+* Ninja
+* Visual Studio 2022
+
+Powershell is used for the following instructions.
+
+### I. Setup Environment
+
+1. **Install OpenCL Headers and Library**
+
+```powershell
+mkdir -p ~/dev/llm
+
+cd ~/dev/llm
+git clone https://github.com/KhronosGroup/OpenCL-Headers && cd OpenCL-Headers
+mkdir build && cd build
+cmake .. -G Ninja `
+  -DBUILD_TESTING=OFF `
+  -DOPENCL_HEADERS_BUILD_TESTING=OFF `
+  -DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
+  -DCMAKE_INSTALL_PREFIX="$HOME/dev/llm/opencl"
+cmake --build . --target install
+
+cd ~/dev/llm
+git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader && cd OpenCL-ICD-Loader
+mkdir build && cd build
+cmake .. -G Ninja `
+  -DCMAKE_BUILD_TYPE=Release `
+  -DCMAKE_PREFIX_PATH="$HOME/dev/llm/opencl" `
+  -DCMAKE_INSTALL_PREFIX="$HOME/dev/llm/opencl"
+cmake --build . --target install
+```
+
+### II. Build llama.cpp
+
+```powershell
+
+mkdir -p ~/dev/llm
+cd ~/dev/llm
+
+git clone https://github.com/ggml-org/llama.cpp && cd llama.cpp
+mkdir build && cd build
+
+cmake .. -G Ninja `
+  -DCMAKE_TOOLCHAIN_FILE="$HOME/dev/llm/llama.cpp/cmake/arm64-windows-llvm.cmake" `
+  -DCMAKE_BUILD_TYPE=Release `
+  -DCMAKE_PREFIX_PATH="$HOME/dev/llm/opencl" `
+  -DBUILD_SHARED_LIBS=OFF `
+  -DGGML_OPENCL=ON
+ninja
+```
+
+## Known Issues
+
+- Qwen2.5 0.5B model produces gibberish output with Adreno kernels.
+
+## TODO
+
+- Fix Qwen2.5 0.5B
+- Optimization for Q6_K
+- Support and optimization for Q4_K
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -36,12 +36,22 @@ The following release is verified with good quality:

 |Commit ID|Tag|Release|Verified  Platform| Update date|
 |-|-|-|-|-|
-|3bcd40b3c593d14261fb2abfabad3c0fb5b9e318|b4040 |[llama-b4040-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b4040/llama-b4040-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1| 2024-11-19|
-|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1||
+|3bcd40b3c593d14261fb2abfabad3c0fb5b9e318|b4040 |[llama-b4040-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b4040/llama-b4040-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1| 2024-11-19|
+|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1||


 ## News

+- 2025.2
+  - Optimize MUL_MAT Q4_0 on Intel GPU for all dGPUs and built-in GPUs since MTL. Increase the performance of LLM (llama-2-7b.Q4_0.gguf) 21%-87% on Intel GPUs (MTL, ARL-H, Arc, Flex, PVC).
+    |GPU|Base tokens/s|Increased tokens/s|Percent|
+    |-|-|-|-|
+    |PVC 1550|39|73|+87%|
+    |Flex 170|39|50|+28%|
+    |Arc770|42|55|+30%|
+    |MTL|13|16|+23%|
+    |ARL-H|14|17|+21%|
+
 - 2024.11
  - Use syclcompat to improve the performance on some platforms. This requires to use oneAPI 2025.0 or newer.

@@ -58,7 +68,7 @@ The following release is verified with good quality:
 - 2024.3
  - Release binary files of Windows.
  - A blog is published: **Run LLM on all Intel GPUs Using llama.cpp**: [intel.com](https://www.intel.com/content/www/us/en/developer/articles/technical/run-llm-on-all-gpus-using-llama-cpp-artical.html) or [medium.com](https://medium.com/@jianyu_neo/run-llm-on-all-intel-gpus-using-llama-cpp-fd2e2dcbd9bd).
-  - New base line is ready: [tag b2437](https://github.com/ggerganov/llama.cpp/tree/b2437).
+  - New base line is ready: [tag b2437](https://github.com/ggml-org/llama.cpp/tree/b2437).
  - Support multiple cards: **--split-mode**: [none|layer]; not support [row], it's on developing.
  - Support to assign main GPU by **--main-gpu**, replace $GGML_SYCL_DEVICE.
  - Support detecting all GPUs with level-zero and same top **Max compute units**.
@@ -97,8 +107,8 @@ SYCL backend supports Intel GPU Family:
 | Intel Data Center Max Series  | Support | Max 1550, 1100                        |
 | Intel Data Center Flex Series | Support | Flex 170                              |
 | Intel Arc Series              | Support | Arc 770, 730M, Arc A750               |
-| Intel built-in Arc GPU        | Support | built-in Arc GPU in Meteor Lake       |
-| Intel iGPU                    | Support | iGPU in 13700k, i5-1250P, i7-1260P, i7-1165G7 |
+| Intel built-in Arc GPU        | Support | built-in Arc GPU in Meteor Lake, Arrow Lake    |
+| Intel iGPU                    | Support | iGPU in 13700k,iGPU in 13400, i5-1250P, i7-1260P, i7-1165G7 |

 *Notes:*

@@ -133,7 +143,7 @@ The docker build option is currently limited to *intel GPU* targets.
 ### Build image
 ```sh
 # Using FP16
-docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" -f .devops/llama-cli-intel.Dockerfile .
+docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" --target light -f .devops/intel.Dockerfile .
 ```

 *Notes*:
@@ -660,8 +670,10 @@ use 1 SYCL GPUs: [0] with Max compute units:512
 | Name              | Value            | Function                                                                                                                  |
 |-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------|
 | GGML_SYCL_DEBUG   | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG                                                                             |
+| GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features based on Intel GPU type, to compare the performance increase |
 | ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |

+
 ## Known Issues

 - `Split-mode:[row]` is not supported.
--- a/docs/build.md
+++ b/docs/build.md
@@ -3,7 +3,7 @@
 **To get the Code:**

 ```bash
-git clone https://github.com/ggerganov/llama.cpp
+git clone https://github.com/ggml-org/llama.cpp
 cd llama.cpp
 ```

@@ -46,7 +46,7 @@ cmake --build build --config Release
  ```

 - Building for Windows (x86, x64 and arm64) with MSVC or clang as compilers:
-    - Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/de/vs/community/). In the installer, select at least the following options (this also automatically installs the required additional tools like CMake,...):
+    - Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/vs/community/). In the installer, select at least the following options (this also automatically installs the required additional tools like CMake,...):
    - Tab Workload: Desktop-development with C++
    - Tab Components (select quickly via search): C++-_CMake_ Tools for Windows, _Git_ for Windows, C++-_Clang_ Compiler for Windows, MS-Build Support for LLVM-Toolset (clang)
    - Please remember to always use a Developer Command Prompt / PowerShell for VS2022 for git, build, test
@@ -125,21 +125,66 @@ For detailed info, please refer to [llama.cpp for SYCL](./backend/SYCL.md).

 ## CUDA

-This provides GPU acceleration using an NVIDIA GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from the [NVIDIA developer site](https://developer.nvidia.com/cuda-downloads).
+This provides GPU acceleration using an NVIDIA GPU. Make sure to have the [CUDA toolkit](https://developer.nvidia.com/cuda-toolkit) installed.

-If you are using Fedora (using Fedora Workstation, or an 'Atomic' variant such as Silverblue), or would like to set up CUDA in a toolbox, please consider our [Fedora CUDA guide](./cuda-fedora.md). Unfortunately, the process is not as simple as one might expect.
+#### Download directly from NVIDIA
+You may find the official downloads here: [NVIDIA developer site](https://developer.nvidia.com/cuda-downloads).

- Using `CMake`:

-  ```bash
-  cmake -B build -DGGML_CUDA=ON
-  cmake --build build --config Release
-  ```
+#### Compile and run inside a Fedora Toolbox Container
+We also have a [guide](./cuda-fedora.md) for setting up CUDA toolkit in a Fedora [toolbox container](https://containertoolbx.org/).

-The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used.
+**Recommended for:**
+
+- ***Particularly*** *convenient* for users of [Atomic Desktops for Fedora](https://fedoraproject.org/atomic-desktops/); such as: [Silverblue](https://fedoraproject.org/atomic-desktops/silverblue/) and [Kinoite](https://fedoraproject.org/atomic-desktops/kinoite/).
+- Toolbox is installed by default: [Fedora Workstation](https://fedoraproject.org/workstation/) or [Fedora KDE Plasma Desktop](https://fedoraproject.org/spins/kde).
+- *Optionally* toolbox packages are available: [Arch Linux](https://archlinux.org/), [Red Hat Enterprise Linux >= 8.5](https://www.redhat.com/en/technologies/linux-platforms/enterprise-linux), or [Ubuntu](https://ubuntu.com/download)
+
+
+### Compilation
+```bash
+cmake -B build -DGGML_CUDA=ON
+cmake --build build --config Release
+```
+
+### Override Compute Capability Specifications
+
+If `nvcc` cannot detect your gpu, you may get compile-warnings such as:
+ ```text
+nvcc warning : Cannot find valid GPU for '-arch=native', default arch is used
+```
+
+To override the `native` GPU detection:
+
+#### 1. Take note of the `Compute Capability` of your NVIDIA devices: ["CUDA: Your GPU Compute > Capability"](https://developer.nvidia.com/cuda-gpus).
+
+```text
+GeForce RTX 4090      8.9
+GeForce RTX 3080 Ti   8.6
+GeForce RTX 3070      8.6
+```
+
+#### 2. Manually list each varying `Compute Capability` in the `CMAKE_CUDA_ARCHITECTURES` list.
+
+```bash
+cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES="86;89"
+```
+
+### Runtime CUDA environmental variables
+
+You may set the [cuda environmental variables](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) at runtime.
+
+```bash
+# Use `CUDA_VISIBLE_DEVICES` to hide the first compute device.
+CUDA_VISIBLE_DEVICES="-0" ./build/bin/llama-server --model /srv/models/llama.gguf
+```
+
+### Unified Memory

 The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted. In Windows this setting is available in the NVIDIA control panel as `System Memory Fallback`.

+### Performance Tuning
+
 The following compilation options are also available to tweak performance:

 | Option                        | Legal values           | Default | Description                                                                                                                                                                                                                                                                             |
@@ -152,21 +197,53 @@ The following compilation options are also available to tweak performance:

 ## MUSA

-This provides GPU acceleration using the MUSA cores of your Moore Threads MTT GPU. Make sure to have the MUSA SDK installed. You can download it from here: [MUSA SDK](https://developer.mthreads.com/sdk/download/musa).
+This provides GPU acceleration using a Moore Threads GPU. Make sure to have the [MUSA SDK](https://developer.mthreads.com/musa/musa-sdk) installed.

- Using `CMake`:
+#### Download directly from Moore Threads

-  ```bash
-  cmake -B build -DGGML_MUSA=ON
+You may find the official downloads here: [Moore Threads developer site](https://developer.mthreads.com/sdk/download/musa).
+
+### Compilation
+
+```bash
+cmake -B build -DGGML_MUSA=ON
+cmake --build build --config Release
+```
+
+#### Override Compute Capability Specifications
+
+By default, all supported compute capabilities are enabled. To customize this behavior, you can specify the `MUSA_ARCHITECTURES` option in the CMake command:
+
+```bash
+cmake -B build -DGGML_MUSA=ON -DMUSA_ARCHITECTURES="21"
+```
+
+This configuration enables only compute capability `2.1` (MTT S80) during compilation, which can help reduce compilation time.
+
+#### Compilation options
+
+Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet.
+
+- For static builds, add `-DBUILD_SHARED_LIBS=OFF` and `-DCMAKE_POSITION_INDEPENDENT_CODE=ON`:
+  ```
+  cmake -B build -DGGML_MUSA=ON \
+    -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
  cmake --build build --config Release
  ```

-The environment variable [`MUSA_VISIBLE_DEVICES`](https://docs.mthreads.com/musa-sdk/musa-sdk-doc-online/programming_guide/Z%E9%99%84%E5%BD%95/) can be used to specify which GPU(s) will be used.
+### Runtime MUSA environmental variables
+
+You may set the [musa environmental variables](https://docs.mthreads.com/musa-sdk/musa-sdk-doc-online/programming_guide/Z%E9%99%84%E5%BD%95/) at runtime.
+
+```bash
+# Use `MUSA_VISIBLE_DEVICES` to hide the first compute device.
+MUSA_VISIBLE_DEVICES="-0" ./build/bin/llama-server --model /srv/models/llama.gguf
+```
+
+### Unified Memory

 The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted.

-Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet.
-
 ## HIP

 This provides GPU acceleration on HIP-supported AMD GPUs.
@@ -182,6 +259,12 @@ You can download it from your Linux distro's package manager or from here: [ROCm
  On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`.
  However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).

+  To enhance flash attention performance on RDNA3+ or CDNA architectures, you can utilize the rocWMMA library by enabling the `-DGGML_HIP_ROCWMMA_FATTN=ON` option. This requires rocWMMA headers to be installed on the build system.
+
+  The rocWMMA library is included by default when installing the ROCm SDK using the `rocm` meta package provided by AMD. Alternatively, if you are not using the meta package, you can install the library using the `rocwmma-dev` or `rocwmma-devel` package, depending on your system's package manager.
+
+  As an alternative, you can manually install the library by cloning it from the official [GitHub repository](https://github.com/ROCm/rocWMMA), checkout the corresponding version tag (e.g. `rocm-6.2.4`) and set `-DCMAKE_CXX_FLAGS="-I<path/to/rocwmma>/library/include/"` in CMake. This also works under Windows despite not officially supported by AMD.
+
  Note that if you get the following error:
  ```
  clang: error: cannot find ROCm device library; provide its path via '--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build without ROCm device library
@@ -286,7 +369,7 @@ You don't need to install Vulkan SDK. It will be installed inside the container.

 ```sh
 # Build the image
-docker build -t llama-cpp-vulkan -f .devops/llama-cli-vulkan.Dockerfile .
+docker build -t llama-cpp-vulkan --target light -f .devops/vulkan.Dockerfile .

 # Then, use it:
 docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
--- a/docs/cuda-fedora.md
+++ b/docs/cuda-fedora.md
@@ -1,17 +1,16 @@
 # Setting Up CUDA on Fedora

 In this guide we setup [Nvidia CUDA](https://docs.nvidia.com/cuda/) in a toolbox container. This guide is applicable for:
+
 - [Fedora Workstation](https://fedoraproject.org/workstation/)
 - [Atomic Desktops for Fedora](https://fedoraproject.org/atomic-desktops/)
 - [Fedora Spins](https://fedoraproject.org/spins)
- [Other Distributions](https://containertoolbx.org/distros/), including `Red Hat Enterprise Linux >= 8.`, `Arch Linux`, and `Ubuntu`.
-
+- [Other Distributions](https://containertoolbx.org/distros/), including `Red Hat Enterprise Linux >= 8.5`, `Arch Linux`, and `Ubuntu`.

 ## Table of Contents

 - [Prerequisites](#prerequisites)
- [Monitoring NVIDIA CUDA Repositories](#monitoring-nvidia-cuda-repositories)
- [Using the Fedora 39 CUDA Repository](#using-the-fedora-39-cuda-repository)
+- [Using the Fedora 41 CUDA Repository](#using-the-fedora-41-cuda-repository)
 - [Creating a Fedora Toolbox Environment](#creating-a-fedora-toolbox-environment)
 - [Installing Essential Development Tools](#installing-essential-development-tools)
 - [Adding the CUDA Repository](#adding-the-cuda-repository)
@@ -29,44 +28,33 @@ In this guide we setup [Nvidia CUDA](https://docs.nvidia.com/cuda/) in a toolbox
 ## Prerequisites

 - **Toolbox Installed on the Host System** `Fedora Silverblue` and `Fedora Workstation` both have toolbox by default, other distributions may need to install the [toolbox package](https://containertoolbx.org/install/).
- **NVIDIA Drivers and Graphics Card installed on Host System (optional)** To run CUDA program, such as `llama.cpp`, the host should be setup to access your NVIDIA hardware. Fedora Hosts can use the [RPM Fusion Repository](https://rpmfusion.org/Howto/NVIDIA).
+- **NVIDIA Drivers and Graphics Card installed on Host System (recommended)** To run CUDA program, such as `llama.cpp`, the host should be setup to access your NVIDIA hardware. Fedora Hosts can use the [RPM Fusion Repository](https://rpmfusion.org/Howto/NVIDIA).
 - **Internet connectivity** to download packages.

-### Monitoring NVIDIA CUDA Repositories
+### Using the Fedora 41 CUDA Repository

-Before proceeding, it is advisable to check if NVIDIA has updated their CUDA repositories for your Fedora version. NVIDIA's repositories can be found at:
+The latest release is 41.

- [Fedora 40 CUDA Repository](https://developer.download.nvidia.com/compute/cuda/repos/fedora40/x86_64/)
 - [Fedora 41 CUDA Repository](https://developer.download.nvidia.com/compute/cuda/repos/fedora41/x86_64/)

-As of the latest update, these repositories do not contain the `cuda` meta-package or are missing essential components.
-
-### Using the Fedora 39 CUDA Repository
-
-Since the newer repositories are incomplete, we'll use the Fedora 39 repository:
-
- [Fedora 39 CUDA Repository](https://developer.download.nvidia.com/compute/cuda/repos/fedora39/x86_64/)
-
-**Note:** Fedora 39 is no longer maintained, so we recommend using a toolbox environment to prevent system conflicts.
+**Note:** We recommend using a toolbox environment to prevent system conflicts.

 ## Creating a Fedora Toolbox Environment

-This guide focuses on Fedora hosts, but with small adjustments, it can work for other hosts. Using a Fedora 39 toolbox allows us to install the necessary packages without affecting the host system.
+This guide focuses on Fedora hosts, but with small adjustments, it can work for other hosts. Using the Fedora Toolbox allows us to install the necessary packages without affecting the host system.

 **Note:** Toolbox is available for other systems, and even without Toolbox, it is possible to use Podman or Docker.

-We do not recommend installing on the host system, as Fedora 39 is out-of-maintenance, and instead you should upgrade to a maintained version of Fedora for your host.
-
-1. **Create a Fedora 39 Toolbox:**
+1. **Create a Fedora 41 Toolbox:**

   ```bash
-   toolbox create --image registry.fedoraproject.org/fedora-toolbox:39 --container fedora-toolbox-39-cuda
+   toolbox create --image registry.fedoraproject.org/fedora-toolbox:41 --container fedora-toolbox-41-cuda
   ```

 2. **Enter the Toolbox:**

   ```bash
-   toolbox enter --container fedora-toolbox-39-cuda
+   toolbox enter --container fedora-toolbox-41-cuda
   ```

   Inside the toolbox, you have root privileges and can install packages without affecting the host system.
@@ -85,7 +73,7 @@ We do not recommend installing on the host system, as Fedora 39 is out-of-mainte
   sudo dnf install vim-default-editor --allowerasing
   ```

-   The `--allowerasing` flag resolves any package conflicts.
+   The `--allowerasing` flag will allow the removal of the conflicting `nano-default-editor` package.

 3. **Install Development Tools and Libraries:**

@@ -100,7 +88,7 @@ We do not recommend installing on the host system, as Fedora 39 is out-of-mainte
 Add the NVIDIA CUDA repository to your DNF configuration:

 ```bash
-sudo dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/fedora39/x86_64/cuda-fedora39.repo
+sudo dnf config-manager addrepo --from-repofile=https://developer.download.nvidia.com/compute/cuda/repos/fedora41/x86_64/cuda-fedora41.repo
 ```

 After adding the repository, synchronize the package manager again:
@@ -109,106 +97,62 @@ After adding the repository, synchronize the package manager again:
 sudo dnf distro-sync
 ```

-## Installing `nvidia-driver-libs`
+## Installing `nvidia-driver-libs` and `nvidia-driver-cuda-libs`

-Attempt to install `nvidia-driver-libs`:
+We need to detect if the host is supplying the [NVIDIA driver libraries into the toolbox](https://github.com/containers/toolbox/blob/main/src/pkg/nvidia/nvidia.go).

 ```bash
-sudo dnf install nvidia-driver-libs
+ls -la /usr/lib64/libcuda.so.1
 ```

 **Explanation:**

- `nvidia-driver-libs` contains necessary NVIDIA driver libraries required by CUDA.
- This step might fail due to conflicts with existing NVIDIA drivers on the host system.
+- `nvidia-driver-libs` and `nvidia-driver-cuda-libs` contains necessary NVIDIA driver libraries required by CUDA,
+  on hosts with NVIDIA drivers installed the Fedora Container will supply the host libraries.

-## Manually Resolving Package Conflicts
+### Install Nvidia Driver Libraries on Guest (if `libcuda.so.1` was NOT found).
+
+```bash
+sudo dnf install nvidia-driver-libs nvidia-driver-cuda-libs
+```
+
+### Manually Updating the RPM database for host-supplied NVIDIA drivers (if `libcuda.so.1` was found).

 If the installation fails due to conflicts, we'll manually download and install the required packages, excluding conflicting files.

-### 1. Download the `nvidia-driver-libs` RPM
+#### 1. Download `nvidia-driver-libs` and `nvidia-driver-cuda-libs` RPM's (with dependencies)

 ```bash
-sudo dnf download --arch x86_64 nvidia-driver-libs
+sudo dnf download --destdir=/tmp/nvidia-driver-libs --resolve --arch x86_64 nvidia-driver-libs nvidia-driver-cuda-libs
 ```

-You should see a file similar to:
-
-```
-nvidia-driver-libs-560.35.05-1.fc39.x86_64.rpm
-```
-
-### 2. Attempt to Install the RPM
+#### 2. Update the RPM database to assume the installation of these packages.

 ```bash
-sudo dnf install nvidia-driver-libs-560.35.05-1.fc39.x86_64.rpm
-```
-
-**Expected Error:**
-
-Installation may fail with errors pointing to conflicts with `egl-gbm` and `egl-wayland`.
-
-**Note: It is important to carefully read the error messages to identify the exact paths that need to be excluded.**
-
-### 3. Download Dependencies
-
-```bash
-sudo dnf download --arch x86_64 egl-gbm egl-wayland
-```
-
-### 4. Install `egl-gbm` with Excluded Paths
-
-Exclude conflicting files during installation:
-
-```bash
-sudo rpm --install --verbose --hash \
-  --excludepath=/usr/lib64/libnvidia-egl-gbm.so.1.1.2 \
-  --excludepath=/usr/share/egl/egl_external_platform.d/15_nvidia_gbm.json \
-  egl-gbm-1.1.2^20240919gitb24587d-3.fc39.x86_64.rpm
-```
-
-**Explanation:**
-
- The `--excludepath` option skips installing files that conflict with existing files.
- Adjust the paths based on the error messages you receive.
-
-### 5. Install `egl-wayland` with Excluded Paths
-
-```bash
-sudo rpm --install --verbose --hash \
-  --excludepath=/usr/share/egl/egl_external_platform.d/10_nvidia_wayland.json \
-  egl-wayland-1.1.17^20241118giteeb29e1-5.fc39.x86_64.rpm
-```
-
-### 6. Install `nvidia-driver-libs` with Excluded Paths
-
-```bash
-sudo rpm --install --verbose --hash \
-  --excludepath=/usr/share/glvnd/egl_vendor.d/10_nvidia.json \
-  --excludepath=/usr/share/nvidia/nvoptix.bin \
-  nvidia-driver-libs-560.35.05-1.fc39.x86_64.rpm
+sudo rpm --install --verbose --hash --justdb /tmp/nvidia-driver-libs/*
 ```

 **Note:**

- Replace the paths with the ones causing conflicts in your installation if they differ.
- The `--verbose` and `--hash` options provide detailed output during installation.
+- The `--justdb` option only updates the RPM database, without touching the filesystem.

-## Finalizing the Installation of `nvidia-driver-libs`
+#### Finalizing the Installation of `nvidia-driver-libs` and `nvidia-driver-cuda-libs`

 After manually installing the dependencies, run:

 ```bash
-sudo dnf install nvidia-driver-libs
+sudo dnf install nvidia-driver-libs nvidia-driver-cuda-libs
 ```

 You should receive a message indicating the package is already installed:

 ```
-Package nvidia-driver-libs-3:560.35.05-1.fc39.x86_64 is already installed.
-Dependencies resolved.
+Updating and loading repositories:
+Repositories loaded.
+Package "nvidia-driver-libs-3:570.86.10-1.fc41.x86_64" is already installed.
+Package "nvidia-driver-cuda-libs-3:570.86.10-1.fc41.x86_64" is already installed.
+
 Nothing to do.
-Complete!
 ```

 ## Installing the CUDA Meta-Package
@@ -233,7 +177,7 @@ To use CUDA, add its binary directory to your system's `PATH`.

   **Explanation:**

-   - We add to  `/etc/profile.d/` as the `/etc/` folder is unique to this particular container, and is not shared with other containers or the host system.
+   - We add to `/etc/profile.d/` as the `/etc/` folder is unique to this particular container, and is not shared with other containers or the host system.
   - The backslash `\` before `$PATH` ensures the variable is correctly written into the script.

 2. **Make the Script Executable:**
@@ -262,26 +206,33 @@ You should see output similar to:

 ```
 nvcc: NVIDIA (R) Cuda compiler driver
-Copyright (c) 2005-2024 NVIDIA Corporation
-Built on Tue_Oct_29_23:50:19_PDT_2024
-Cuda compilation tools, release 12.6, V12.6.85
-Build cuda_12.6.r12.6/compiler.35059454_0
+Copyright (c) 2005-2025 NVIDIA Corporation
+Built on Wed_Jan_15_19:20:09_PST_2025
+Cuda compilation tools, release 12.8, V12.8.61
+Build cuda_12.8.r12.8/compiler.35404655_0
 ```

 This output confirms that the CUDA compiler is accessible and indicates the installed version.

 ## Conclusion

-You have successfully set up CUDA on Fedora within a toolbox environment using the Fedora 39 CUDA repository. By manually resolving package conflicts and configuring the environment, you can develop CUDA applications without affecting your host system.
+You have successfully set up CUDA on Fedora within a toolbox environment using the Fedora 41 CUDA repository. By manually updating the RPM db and configuring the environment, you can develop CUDA applications without affecting your host system.

 ## Troubleshooting

 - **Installation Failures:**
-  - If you encounter errors during installation, carefully read the error messages. They often indicate conflicting files or missing dependencies.
-  - Use the `--excludepath` option with `rpm` to exclude conflicting files during manual installations.

- **Driver Conflicts:**
-  - Since the host system may already have NVIDIA drivers installed, conflicts can arise. Using the toolbox environment helps isolate these issues.
+  - If you encounter errors during installation, carefully read the error messages. They often indicate conflicting files or missing dependencies.
+  - You may use the `--excludepath` option with `rpm` to exclude conflicting files during manual RPM installations.
+
+- **Rebooting the Container:**
+
+  - Sometimes there may be a bug in the NVIDIA driver host passthrough (such as missing a shared library). Rebooting the container may solve this issue:
+
+  ```bash
+  # on the host system
+  podman container restart --all
+  ```

 - **Environment Variables Not Set:**
  - If `nvcc` is not found after installation, ensure that `/usr/local/cuda/bin` is in your `PATH`.
@@ -291,11 +242,13 @@ You have successfully set up CUDA on Fedora within a toolbox environment using t
 ## Additional Notes

 - **Updating CUDA in the Future:**
+
  - Keep an eye on the official NVIDIA repositories for updates to your Fedora version.
  - When an updated repository becomes available, adjust your `dnf` configuration accordingly.

 - **Building `llama.cpp`:**
-  - With CUDA installed, you can follow these [build instructions for `llama.cpp`](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md) to compile it with CUDA support.
+
+  - With CUDA installed, you can follow these [build instructions for `llama.cpp`](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md) to compile it with CUDA support.
  - Ensure that any CUDA-specific build flags or paths are correctly set in your build configuration.

 - **Using the Toolbox Environment:**
--- a/docs/development/HOWTO-add-model.md
+++ b/docs/development/HOWTO-add-model.md
@@ -104,16 +104,16 @@ Note: to debug the inference graph: you can use [llama-eval-callback](/examples/

 ## GGUF specification

-https://github.com/ggerganov/ggml/blob/master/docs/gguf.md
+https://github.com/ggml-org/ggml/blob/master/docs/gguf.md

 ## Resources

- YaRN RoPE scaling https://github.com/ggerganov/llama.cpp/pull/2268
- support Baichuan serial models https://github.com/ggerganov/llama.cpp/pull/3009
- support attention bias https://github.com/ggerganov/llama.cpp/pull/4283
- Mixtral support https://github.com/ggerganov/llama.cpp/pull/4406
- BERT embeddings https://github.com/ggerganov/llama.cpp/pull/5423
- Grok-1 support https://github.com/ggerganov/llama.cpp/pull/6204
- Command R Plus support https://github.com/ggerganov/llama.cpp/pull/6491
- support arch DBRX https://github.com/ggerganov/llama.cpp/pull/6515
- How to convert HuggingFace model to GGUF format https://github.com/ggerganov/llama.cpp/discussions/2948
+- YaRN RoPE scaling https://github.com/ggml-org/llama.cpp/pull/2268
+- support Baichuan serial models https://github.com/ggml-org/llama.cpp/pull/3009
+- support attention bias https://github.com/ggml-org/llama.cpp/pull/4283
+- Mixtral support https://github.com/ggml-org/llama.cpp/pull/4406
+- BERT embeddings https://github.com/ggml-org/llama.cpp/pull/5423
+- Grok-1 support https://github.com/ggml-org/llama.cpp/pull/6204
+- Command R Plus support https://github.com/ggml-org/llama.cpp/pull/6491
+- support arch DBRX https://github.com/ggml-org/llama.cpp/pull/6515
+- How to convert HuggingFace model to GGUF format https://github.com/ggml-org/llama.cpp/discussions/2948
--- a/docs/docker.md
+++ b/docs/docker.md
@@ -7,21 +7,21 @@
 ## Images
 We have three Docker images available for this project:

-1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
-2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
-3. `ghcr.io/ggerganov/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`)
+1. `ghcr.io/ggml-org/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
+2. `ghcr.io/ggml-org/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
+3. `ghcr.io/ggml-org/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`)

 Additionally, there the following images, similar to the above:

- `ghcr.io/ggerganov/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggerganov/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggerganov/llama.cpp:server-cuda`: Same as `server` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggerganov/llama.cpp:full-musa`: Same as `full` but compiled with MUSA support. (platforms: `linux/amd64`)
- `ghcr.io/ggerganov/llama.cpp:light-musa`: Same as `light` but compiled with MUSA support. (platforms: `linux/amd64`)
- `ghcr.io/ggerganov/llama.cpp:server-musa`: Same as `server` but compiled with MUSA support. (platforms: `linux/amd64`)
+- `ghcr.io/ggml-org/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`)
+- `ghcr.io/ggml-org/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`)
+- `ghcr.io/ggml-org/llama.cpp:server-cuda`: Same as `server` but compiled with CUDA support. (platforms: `linux/amd64`)
+- `ghcr.io/ggml-org/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
+- `ghcr.io/ggml-org/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
+- `ghcr.io/ggml-org/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
+- `ghcr.io/ggml-org/llama.cpp:full-musa`: Same as `full` but compiled with MUSA support. (platforms: `linux/amd64`)
+- `ghcr.io/ggml-org/llama.cpp:light-musa`: Same as `light` but compiled with MUSA support. (platforms: `linux/amd64`)
+- `ghcr.io/ggml-org/llama.cpp:server-musa`: Same as `server` but compiled with MUSA support. (platforms: `linux/amd64`)

 The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA, ROCm or MUSA library, you'll need to build the images locally for now).

@@ -32,25 +32,25 @@ The easiest way to download the models, convert them to ggml and optimize them i
 Replace `/path/to/models` below with the actual path where you downloaded the models.

 ```bash
-docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B
+docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --all-in-one "/models/" 7B
 ```

 On completion, you are ready to play!

 ```bash
-docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
+docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
 ```

 or with a light image:

 ```bash
-docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
+docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
 ```

 or with a server image:

 ```bash
-docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggerganov/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512
+docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggml-org/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512
 ```

 ## Docker With CUDA
@@ -60,16 +60,16 @@ Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia
 ## Building Docker locally

 ```bash
-docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile .
-docker build -t local/llama.cpp:light-cuda -f .devops/llama-cli-cuda.Dockerfile .
-docker build -t local/llama.cpp:server-cuda -f .devops/llama-server-cuda.Dockerfile .
+docker build -t local/llama.cpp:full-cuda --target full -f .devops/cuda.Dockerfile .
+docker build -t local/llama.cpp:light-cuda --target light -f .devops/cuda.Dockerfile .
+docker build -t local/llama.cpp:server-cuda --target server -f .devops/cuda.Dockerfile .
 ```

 You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture.

 The defaults are:

- `CUDA_VERSION` set to `12.6.0`
+- `CUDA_VERSION` set to `12.4.0`
 - `CUDA_DOCKER_ARCH` set to the cmake build default, which includes all the supported architectures

 The resulting images, are essentially the same as the non-CUDA images:
@@ -95,16 +95,16 @@ Assuming one has the [mt-container-toolkit](https://developer.mthreads.com/musa/
 ## Building Docker locally

 ```bash
-docker build -t local/llama.cpp:full-musa -f .devops/full-musa.Dockerfile .
-docker build -t local/llama.cpp:light-musa -f .devops/llama-cli-musa.Dockerfile .
-docker build -t local/llama.cpp:server-musa -f .devops/llama-server-musa.Dockerfile .
+docker build -t local/llama.cpp:full-musa --target full -f .devops/musa.Dockerfile .
+docker build -t local/llama.cpp:light-musa --target light -f .devops/musa.Dockerfile .
+docker build -t local/llama.cpp:server-musa --target server -f .devops/musa.Dockerfile .
 ```

 You may want to pass in some different `ARGS`, depending on the MUSA environment supported by your container host, as well as the GPU architecture.

 The defaults are:

- `MUSA_VERSION` set to `rc3.1.0`
+- `MUSA_VERSION` set to `rc3.1.1`

 The resulting images, are essentially the same as the non-MUSA images:

--- a/docs/function-calling.md
+++ b/docs/function-calling.md
@@ -0,0 +1,394 @@
+# Function Calling
+
+[chat.h](../common/chat.h) (https://github.com/ggml-org/llama.cpp/pull/9639) adds support for [OpenAI-style function calling](https://platform.openai.com/docs/guides/function-calling) and is used in:
+- `llama-server` when started w/ `--jinja` flag
+- `llama-cli` (WIP: https://github.com/ggml-org/llama.cpp/pull/11556)
+
+## Universal support w/ Native & Generic handlers
+
+Function calling is supported for all models (see https://github.com/ggml-org/llama.cpp/pull/9639):
+
+- Native tool call formats supported:
+  - Llama 3.1 / 3.3 (including builtin tools support - tool names for `wolfram_alpha`, `web_search` / `brave_search`, `code_interpreter`), Llama 3.2
+  - Functionary v3.1 / v3.2
+  - Hermes 2/3, Qwen 2.5
+  - Qwen 2.5 Coder (WIP: https://github.com/ggml-org/llama.cpp/pull/12034)
+  - Mistral Nemo
+  - Firefunction v2
+  - Command R7B
+  - DeepSeek R1 (WIP / seems reluctant to call any tools?)
+
+- Generic tool call is supported when the template isn't recognized by native format handlers (you'll see `Chat format: Generic` in the logs).
+  - Use `--chat-template-file` to override the template when appropriate (see examples below)
+  - Generic support may consume more tokens and be less efficient than a model's native format.
+
+<details>
+<summary>Show some common templates and which format handler they use</summary>
+
+| Template | Format |
+|----------|--------|
+| Almawave-Velvet-14B.jinja | Hermes 2 Pro |
+| AtlaAI-Selene-1-Mini-Llama-3.1-8B.jinja | Llama 3.x |
+| CohereForAI-aya-expanse-8b.jinja | Generic |
+| CohereForAI-c4ai-command-r-plus-default.jinja | Generic |
+| CohereForAI-c4ai-command-r-plus-rag.jinja | Generic |
+| CohereForAI-c4ai-command-r-plus-tool_use.jinja | Generic |
+| CohereForAI-c4ai-command-r7b-12-2024-default.jinja | Command R7B (extract reasoning) |
+| CohereForAI-c4ai-command-r7b-12-2024-rag.jinja | Command R7B (extract reasoning) |
+| CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja | Command R7B (extract reasoning) |
+| CohereForAI-c4ai-command-r7b-12-2024.jinja | Generic |
+| DavieLion-Llama-3.2-1B-SPIN-iter3.jinja | Generic |
+| Delta-Vector-Rei-12B.jinja | Mistral Nemo |
+| EpistemeAI-Mistral-Nemo-Instruct-12B-Philosophy-Math.jinja | Mistral Nemo |
+| FlofloB-83k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit.jinja | Hermes 2 Pro |
+| FlofloB-test_continued_pretraining_Phi-3-mini-4k-instruct_Unsloth_merged_16bit.jinja | Generic |
+| HelpingAI-HAI-SER.jinja | Generic |
+| HuggingFaceTB-SmolLM2-1.7B-Instruct.jinja | Generic |
+| HuggingFaceTB-SmolLM2-135M-Instruct.jinja | Generic |
+| HuggingFaceTB-SmolLM2-360M-Instruct.jinja | Generic |
+| INSAIT-Institute-BgGPT-Gemma-2-27B-IT-v1.0.jinja | Generic |
+| Ihor-Text2Graph-R1-Qwen2.5-0.5b.jinja | Hermes 2 Pro |
+| Infinigence-Megrez-3B-Instruct.jinja | Generic |
+| Josephgflowers-TinyLlama_v1.1_math_code-world-test-1.jinja | Generic |
+| LGAI-EXAONE-EXAONE-3.5-2.4B-Instruct.jinja | Generic |
+| LGAI-EXAONE-EXAONE-3.5-7.8B-Instruct.jinja | Generic |
+| LatitudeGames-Wayfarer-12B.jinja | Generic |
+| Magpie-Align-Llama-3-8B-Magpie-Align-v0.1.jinja | Generic |
+| Magpie-Align-Llama-3.1-8B-Magpie-Align-v0.1.jinja | Generic |
+| MaziyarPanahi-calme-3.2-instruct-78b.jinja | Generic |
+| MiniMaxAI-MiniMax-Text-01.jinja | Generic |
+| MiniMaxAI-MiniMax-VL-01.jinja | Generic |
+| NaniDAO-deepseek-r1-qwen-2.5-32B-ablated.jinja | DeepSeek R1 (extract reasoning) |
+| NexaAIDev-Octopus-v2.jinja | Generic |
+| NousResearch-Hermes-2-Pro-Llama-3-8B-default.jinja | Generic |
+| NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja | Hermes 2 Pro |
+| NousResearch-Hermes-2-Pro-Mistral-7B-default.jinja | Generic |
+| NousResearch-Hermes-2-Pro-Mistral-7B-tool_use.jinja | Hermes 2 Pro |
+| NousResearch-Hermes-3-Llama-3.1-70B-default.jinja | Generic |
+| NousResearch-Hermes-3-Llama-3.1-70B-tool_use.jinja | Hermes 2 Pro |
+| NovaSky-AI-Sky-T1-32B-Flash.jinja | Hermes 2 Pro |
+| NovaSky-AI-Sky-T1-32B-Preview.jinja | Hermes 2 Pro |
+| OnlyCheeini-greesychat-turbo.jinja | Generic |
+| Orenguteng-Llama-3.1-8B-Lexi-Uncensored-V2.jinja | Llama 3.x |
+| OrionStarAI-Orion-14B-Chat.jinja | Generic |
+| PowerInfer-SmallThinker-3B-Preview.jinja | Generic |
+| PrimeIntellect-INTELLECT-1-Instruct.jinja | Generic |
+| Qwen-QVQ-72B-Preview.jinja | Generic |
+| Qwen-QwQ-32B-Preview.jinja | Hermes 2 Pro |
+| Qwen-Qwen1.5-7B-Chat.jinja | Generic |
+| Qwen-Qwen2-7B-Instruct.jinja | Generic |
+| Qwen-Qwen2-VL-72B-Instruct.jinja | Generic |
+| Qwen-Qwen2-VL-7B-Instruct.jinja | Generic |
+| Qwen-Qwen2.5-0.5B.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-1.5B-Instruct.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-14B-Instruct-1M.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-14B.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-32B-Instruct.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-32B.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-3B-Instruct.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-72B-Instruct.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-7B-Instruct-1M.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-7B-Instruct.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-7B.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-Coder-32B-Instruct.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-Coder-7B-Instruct.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-Math-1.5B.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-Math-7B-Instruct.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-VL-3B-Instruct.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-VL-72B-Instruct.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-VL-7B-Instruct.jinja | Hermes 2 Pro |
+| RWKV-Red-Team-ARWKV-7B-Preview-0.1.jinja | Hermes 2 Pro |
+| SakanaAI-TinySwallow-1.5B-Instruct.jinja | Hermes 2 Pro |
+| SakanaAI-TinySwallow-1.5B.jinja | Hermes 2 Pro |
+| Sao10K-70B-L3.3-Cirrus-x1.jinja | Llama 3.x |
+| SentientAGI-Dobby-Mini-Leashed-Llama-3.1-8B.jinja | Llama 3.x |
+| SentientAGI-Dobby-Mini-Unhinged-Llama-3.1-8B.jinja | Llama 3.x |
+| Steelskull-L3.3-Damascus-R1.jinja | Llama 3.x |
+| Steelskull-L3.3-MS-Nevoria-70b.jinja | Llama 3.x |
+| Steelskull-L3.3-Nevoria-R1-70b.jinja | Llama 3.x |
+| THUDM-glm-4-9b-chat.jinja | Generic |
+| THUDM-glm-edge-1.5b-chat.jinja | Generic |
+| Tarek07-Progenitor-V1.1-LLaMa-70B.jinja | Llama 3.x |
+| TheBloke-FusionNet_34Bx2_MoE-AWQ.jinja | Generic |
+| TinyLlama-TinyLlama-1.1B-Chat-v1.0.jinja | Generic |
+| UCLA-AGI-Mistral7B-PairRM-SPPO-Iter3.jinja | Generic |
+| ValiantLabs-Llama3.1-8B-Enigma.jinja | Llama 3.x |
+| abacusai-Fewshot-Metamath-OrcaVicuna-Mistral.jinja | Generic |
+| ai21labs-AI21-Jamba-1.5-Large.jinja | Generic |
+| allenai-Llama-3.1-Tulu-3-405B-SFT.jinja | Generic |
+| allenai-Llama-3.1-Tulu-3-405B.jinja | Generic |
+| allenai-Llama-3.1-Tulu-3-8B.jinja | Generic |
+| arcee-ai-Virtuoso-Lite.jinja | Hermes 2 Pro |
+| arcee-ai-Virtuoso-Medium-v2.jinja | Hermes 2 Pro |
+| arcee-ai-Virtuoso-Small-v2.jinja | Hermes 2 Pro |
+| avemio-GRAG-NEMO-12B-ORPO-HESSIAN-AI.jinja | Generic |
+| bespokelabs-Bespoke-Stratos-7B.jinja | Hermes 2 Pro |
+| bfuzzy1-acheron-m1a-llama.jinja | Generic |
+| bofenghuang-vigogne-2-70b-chat.jinja | Generic |
+| bytedance-research-UI-TARS-72B-DPO.jinja | Generic |
+| bytedance-research-UI-TARS-7B-DPO.jinja | Generic |
+| bytedance-research-UI-TARS-7B-SFT.jinja | Generic |
+| carsenk-phi3.5_mini_exp_825_uncensored.jinja | Generic |
+| cyberagent-DeepSeek-R1-Distill-Qwen-14B-Japanese.jinja | DeepSeek R1 (extract reasoning) |
+| cyberagent-DeepSeek-R1-Distill-Qwen-32B-Japanese.jinja | DeepSeek R1 (extract reasoning) |
+| databricks-dbrx-instruct.jinja | Generic |
+| deepseek-ai-DeepSeek-Coder-V2-Instruct.jinja | Generic |
+| deepseek-ai-DeepSeek-Coder-V2-Lite-Base.jinja | Generic |
+| deepseek-ai-DeepSeek-Coder-V2-Lite-Instruct.jinja | Generic |
+| deepseek-ai-DeepSeek-R1-Distill-Llama-70B.jinja | DeepSeek R1 (extract reasoning) |
+| deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja | DeepSeek R1 (extract reasoning) |
+| deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B.jinja | DeepSeek R1 (extract reasoning) |
+| deepseek-ai-DeepSeek-R1-Distill-Qwen-14B.jinja | DeepSeek R1 (extract reasoning) |
+| deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja | DeepSeek R1 (extract reasoning) |
+| deepseek-ai-DeepSeek-R1-Distill-Qwen-7B.jinja | DeepSeek R1 (extract reasoning) |
+| deepseek-ai-DeepSeek-R1-Zero.jinja | DeepSeek R1 (extract reasoning) |
+| deepseek-ai-DeepSeek-R1.jinja | DeepSeek R1 (extract reasoning) |
+| deepseek-ai-DeepSeek-V2-Lite.jinja | Generic |
+| deepseek-ai-DeepSeek-V2.5.jinja | DeepSeek R1 (extract reasoning) |
+| deepseek-ai-DeepSeek-V3.jinja | DeepSeek R1 (extract reasoning) |
+| deepseek-ai-deepseek-coder-33b-instruct.jinja | Generic |
+| deepseek-ai-deepseek-coder-6.7b-instruct.jinja | Generic |
+| deepseek-ai-deepseek-coder-7b-instruct-v1.5.jinja | Generic |
+| deepseek-ai-deepseek-llm-67b-chat.jinja | Generic |
+| deepseek-ai-deepseek-llm-7b-chat.jinja | Generic |
+| dicta-il-dictalm2.0-instruct.jinja | Generic |
+| ehristoforu-Falcon3-8B-Franken-Basestruct.jinja | Hermes 2 Pro |
+| fireworks-ai-llama-3-firefunction-v2.jinja | FireFunction v2 |
+| godlikehhd-alpaca_data_sampled_ifd_new_5200.jinja | Hermes 2 Pro |
+| godlikehhd-alpaca_data_score_max_0.7_2600.jinja | Hermes 2 Pro |
+| google-gemma-2-27b-it.jinja | Generic |
+| google-gemma-2-2b-it.jinja | Generic |
+| google-gemma-2-2b-jpn-it.jinja | Generic |
+| google-gemma-7b-it.jinja | Generic |
+| huihui-ai-DeepSeek-R1-Distill-Llama-70B-abliterated.jinja | DeepSeek R1 (extract reasoning) |
+| huihui-ai-DeepSeek-R1-Distill-Llama-8B-abliterated.jinja | DeepSeek R1 (extract reasoning) |
+| huihui-ai-DeepSeek-R1-Distill-Qwen-14B-abliterated-v2.jinja | DeepSeek R1 (extract reasoning) |
+| huihui-ai-DeepSeek-R1-Distill-Qwen-32B-abliterated.jinja | DeepSeek R1 (extract reasoning) |
+| huihui-ai-DeepSeek-R1-Distill-Qwen-7B-abliterated-v2.jinja | DeepSeek R1 (extract reasoning) |
+| huihui-ai-Qwen2.5-14B-Instruct-1M-abliterated.jinja | Hermes 2 Pro |
+| ibm-granite-granite-3.1-8b-instruct.jinja | Generic |
+| indischepartij-MiniCPM-3B-OpenHermes-2.5-v2.jinja | Generic |
+| inflatebot-MN-12B-Mag-Mell-R1.jinja | Generic |
+| jinaai-ReaderLM-v2.jinja | Generic |
+| kms7530-chemeng_qwen-math-7b_24_1_100_1_nonmath.jinja | Hermes 2 Pro |
+| knifeayumu-Cydonia-v1.3-Magnum-v4-22B.jinja | Mistral Nemo |
+| langgptai-qwen1.5-7b-chat-sa-v0.1.jinja | Generic |
+| lightblue-DeepSeek-R1-Distill-Qwen-7B-Japanese.jinja | DeepSeek R1 (extract reasoning) |
+| mattshumer-Reflection-Llama-3.1-70B.jinja | Generic |
+| meetkai-functionary-medium-v3.1.jinja | Functionary v3.1 Llama 3.1 |
+| meetkai-functionary-medium-v3.2.jinja | Functionary v3.2 |
+| meta-llama-Llama-2-7b-chat-hf.jinja | Generic |
+| meta-llama-Llama-3.1-8B-Instruct.jinja | Llama 3.x |
+| meta-llama-Llama-3.2-11B-Vision-Instruct.jinja | Llama 3.x |
+| meta-llama-Llama-3.2-1B-Instruct.jinja | Llama 3.x |
+| meta-llama-Llama-3.2-3B-Instruct.jinja | Llama 3.x |
+| meta-llama-Llama-3.3-70B-Instruct.jinja | Llama 3.x |
+| meta-llama-Meta-Llama-3-8B-Instruct.jinja | Generic |
+| meta-llama-Meta-Llama-3.1-8B-Instruct.jinja | Llama 3.x |
+| microsoft-Phi-3-medium-4k-instruct.jinja | Generic |
+| microsoft-Phi-3-mini-4k-instruct.jinja | Generic |
+| microsoft-Phi-3-small-8k-instruct.jinja | Generic |
+| microsoft-Phi-3.5-mini-instruct.jinja | Generic |
+| microsoft-Phi-3.5-vision-instruct.jinja | Generic |
+| microsoft-phi-4.jinja | Generic |
+| migtissera-Tess-3-Mistral-Nemo-12B.jinja | Generic |
+| ministral-Ministral-3b-instruct.jinja | Generic |
+| mistralai-Codestral-22B-v0.1.jinja | Generic |
+| mistralai-Mistral-7B-Instruct-v0.1.jinja | Generic |
+| mistralai-Mistral-7B-Instruct-v0.2.jinja | Generic |
+| mistralai-Mistral-7B-Instruct-v0.3.jinja | Mistral Nemo |
+| mistralai-Mistral-Large-Instruct-2407.jinja | Mistral Nemo |
+| mistralai-Mistral-Large-Instruct-2411.jinja | Generic |
+| mistralai-Mistral-Nemo-Instruct-2407.jinja | Mistral Nemo |
+| mistralai-Mistral-Small-24B-Instruct-2501.jinja | Generic |
+| mistralai-Mixtral-8x7B-Instruct-v0.1.jinja | Generic |
+| mkurman-Qwen2.5-14B-DeepSeek-R1-1M.jinja | Hermes 2 Pro |
+| mlabonne-AlphaMonarch-7B.jinja | Generic |
+| mlx-community-Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1-float32.jinja | Hermes 2 Pro |
+| mlx-community-Qwen2.5-VL-7B-Instruct-8bit.jinja | Hermes 2 Pro |
+| mobiuslabsgmbh-DeepSeek-R1-ReDistill-Qwen-1.5B-v1.1.jinja | DeepSeek R1 (extract reasoning) |
+| netcat420-MFANNv0.20.jinja | Generic |
+| netcat420-MFANNv0.24.jinja | Generic |
+| netease-youdao-Confucius-o1-14B.jinja | Hermes 2 Pro |
+| nvidia-AceMath-7B-RM.jinja | Hermes 2 Pro |
+| nvidia-Eagle2-1B.jinja | Hermes 2 Pro |
+| nvidia-Eagle2-9B.jinja | Hermes 2 Pro |
+| nvidia-Llama-3.1-Nemotron-70B-Instruct-HF.jinja | Llama 3.x |
+| onnx-community-DeepSeek-R1-Distill-Qwen-1.5B-ONNX.jinja | DeepSeek R1 (extract reasoning) |
+| open-thoughts-OpenThinker-7B.jinja | Hermes 2 Pro |
+| openchat-openchat-3.5-0106.jinja | Generic |
+| pankajmathur-orca_mini_v6_8b.jinja | Generic |
+| princeton-nlp-Mistral-7B-Base-SFT-RDPO.jinja | Generic |
+| princeton-nlp-Mistral-7B-Instruct-DPO.jinja | Generic |
+| princeton-nlp-Mistral-7B-Instruct-RDPO.jinja | Generic |
+| prithivMLmods-Bellatrix-Tiny-1.5B-R1.jinja | Hermes 2 Pro |
+| prithivMLmods-Bellatrix-Tiny-1B-R1.jinja | Llama 3.x |
+| prithivMLmods-Bellatrix-Tiny-1B-v3.jinja | Generic |
+| prithivMLmods-Bellatrix-Tiny-3B-R1.jinja | Llama 3.x |
+| prithivMLmods-Blaze-14B-xElite.jinja | Generic |
+| prithivMLmods-Calcium-Opus-14B-Elite2-R1.jinja | Hermes 2 Pro |
+| prithivMLmods-Calme-Ties-78B.jinja | Generic |
+| prithivMLmods-Calme-Ties2-78B.jinja | Generic |
+| prithivMLmods-Calme-Ties3-78B.jinja | Generic |
+| prithivMLmods-ChemQwen2-vL.jinja | Generic |
+| prithivMLmods-GWQ2b.jinja | Generic |
+| prithivMLmods-LatexMind-2B-Codec.jinja | Generic |
+| prithivMLmods-Llama-3.2-6B-AlgoCode.jinja | Llama 3.x |
+| prithivMLmods-Megatron-Opus-14B-Exp.jinja | Hermes 2 Pro |
+| prithivMLmods-Megatron-Opus-14B-Stock.jinja | Hermes 2 Pro |
+| prithivMLmods-Megatron-Opus-7B-Exp.jinja | Hermes 2 Pro |
+| prithivMLmods-Omni-Reasoner-Merged.jinja | Hermes 2 Pro |
+| prithivMLmods-Omni-Reasoner4-Merged.jinja | Hermes 2 Pro |
+| prithivMLmods-Primal-Opus-14B-Optimus-v1.jinja | Hermes 2 Pro |
+| prithivMLmods-QwQ-Math-IO-500M.jinja | Hermes 2 Pro |
+| prithivMLmods-Qwen-7B-Distill-Reasoner.jinja | DeepSeek R1 (extract reasoning) |
+| prithivMLmods-Qwen2.5-1.5B-DeepSeek-R1-Instruct.jinja | Hermes 2 Pro |
+| prithivMLmods-Qwen2.5-14B-DeepSeek-R1-1M.jinja | Hermes 2 Pro |
+| prithivMLmods-Qwen2.5-32B-DeepSeek-R1-Instruct.jinja | Hermes 2 Pro |
+| prithivMLmods-Qwen2.5-7B-DeepSeek-R1-1M.jinja | Hermes 2 Pro |
+| prithivMLmods-Triangulum-v2-10B.jinja | Hermes 2 Pro |
+| qingy2024-Falcon3-2x10B-MoE-Instruct.jinja | Hermes 2 Pro |
+| rubenroy-Zurich-14B-GCv2-5m.jinja | Hermes 2 Pro |
+| rubenroy-Zurich-7B-GCv2-5m.jinja | Hermes 2 Pro |
+| silma-ai-SILMA-Kashif-2B-Instruct-v1.0.jinja | Generic |
+| simplescaling-s1-32B.jinja | Hermes 2 Pro |
+| sometimesanotion-Lamarck-14B-v0.7.jinja | Hermes 2 Pro |
+| sonthenguyen-zephyr-sft-bnb-4bit-DPO-mtbr-180steps.jinja | Generic |
+| sthenno-tempesthenno-icy-0130.jinja | Generic |
+| sumink-qwft.jinja | Hermes 2 Pro |
+| teknium-OpenHermes-2.5-Mistral-7B.jinja | Generic |
+| thirdeyeai-elevate360m.jinja | Generic |
+| tiiuae-Falcon3-10B-Instruct.jinja | Hermes 2 Pro |
+| unsloth-DeepSeek-R1-Distill-Llama-8B-unsloth-bnb-4bit.jinja | DeepSeek R1 (extract reasoning) |
+| unsloth-DeepSeek-R1-Distill-Llama-8B.jinja | DeepSeek R1 (extract reasoning) |
+| unsloth-DeepSeek-R1.jinja | DeepSeek R1 (extract reasoning) |
+| unsloth-Mistral-Small-24B-Instruct-2501-unsloth-bnb-4bit.jinja | Generic |
+| upstage-solar-pro-preview-instruct.jinja | Generic |
+| whyhow-ai-PatientSeek.jinja | Generic |
+| xwen-team-Xwen-72B-Chat.jinja | Hermes 2 Pro |
+| xwen-team-Xwen-7B-Chat.jinja | Hermes 2 Pro |
+
+This table can be generated with:
+
+```bash
+./build/bin/test-chat ../minja/build/tests/*.jinja 2>/dev/null
+```
+
+</details>
+
+# Usage - need tool-aware Jinja template
+
+First, start a server with any model, but make sure it has a tools-enabled template: you can verify this by inspecting the `chat_template` or `chat_template_tool_use` properties in `http://localhost:8080/props`).
+
+Here are some models known to work (w/ chat template override when needed):
+
+```shell
+# Native support:
+
+llama-server --jinja -fa -hf bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M
+llama-server --jinja -fa -hf bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q6_K_L
+llama-server --jinja -fa -hf bartowski/Llama-3.3-70B-Instruct-GGUF:Q4_K_M
+
+# Native support for DeepSeek R1 works best w/ our template override (official template is buggy, although we do work around it)
+
+llama-server --jinja -fa -hf bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q6_K_L \
+    --chat-template-file models/templates/llama-cpp-deepseek-r1.jinja
+
+llama-server --jinja -fa -hf bartowski/DeepSeek-R1-Distill-Qwen-32B-GGUF:Q4_K_M \
+    --chat-template-file models/templates/llama-cpp-deepseek-r1.jinja
+
+# Native support requires the right template for these GGUFs:
+
+llama-server --jinja -fa -hf bartowski/functionary-small-v3.2-GGUF:Q4_K_M
+    --chat-template-file models/templates/meetkai-functionary-medium-v3.2.jinja
+
+llama-server --jinja -fa -hf bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M \
+    --chat-template-file models/templates/NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja
+
+llama-server --jinja -fa -hf bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M \
+    --chat-template-file models/templates/NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja
+
+llama-server --jinja -fa -hf bartowski/firefunction-v2-GGUF -hff firefunction-v2-IQ1_M.gguf \
+    --chat-template-file models/templates/fireworks-ai-llama-3-firefunction-v2.jinja
+
+llama-server --jinja -fa -hf bartowski/c4ai-command-r7b-12-2024-GGUF:Q6_K_L \
+    --chat-template-file models/templates/CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja
+
+# Generic format support
+llama-server --jinja -fa -hf bartowski/phi-4-GGUF:Q4_0
+llama-server --jinja -fa -hf bartowski/gemma-2-2b-it-GGUF:Q8_0
+llama-server --jinja -fa -hf bartowski/c4ai-command-r-v01-GGUF:Q2_K
+```
+
+To get the official template from original HuggingFace repos, you can use [scripts/get_chat_template.py](../scripts/get_chat_template.py) (see examples invocations in [models/templates/README.md](../models/templates/README.md))
+
+> [!TIP]
+> If there is no official `tool_use` Jinja template, you may want to set `--chat-template chatml` to use a default that works with many models (YMMV!), or write your own (e.g. we provide a custom [llama-cpp-deepseek-r1.jinja](../models/templates/llama-cpp-deepseek-r1.jinja) for DeepSeek R1 distills)
+
+Test in CLI (or with any library / software that can use OpenAI-compatible API backends):
+
+```bash
+curl http://localhost:8080/v1/chat/completions -d '{
+"model": "gpt-3.5-turbo",
+"tools": [
+    {
+    "type":"function",
+    "function":{
+        "name":"python",
+        "description":"Runs code in an ipython interpreter and returns the result of the execution after 60 seconds.",
+        "parameters":{
+        "type":"object",
+        "properties":{
+            "code":{
+            "type":"string",
+            "description":"The code to run in the ipython interpreter."
+            }
+        },
+        "required":["code"]
+        }
+    }
+    }
+],
+"messages": [
+    {
+    "role": "user",
+    "content": "Print a hello world message with python."
+    }
+]
+}'
+```
+
+<details>
+<summary>Show output</summary>
+
+```json
+{
+"choices": [
+    {
+    "finish_reason": "tool",
+    "index": 0,
+    "message": {
+        "content": null,
+        "tool_calls": [
+        {
+            "name": "python",
+            "arguments": "{\"code\":\" \\nprint(\\\"Hello, World!\\\")\"}"
+        }
+        ],
+        "role": "assistant"
+    }
+    }
+],
+"created": 1727287211,
+"model": "gpt-3.5-turbo",
+"object": "chat.completion",
+"usage": {
+    "completion_tokens": 16,
+    "prompt_tokens": 44,
+    "total_tokens": 60
+},
+"id": "chatcmpl-Htbgh9feMmGM0LEH2hmQvwsCxq3c6Ni8"
+}
+```
+
+</details>
--- a/docs/install.md
+++ b/docs/install.md
@@ -7,7 +7,7 @@ On Mac and Linux, the homebrew package manager can be used via
 ```sh
 brew install llama.cpp
 ```
-The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggerganov/llama.cpp/discussions/7668
+The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggml-org/llama.cpp/discussions/7668

 ## Nix

--- a/docs/llguidance.md
+++ b/docs/llguidance.md
@@ -0,0 +1,53 @@
+# LLGuidance Support in llama.cpp
+
+[LLGuidance](https://github.com/guidance-ai/llguidance) is a library for constrained decoding (also called constrained sampling or structured outputs) for Large Language Models (LLMs). Initially developed as the backend for the [Guidance](https://github.com/guidance-ai/guidance) library, it can also be used independently.
+
+LLGuidance supports JSON Schemas and arbitrary context-free grammars (CFGs) written in a [variant](https://github.com/guidance-ai/llguidance/blob/main/docs/syntax.md) of Lark syntax. It is [very fast](https://github.com/guidance-ai/jsonschemabench/tree/main/maskbench) and has [excellent](https://github.com/guidance-ai/llguidance/blob/main/docs/json_schema.md) JSON Schema coverage but requires the Rust compiler, which complicates the llama.cpp build process.
+
+## Building
+
+To enable LLGuidance support, build llama.cpp with the `LLAMA_LLGUIDANCE` option:
+
+```sh
+cmake -B build -DLLAMA_LLGUIDANCE=ON
+make -C build -j
+```
+
+For Windows use `cmake --build build --config Release` instead of `make`.
+
+This requires the Rust compiler and the `cargo` tool to be [installed](https://www.rust-lang.org/tools/install).
+
+## Interface
+
+There are no new command-line arguments or modifications to `common_params`. When enabled, grammars starting with `%llguidance` are passed to LLGuidance instead of the [current](../grammars/README.md) llama.cpp grammars. Additionally, JSON Schema requests (e.g., using the `-j` argument in `llama-cli`) are also passed to LLGuidance.
+
+For your existing GBNF grammars, you can use [gbnf_to_lark.py script](https://github.com/guidance-ai/llguidance/blob/main/python/llguidance/gbnf_to_lark.py) to convert them to LLGuidance Lark-like format.
+
+## Performance
+
+Computing a "token mask" (i.e., the set of allowed tokens) for a llama3 tokenizer with 128k tokens takes, on average, 50μs of single-core CPU time for the [JSON Schema Bench](https://github.com/guidance-ai/jsonschemabench). The p99 time is 0.5ms, and the p100 time is 20ms. These results are due to the lexer/parser split and several [optimizations](https://github.com/guidance-ai/llguidance/blob/main/docs/optimizations.md).
+
+## JSON Schema
+
+LLGuidance adheres closely to the JSON Schema specification. For example:
+
+- `additionalProperties` defaults to `true`, unlike current grammars, though you can set `"additionalProperties": false` if needed.
+- any whitespace is allowed.
+- The definition order in the `"properties": {}` object is maintained, regardless of whether properties are required (current grammars always puts required properties first).
+
+Unsupported schemas result in an error message—no keywords are silently ignored.
+
+## Why Not Reuse GBNF Format?
+
+GBNF lacks the concept of a lexer.
+
+Most programming languages, including JSON, use a two-step process: a lexer (built with regular expressions) converts a byte stream into lexemes, which are then processed by a CFG parser. This approach is faster because lexers are cheaper to evaluate, and there is ~10x fewer lexemes than bytes.
+LLM tokens often align with lexemes, so the parser is engaged in under 0.5% of tokens, with the lexer handling the rest.
+
+However, the user has to provide the distinction between lexemes and CFG symbols. In [Lark](https://github.com/lark-parser/lark), lexeme names are uppercase, while CFG symbols are lowercase.
+The [gbnf_to_lark.py script](https://github.com/guidance-ai/llguidance/blob/main/scripts/gbnf_to_lark.py) can often take care of this automatically.
+See [LLGuidance syntax docs](https://github.com/guidance-ai/llguidance/blob/main/docs/syntax.md#terminals-vs-rules) for more details.
+
+## Error Handling
+
+Errors are currently printed to `stderr`, and generation continues. Improved error handling may be added in the future.
--- a/examples/batched.swift/Sources/main.swift
+++ b/examples/batched.swift/Sources/main.swift
@@ -31,6 +31,11 @@ defer {
    llama_model_free(model)
 }

+guard let vocab = llama_model_get_vocab(model) else {
+    print("Failed to get vocab")
+    exit(1)
+}
+
 var tokens = tokenize(text: prompt, add_bos: true)

 let n_kv_req = UInt32(tokens.count) + UInt32((n_len - Int(tokens.count)) * n_parallel)
@@ -41,7 +46,7 @@ context_params.n_batch = UInt32(max(n_len, n_parallel))
 context_params.n_threads = 8
 context_params.n_threads_batch = 8

-let context = llama_new_context_with_model(model, context_params)
+let context = llama_init_from_model(model, context_params)
 guard context != nil else {
    print("Failed to initialize context")
    exit(1)
@@ -141,7 +146,7 @@ while n_cur <= n_len {
        let new_token_id = llama_sampler_sample(smpl, context, i_batch[i])

        // is it an end of stream? -> mark the stream as finished
-        if llama_vocab_is_eog(model, new_token_id) || n_cur == n_len {
+        if llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_len {
            i_batch[i] = -1
            // print("")
            if n_parallel > 1 {
@@ -207,7 +212,7 @@ private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
    let utf8Count = text.utf8.count
    let n_tokens = utf8Count + (add_bos ? 1 : 0)
    let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
-    let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
+    let tokenCount = llama_tokenize(vocab, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
    var swiftTokens: [llama_token] = []
    for i in 0 ..< tokenCount {
        swiftTokens.append(tokens[Int(i)])
@@ -218,12 +223,12 @@ private func tokenize(text: String, add_bos: Bool) -> [llama_token] {

 private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? {
    var result = [CChar](repeating: 0, count: 8)
-    let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count), 0, false)
+    let nTokens = llama_token_to_piece(vocab, token, &result, Int32(result.count), 0, false)
    if nTokens < 0 {
        let actualTokensCount = -Int(nTokens)
        result = .init(repeating: 0, count: actualTokensCount)
        let check = llama_token_to_piece(
-            model,
+            vocab,
            token,
            &result,
            Int32(result.count),
--- a/examples/cvector-generator/README.md
+++ b/examples/cvector-generator/README.md
@@ -3,9 +3,9 @@
 This example demonstrates how to generate a control vector using gguf models.

 Related PRs:
- [Add support for control vectors](https://github.com/ggerganov/llama.cpp/pull/5970)
- (Issue) [Generate control vector using llama.cpp](https://github.com/ggerganov/llama.cpp/issues/6880)
- [Add cvector-generator example](https://github.com/ggerganov/llama.cpp/pull/7514)
+- [Add support for control vectors](https://github.com/ggml-org/llama.cpp/pull/5970)
+- (Issue) [Generate control vector using llama.cpp](https://github.com/ggml-org/llama.cpp/issues/6880)
+- [Add cvector-generator example](https://github.com/ggml-org/llama.cpp/pull/7514)

 ## Examples

--- a/examples/cvector-generator/cvector-generator.cpp
+++ b/examples/cvector-generator/cvector-generator.cpp
@@ -394,6 +394,8 @@ static int prepare_entries(common_params & params, train_context & ctx_train) {
 int main(int argc, char ** argv) {
    common_params params;

+    params.out_file = "control_vector.gguf";
+
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) {
        return 1;
    }
@@ -498,7 +500,7 @@ int main(int argc, char ** argv) {
    }

    // write output vectors to gguf
-    export_gguf(ctx_train.v_final, params.cvector_outfile, model_hint);
+    export_gguf(ctx_train.v_final, params.out_file, model_hint);

    llama_backend_free();

--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -4,6 +4,7 @@
 #include "llama.h"

 #include <ctime>
+#include <algorithm>

 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
--- a/examples/export-lora/export-lora.cpp
+++ b/examples/export-lora/export-lora.cpp
@@ -345,8 +345,18 @@ struct lora_merge_ctx {
            gf = ggml_new_graph(ctx0);
            struct ggml_tensor * cur = inp_base;
            for (size_t i = 0; i < adapters.size(); ++i) {
-                struct ggml_tensor * a_T = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32)));
-                struct ggml_tensor * delta = ggml_mul_mat(ctx0, a_T, ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32));
+                struct ggml_tensor * delta;
+                bool is_tok_embd = string_starts_with(name_base, "token_embd");
+                if (is_tok_embd) {
+                    printf("%s :     detected token embeddings tensor\n", __func__);
+                    delta = ggml_mul_mat(ctx0,
+                        ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32),
+                        ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32));
+                } else {
+                    delta = ggml_mul_mat(ctx0,
+                        ggml_cont(ctx0, ggml_transpose(ctx0, ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32))),
+                        ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32));
+                }
                // scale
                const float alpha = adapters[i]->alpha;
                const float rank  = (float) inp_b[i]->ne[0];
@@ -403,20 +413,22 @@ static void print_usage(int, char ** argv) {
 int main(int argc, char ** argv) {
    common_params params;

+    params.out_file = "ggml-lora-merged-f16.gguf";
+
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) {
        return 1;
    }

    g_verbose = (params.verbosity > 1);
    try {
-        lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.cpuparams.n_threads);
+        lora_merge_ctx ctx(params.model, params.lora_adapters, params.out_file, params.cpuparams.n_threads);
        ctx.run_merge();
    } catch (const std::exception & err) {
        fprintf(stderr, "%s\n", err.what());
        exit(EXIT_FAILURE);
    }

-    printf("done, output file is %s\n", params.lora_outfile.c_str());
+    printf("done, output file is %s\n", params.out_file.c_str());

    return 0;
 }
--- a/examples/gbnf-validator/gbnf-validator.cpp
+++ b/examples/gbnf-validator/gbnf-validator.cpp
@@ -76,7 +76,7 @@ int main(int argc, char** argv) {
        grammar_str = buffer.str();
    }

-    llama_grammar * grammar = llama_grammar_init_impl(nullptr, grammar_str.c_str(), "root");
+    llama_grammar * grammar = llama_grammar_init_impl(nullptr, grammar_str.c_str(), "root", false, nullptr, 0, nullptr, 0);
    if (grammar == nullptr) {
        fprintf(stdout, "Failed to initialize llama_grammar\n");
        return 1;
--- a/examples/gguf-split/tests.sh
+++ b/examples/gguf-split/tests.sh
@@ -41,7 +41,7 @@ echo PASS
 echo

 # 2b. Test the sharded model is loading properly
-$MAIN --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --n-predict 32
+$MAIN -no-cnv --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --n-predict 32
 echo PASS
 echo

@@ -51,7 +51,7 @@ echo PASS
 echo

 # 3b. Test the merged model is loading properly
-$MAIN --model $WORK_PATH/ggml-model-merge.gguf --n-predict 32
+$MAIN -no-cnv --model $WORK_PATH/ggml-model-merge.gguf --n-predict 32
 echo PASS
 echo

@@ -61,7 +61,7 @@ echo PASS
 echo

 # 4b. Test the sharded model is loading properly
-$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --n-predict 32
+$MAIN -no-cnv --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --n-predict 32
 echo PASS
 echo

@@ -71,7 +71,7 @@ echo
 #echo

 # 5b. Test the merged model is loading properly
-#$MAIN --model $WORK_PATH/ggml-model-merge-2.gguf --n-predict 32
+#$MAIN -no-cnv --model $WORK_PATH/ggml-model-merge-2.gguf --n-predict 32
 #echo PASS
 #echo

@@ -81,7 +81,7 @@ echo PASS
 echo

 # 6b. Test the sharded model is loading properly
-$MAIN --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --n-predict 32
+$MAIN -no-cnv --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --n-predict 32
 echo PASS
 echo

--- a/examples/imatrix/README.md
+++ b/examples/imatrix/README.md
@@ -1,7 +1,7 @@
 # llama.cpp/examples/imatrix

-Compute an importance matrix for a model and given text dataset. Can be used during quantization to enchance the quality of the quantized models.
-More information is available here: https://github.com/ggerganov/llama.cpp/pull/4861
+Compute an importance matrix for a model and given text dataset. Can be used during quantization to enhance the quality of the quantized models.
+More information is available here: https://github.com/ggml-org/llama.cpp/pull/4861

 ## Usage

--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -3,6 +3,7 @@
 #include "log.h"
 #include "llama.h"

+#include <chrono>
 #include <cmath>
 #include <cstdio>
 #include <cstring>
@@ -99,7 +100,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
    const float * data = is_host ? (const float *) src1->data : m_src1_data.data();

    // this has been adapted to the new format of storing merged experts in a single 3d tensor
-    // ref: https://github.com/ggerganov/llama.cpp/pull/6387
+    // ref: https://github.com/ggml-org/llama.cpp/pull/6387
    if (t->op == GGML_OP_MUL_MAT_ID) {
        //   ids  -> [n_experts_used, n_tokens]
        //   src1 -> [cols, n_expert_used, n_tokens]
@@ -205,9 +206,6 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *

 void IMatrixCollector::save_imatrix(int ncall) const {
    auto fname = m_params.out_file;
-    if (fname.empty()) {
-        fname = "imatrix.dat";
-    }

    if (ncall > 0) {
        fname += ".at_";
@@ -582,6 +580,8 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
 int main(int argc, char ** argv) {
    common_params params;

+    params.out_file = "imatrix.dat" ;
+
    params.n_ctx = 512;
    params.logits_all = true;
    params.escape = false;
--- a/examples/json_schema_to_grammar.py
+++ b/examples/json_schema_to_grammar.py
@@ -195,7 +195,7 @@ class BuiltinRule:
        self.deps = deps or []

 # Constraining spaces to prevent model "running away".
-SPACE_RULE = '| " " | "\\n" [ \\t]{0,20}'
+SPACE_RULE = '| " " | "\\n"{1,2} [ \\t]{0,20}'

 PRIMITIVE_RULES = {
    'boolean'      : BuiltinRule('("true" | "false") space', []),
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -683,7 +683,7 @@ struct cmd_params_instance {
    bool               cpu_strict;
    int                poll;
    int                n_gpu_layers;
-    std::string        rpc_servers;
+    std::string        rpc_servers_str;
    llama_split_mode   split_mode;
    int                main_gpu;
    bool               no_kv_offload;
@@ -696,8 +696,37 @@ struct cmd_params_instance {
        llama_model_params mparams = llama_model_default_params();

        mparams.n_gpu_layers = n_gpu_layers;
-        if (!rpc_servers.empty()) {
-            mparams.rpc_servers = rpc_servers.c_str();
+        if (!rpc_servers_str.empty()) {
+            auto rpc_servers = string_split<std::string>(rpc_servers_str, ',');
+
+            // add RPC devices
+            if (!rpc_servers.empty()) {
+                ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
+                if (!rpc_reg) {
+                    fprintf(stderr, "%s: failed to find RPC backend\n", __func__);
+                    exit(1);
+                }
+
+                typedef ggml_backend_dev_t (*ggml_backend_rpc_add_device_t)(const char * endpoint);
+                ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
+                if (!ggml_backend_rpc_add_device_fn) {
+                    fprintf(stderr, "%s: failed to find RPC device add function\n", __func__);
+                    exit(1);
+                }
+                static std::vector<ggml_backend_dev_t> devices;
+                devices.clear();
+                for (const std::string & server : rpc_servers) {
+                    ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
+                    if (dev) {
+                        devices.push_back(dev);
+                    } else {
+                        fprintf(stderr, "%s: failed to add RPC device for server '%s'\n", __func__, server.c_str());
+                        exit(1);
+                    }
+                }
+                devices.push_back(nullptr);
+                mparams.devices = devices.data();
+            }
        }
        mparams.split_mode   = split_mode;
        mparams.main_gpu     = main_gpu;
@@ -708,7 +737,7 @@ struct cmd_params_instance {
    }

    bool equal_mparams(const cmd_params_instance & other) const {
-        return model == other.model && n_gpu_layers == other.n_gpu_layers && rpc_servers == other.rpc_servers &&
+        return model == other.model && n_gpu_layers == other.n_gpu_layers && rpc_servers_str == other.rpc_servers_str &&
               split_mode == other.split_mode && main_gpu == other.main_gpu && use_mmap == other.use_mmap &&
               tensor_split == other.tensor_split;
    }
@@ -847,8 +876,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
 struct test {
    static const std::string build_commit;
    static const int         build_number;
-    static const std::string cpu_info;
-    static const std::string gpu_info;
+    const std::string        cpu_info;
+    const std::string        gpu_info;
    std::string              model_filename;
    std::string              model_type;
    uint64_t                 model_size;
@@ -874,7 +903,10 @@ struct test {
    std::string              test_time;
    std::vector<uint64_t>    samples_ns;

-    test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) {
+    test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) :
+        cpu_info(get_cpu_info()),
+        gpu_info(get_gpu_info()) {
+
        model_filename = inst.model;
        char buf[128];
        llama_model_desc(lmodel, buf, sizeof(buf));
@@ -1029,8 +1061,6 @@ struct test {

 const std::string test::build_commit = LLAMA_COMMIT;
 const int         test::build_number = LLAMA_BUILD_NUMBER;
-const std::string test::cpu_info     = get_cpu_info();
-const std::string test::gpu_info     = get_gpu_info();

 struct printer {
    virtual ~printer() {}
--- a/examples/llama.android/llama/src/main/cpp/CMakeLists.txt
+++ b/examples/llama.android/llama/src/main/cpp/CMakeLists.txt
@@ -14,7 +14,7 @@ project("llama-android")
 #include(FetchContent)
 #FetchContent_Declare(
 #        llama
-#        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
+#        GIT_REPOSITORY https://github.com/ggml-org/llama.cpp
 #        GIT_TAG        master
 #)

--- a/examples/llama.android/llama/src/main/cpp/llama-android.cpp
+++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp
@@ -347,6 +347,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init(
        jlong context_pointer,
        jlong batch_pointer,
        jstring jtext,
+        jboolean format_chat,
        jint n_len
    ) {

@@ -356,10 +357,11 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init(
    const auto context = reinterpret_cast<llama_context *>(context_pointer);
    const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);

-    const auto tokens_list = common_tokenize(context, text, 1);
+    bool parse_special = (format_chat == JNI_TRUE);
+    const auto tokens_list = common_tokenize(context, text, true, parse_special);

    auto n_ctx = llama_n_ctx(context);
-    auto n_kv_req = tokens_list.size() + (n_len - tokens_list.size());
+    auto n_kv_req = tokens_list.size() + n_len;

    LOGi("n_len = %d, n_ctx = %d, n_kv_req = %d", n_len, n_ctx, n_kv_req);

@@ -368,7 +370,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init(
    }

    for (auto id : tokens_list) {
-        LOGi("%s", common_token_to_piece(context, id).c_str());
+        LOGi("token: `%s`-> %d ", common_token_to_piece(context, id).c_str(), id);
    }

    common_batch_clear(*batch);
--- a/examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt
+++ b/examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt
@@ -65,6 +65,7 @@ class LLamaAndroid {
        context: Long,
        batch: Long,
        text: String,
+        formatChat: Boolean,
        nLen: Int
    ): Int

@@ -115,10 +116,10 @@ class LLamaAndroid {
        }
    }

-    fun send(message: String): Flow<String> = flow {
+    fun send(message: String, formatChat: Boolean = false): Flow<String> = flow {
        when (val state = threadLocalState.get()) {
            is State.Loaded -> {
-                val ncur = IntVar(completion_init(state.context, state.batch, message, nlen))
+                val ncur = IntVar(completion_init(state.context, state.batch, message, formatChat, nlen))
                while (ncur.value <= nlen) {
                    val str = completion_loop(state.context, state.batch, state.sampler, nlen, ncur)
                    if (str == null) {
--- a/examples/llama.swiftui/README.md
+++ b/examples/llama.swiftui/README.md
@@ -3,9 +3,24 @@
 Local inference of llama.cpp on an iPhone. This is a sample app that can be used as a starting
 point for more advanced projects.

-For usage instructions and performance stats, check the following discussion: https://github.com/ggerganov/llama.cpp/discussions/4508
+For usage instructions and performance stats, check the following discussion: https://github.com/ggml-org/llama.cpp/discussions/4508

-![image](https://github.com/ggerganov/llama.cpp/assets/1991296/2b40284f-8421-47a2-b634-74eece09a299)
+
+### Building
+First llama.cpp need to be built and a XCFramework needs to be created. This can be done by running
+the following script from the llama.cpp project root:
+```console
+$ ./build-xcframework.sh
+```
+Open `llama.swiftui.xcodeproj` project in Xcode and you should be able to build and run the app on
+a simulator or a real device.
+
+To use the framework with a different project, the XCFramework can be added to the project by
+adding `build-ios/llama.xcframework` by dragging and dropping it into the project navigator, or
+by manually selecting the framework in the "Frameworks, Libraries, and Embedded Content" section
+of the project settings.
+
+![image](https://github.com/ggml-org/llama.cpp/assets/1991296/2b40284f-8421-47a2-b634-74eece09a299)

 Video demonstration:

--- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@@ -24,6 +24,7 @@ func llama_batch_add(_ batch: inout llama_batch, _ id: llama_token, _ pos: llama
 actor LlamaContext {
    private var model: OpaquePointer
    private var context: OpaquePointer
+    private var vocab: OpaquePointer
    private var sampling: UnsafeMutablePointer<llama_sampler>
    private var batch: llama_batch
    private var tokens_list: [llama_token]
@@ -47,6 +48,7 @@ actor LlamaContext {
        self.sampling = llama_sampler_chain_init(sparams)
        llama_sampler_chain_add(self.sampling, llama_sampler_init_temp(0.4))
        llama_sampler_chain_add(self.sampling, llama_sampler_init_dist(1234))
+        vocab = llama_model_get_vocab(model)
    }

    deinit {
@@ -79,7 +81,7 @@ actor LlamaContext {
        ctx_params.n_threads       = Int32(n_threads)
        ctx_params.n_threads_batch = Int32(n_threads)

-        let context = llama_new_context_with_model(model, ctx_params)
+        let context = llama_init_from_model(model, ctx_params)
        guard let context else {
            print("Could not load context!")
            throw LlamaError.couldNotInitializeContext
@@ -151,7 +153,7 @@ actor LlamaContext {

        new_token_id = llama_sampler_sample(sampling, context, batch.n_tokens - 1)

-        if llama_vocab_is_eog(model, new_token_id) || n_cur == n_len {
+        if llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_len {
            print("\n")
            is_done = true
            let new_token_str = String(cString: temporary_invalid_cchars + [0])
@@ -297,7 +299,7 @@ actor LlamaContext {
        let utf8Count = text.utf8.count
        let n_tokens = utf8Count + (add_bos ? 1 : 0) + 1
        let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
-        let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, false)
+        let tokenCount = llama_tokenize(vocab, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, false)

        var swiftTokens: [llama_token] = []
        for i in 0..<tokenCount {
@@ -316,7 +318,7 @@ actor LlamaContext {
        defer {
            result.deallocate()
        }
-        let nTokens = llama_token_to_piece(model, token, result, 8, 0, false)
+        let nTokens = llama_token_to_piece(vocab, token, result, 8, 0, false)

        if nTokens < 0 {
            let newResult = UnsafeMutablePointer<Int8>.allocate(capacity: Int(-nTokens))
@@ -324,7 +326,7 @@ actor LlamaContext {
            defer {
                newResult.deallocate()
            }
-            let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens, 0, false)
+            let nNewTokens = llama_token_to_piece(vocab, token, newResult, -nTokens, 0, false)
            let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens))
            return Array(bufferPointer)
        } else {
--- a/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj
+++ b/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj
@@ -7,7 +7,6 @@
 	objects = {

 /* Begin PBXBuildFile section */
-		1809696D2D05A39F00400EE8 /* llama in Frameworks */ = {isa = PBXBuildFile; productRef = 1809696C2D05A39F00400EE8 /* llama */; };
 		549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 549479CA2AC9E16000E0F78B /* Metal.framework */; };
 		79E1D9CD2B4CD16E005F8E46 /* InputButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 79E1D9CC2B4CD16E005F8E46 /* InputButton.swift */; };
 		7FA3D2B32B2EA2F600543F92 /* DownloadButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */; };
@@ -18,9 +17,25 @@
 		8A3F84242AC4C891005E2EE8 /* models in Resources */ = {isa = PBXBuildFile; fileRef = 8A3F84232AC4C891005E2EE8 /* models */; };
 		8A907F332AC7138A006146EA /* LibLlama.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A907F322AC7134E006146EA /* LibLlama.swift */; };
 		8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */; };
+		DD84C9FD2D747FED007778EC /* llama.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = DD84C9FC2D747FED007778EC /* llama.xcframework */; };
+		DD84C9FE2D747FED007778EC /* llama.xcframework in Embed Frameworks */ = {isa = PBXBuildFile; fileRef = DD84C9FC2D747FED007778EC /* llama.xcframework */; settings = {ATTRIBUTES = (CodeSignOnCopy, RemoveHeadersOnCopy, ); }; };
 		F1FE20E22B465ECA00B45541 /* LoadCustomButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = F1FE20E12B465EC900B45541 /* LoadCustomButton.swift */; };
 /* End PBXBuildFile section */

+/* Begin PBXCopyFilesBuildPhase section */
+		DD84C9FF2D747FED007778EC /* Embed Frameworks */ = {
+			isa = PBXCopyFilesBuildPhase;
+			buildActionMask = 2147483647;
+			dstPath = "";
+			dstSubfolderSpec = 10;
+			files = (
+				DD84C9FE2D747FED007778EC /* llama.xcframework in Embed Frameworks */,
+			);
+			name = "Embed Frameworks";
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXCopyFilesBuildPhase section */
+
 /* Begin PBXFileReference section */
 		549479CA2AC9E16000E0F78B /* Metal.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Metal.framework; path = System/Library/Frameworks/Metal.framework; sourceTree = SDKROOT; };
 		79E1D9CC2B4CD16E005F8E46 /* InputButton.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = InputButton.swift; sourceTree = "<group>"; };
@@ -33,6 +48,7 @@
 		8A3F84232AC4C891005E2EE8 /* models */ = {isa = PBXFileReference; lastKnownFileType = folder; name = models; path = llama.swiftui/Resources/models; sourceTree = "<group>"; };
 		8A907F322AC7134E006146EA /* LibLlama.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LibLlama.swift; sourceTree = "<group>"; };
 		8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaState.swift; sourceTree = "<group>"; };
+		DD84C9FC2D747FED007778EC /* llama.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = llama.xcframework; path = "../../build-apple/llama.xcframework"; sourceTree = "<group>"; };
 		DF2D2FE72B4A59BE00FCB72D /* llama.cpp */ = {isa = PBXFileReference; lastKnownFileType = wrapper; name = llama.cpp; path = ../..; sourceTree = "<group>"; };
 		F1FE20E12B465EC900B45541 /* LoadCustomButton.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LoadCustomButton.swift; sourceTree = "<group>"; };
 /* End PBXFileReference section */
@@ -42,9 +58,9 @@
 			isa = PBXFrameworksBuildPhase;
 			buildActionMask = 2147483647;
 			files = (
-				1809696D2D05A39F00400EE8 /* llama in Frameworks */,
 				549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */,
 				8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */,
+				DD84C9FD2D747FED007778EC /* llama.xcframework in Frameworks */,
 			);
 			runOnlyForDeploymentPostprocessing = 0;
 		};
@@ -86,6 +102,7 @@
 		8A39BE082AC7601000BFEB40 /* Frameworks */ = {
 			isa = PBXGroup;
 			children = (
+				DD84C9FC2D747FED007778EC /* llama.xcframework */,
 				549479CA2AC9E16000E0F78B /* Metal.framework */,
 				8A39BE092AC7601000BFEB40 /* Accelerate.framework */,
 			);
@@ -144,6 +161,7 @@
 				8A1C836F2AC328BD0096AF73 /* Sources */,
 				8A1C83702AC328BD0096AF73 /* Frameworks */,
 				8A1C83712AC328BD0096AF73 /* Resources */,
+				DD84C9FF2D747FED007778EC /* Embed Frameworks */,
 			);
 			buildRules = (
 			);
@@ -151,7 +169,6 @@
 			);
 			name = llama.swiftui;
 			packageProductDependencies = (
-				1809696C2D05A39F00400EE8 /* llama */,
 			);
 			productName = llama.swiftui;
 			productReference = 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */;
@@ -427,13 +444,6 @@
 			defaultConfigurationName = Release;
 		};
 /* End XCConfigurationList section */
-
-/* Begin XCSwiftPackageProductDependency section */
-		1809696C2D05A39F00400EE8 /* llama */ = {
-			isa = XCSwiftPackageProductDependency;
-			productName = llama;
-		};
-/* End XCSwiftPackageProductDependency section */
 	};
 	rootObject = 8A1C836B2AC328BD0096AF73 /* Project object */;
 }
--- a/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift
+++ b/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift
@@ -124,15 +124,26 @@ struct ContentView: View {
                    }
                }
            }.sheet(isPresented: $showingHelp) {    // Sheet for help modal
-                VStack(alignment: .leading) {
+                NavigationView {
                    VStack(alignment: .leading) {
-                        Text("1. Make sure the model is in GGUF Format")
-                               .padding()
-                        Text("2. Copy the download link of the quantized model")
-                               .padding()
+                        VStack(alignment: .leading) {
+                            Text("1. Make sure the model is in GGUF Format")
+                                    .padding()
+                            Text("2. Copy the download link of the quantized model")
+                                    .padding()
+                        }
+                        Spacer()
                    }
-                    Spacer()
-                   }
+                    .navigationTitle("Help")
+                    .navigationBarTitleDisplayMode(.inline)
+                    .toolbar {
+                        ToolbarItem(placement: .navigationBarTrailing) {
+                            Button("Done") {
+                                showingHelp = false
+                            }
+                        }
+                    }
+                }
            }
        }
    }
--- a/examples/llama.vim
+++ b/examples/llama.vim
@@ -39,7 +39,7 @@
 "
 "   :call llama#init()
 "
-" more info: https://github.com/ggerganov/llama.cpp/pull/9787
+" more info: https://github.com/ggml-org/llama.cpp/pull/9787
 "

 " colors (adjust to your liking)
--- a/examples/llava/CMakeLists.txt
+++ b/examples/llava/CMakeLists.txt
@@ -50,3 +50,17 @@ set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-qwen2vl-cli)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
+set(TARGET llama-gemma3-cli)
+add_executable(${TARGET} gemma3-cli.cpp)
+set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-gemma3-cli)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
+set(TARGET llama-llava-clip-quantize-cli)
+add_executable(${TARGET} clip-quantize-cli.cpp)
+set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-clip-quantize-cli)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
--- a/examples/llava/README-gemma3.md
+++ b/examples/llava/README-gemma3.md
@@ -0,0 +1,30 @@
+# Gemma 3 vision
+
+> [!IMPORTANT]
+>
+> This is very experimental, only used for demo purpose.
+
+## How to get mmproj.gguf?
+
+```bash
+cd gemma-3-4b-it
+python ../llama.cpp/examples/llava/gemma3_convert_encoder_to_gguf.py .
+
+# output file is mmproj.gguf
+```
+
+## How to run it?
+
+What you need:
+- The text model GGUF, can be converted using `convert_hf_to_gguf.py`
+- The mmproj file from step above
+- An image file
+
+```bash
+# build
+cmake -B build
+cmake --build build --target llama-gemma3-cli
+
+# run it
+./build/bin/llama-gemma3-cli -m {text_model}.gguf --mmproj mmproj.gguf --image your_image.jpg
+```
--- a/examples/llava/README-glmedge.md
+++ b/examples/llava/README-glmedge.md
@@ -0,0 +1,43 @@
+# GLMV-EDGE
+
+Currently this implementation supports [glm-edge-v-2b](https://huggingface.co/THUDM/glm-edge-v-2b) and [glm-edge-v-5b](https://huggingface.co/THUDM/glm-edge-v-5b).
+
+## Usage
+Build with cmake or run `make llama-llava-cli` to build it.
+
+After building, run: `./llama-llava-cli` to see the usage. For example:
+
+```sh
+./llama-llava-cli -m model_path/ggml-model-f16.gguf --mmproj model_path/mmproj-model-f16.gguf --image img_path/image.jpg -p "<|system|>\n system prompt <image><|user|>\n prompt <|assistant|>\n"
+```
+
+**note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so.
+**note**: For GPU offloading ensure to use the `-ngl` flag just like usual
+
+## GGUF conversion
+
+1. Clone a GLMV-EDGE model ([2B](https://huggingface.co/THUDM/glm-edge-v-2b) or [5B](https://huggingface.co/THUDM/glm-edge-v-5b)). For example:
+
+```sh
+git clone https://huggingface.co/THUDM/glm-edge-v-5b or https://huggingface.co/THUDM/glm-edge-v-2b
+```
+
+2. Use `glmedge-surgery.py` to split the GLMV-EDGE model to LLM and multimodel projector constituents:
+
+```sh
+python ./examples/llava/glmedge-surgery.py -m ../model_path
+```
+
+4. Use `glmedge-convert-image-encoder-to-gguf.py` to convert the GLMV-EDGE image encoder to GGUF:
+
+```sh
+python ./examples/llava/glmedge-convert-image-encoder-to-gguf.py -m ../model_path --llava-projector ../model_path/glm.projector --output-dir ../model_path
+```
+
+5. Use `examples/convert_hf_to_gguf.py` to convert the LLM part of GLMV-EDGE to GGUF:
+
+```sh
+python convert_hf_to_gguf.py ../model_path
+```
+
+Now both the LLM part and the image encoder are in the `model_path` directory.
--- a/examples/llava/README-granitevision.md
+++ b/examples/llava/README-granitevision.md
@@ -0,0 +1,190 @@
+# Granite Vision
+
+Download the model and point your `GRANITE_MODEL` environment variable to the path.
+
+```bash
+$ git clone https://huggingface.co/ibm-granite/granite-vision-3.2-2b
+$ export GRANITE_MODEL=./granite-vision-3.2-2b
+```
+
+
+### 1. Running llava surgery v2.
+First, we need to run the llava surgery script as shown below:
+
+`python llava_surgery_v2.py -C -m $GRANITE_MODEL`
+
+You should see two new files (`llava.clip` and `llava.projector`) written into your model's directory, as shown below.
+
+```bash
+$ ls $GRANITE_MODEL | grep -i llava
+llava.clip
+llava.projector
+```
+
+We should see that the projector and visual encoder get split out into the llava files. Quick check to make sure they aren't empty:
+```python
+import os
+import torch
+
+MODEL_PATH = os.getenv("GRANITE_MODEL")
+if not MODEL_PATH:
+    raise ValueError("env var GRANITE_MODEL is unset!")
+
+encoder_tensors = torch.load(os.path.join(MODEL_PATH, "llava.clip"))
+projector_tensors = torch.load(os.path.join(MODEL_PATH, "llava.projector"))
+
+assert len(encoder_tensors) > 0
+assert len(projector_tensors) > 0
+```
+
+If you actually inspect the `.keys()` of the loaded tensors, you should see a lot of `vision_model` tensors in the `encoder_tensors`, and 5 tensors (`'multi_modal_projector.linear_1.bias'`, `'multi_modal_projector.linear_1.weight'`, `'multi_modal_projector.linear_2.bias'`, `'multi_modal_projector.linear_2.weight'`, `'image_newline'`) in the multimodal `projector_tensors`.
+
+
+### 2. Creating the Visual Component GGUF
+Next, create a new directory to hold the visual components, and copy the llava.clip/projector files, as shown below.
+
+```bash
+$ ENCODER_PATH=$PWD/visual_encoder
+$ mkdir $ENCODER_PATH
+
+$ cp $GRANITE_MODEL/llava.clip $ENCODER_PATH/pytorch_model.bin
+$ cp $GRANITE_MODEL/llava.projector $ENCODER_PATH/
+```
+
+Now, we need to write a config for the visual encoder. In order to convert the model, be sure to use the correct `image_grid_pinpoints`, as these may vary based on the model. You can find the `image_grid_pinpoints` in `$GRANITE_MODEL/config.json`.
+
+```json
+{
+    "_name_or_path": "siglip-model",
+    "architectures": [
+      "SiglipVisionModel"
+    ],
+    "image_grid_pinpoints": [
+        [384,384],
+        [384,768],
+        [384,1152],
+        [384,1536],
+        [384,1920],
+        [384,2304],
+        [384,2688],
+        [384,3072],
+        [384,3456],
+        [384,3840],
+        [768,384],
+        [768,768],
+        [768,1152],
+        [768,1536],
+        [768,1920],
+        [1152,384],
+        [1152,768],
+        [1152,1152],
+        [1536,384],
+        [1536,768],
+        [1920,384],
+        [1920,768],
+        [2304,384],
+        [2688,384],
+        [3072,384],
+        [3456,384],
+        [3840,384]
+    ],
+    "mm_patch_merge_type": "spatial_unpad",
+    "hidden_size": 1152,
+    "image_size": 384,
+    "intermediate_size": 4304,
+    "model_type": "siglip_vision_model",
+    "num_attention_heads": 16,
+    "num_hidden_layers": 27,
+    "patch_size": 14,
+    "layer_norm_eps": 1e-6,
+    "hidden_act": "gelu_pytorch_tanh",
+    "projection_dim": 0,
+    "vision_feature_layer": [-24, -20, -12, -1]
+}
+```
+
+At this point you should have something like this:
+```bash
+$ ls $ENCODER_PATH
+config.json             llava.projector         pytorch_model.bin
+```
+
+Now convert the components to GGUF; Note that we also override the image mean/std dev to `[.5,.5,.5]` since we use the SigLIP visual encoder - in the transformers model, you can find these numbers in the `preprocessor_config.json`.
+```bash
+$ python convert_image_encoder_to_gguf.py \
+    -m $ENCODER_PATH \
+    --llava-projector $ENCODER_PATH/llava.projector \
+    --output-dir $ENCODER_PATH \
+    --clip-model-is-vision \
+    --clip-model-is-siglip \
+    --image-mean 0.5 0.5 0.5 \
+    --image-std 0.5 0.5 0.5
+```
+
+This will create the first GGUF file at `$ENCODER_PATH/mmproj-model-f16.gguf`; we will refer to the absolute path of this file as the `$VISUAL_GGUF_PATH.`
+
+
+### 3. Creating the LLM GGUF.
+The granite vision model contains a granite LLM as its language model. For now, the easiest way to get the GGUF for LLM is by loading the composite model in `transformers` and exporting the LLM so that it can be directly converted with the normal conversion path.
+
+First, set the `LLM_EXPORT_PATH` to the path to export the `transformers` LLM to.
+```bash
+$ export LLM_EXPORT_PATH=$PWD/granite_vision_llm
+```
+
+```python
+import os
+import transformers
+
+MODEL_PATH = os.getenv("GRANITE_MODEL")
+if not MODEL_PATH:
+    raise ValueError("env var GRANITE_MODEL is unset!")
+
+LLM_EXPORT_PATH = os.getenv("LLM_EXPORT_PATH")
+if not LLM_EXPORT_PATH:
+    raise ValueError("env var LLM_EXPORT_PATH is unset!")
+
+tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_PATH)
+
+# NOTE: granite vision support was added to transformers very recently (4.49);
+# if you get size mismatches, your version is too old.
+# If you are running with an older version, set `ignore_mismatched_sizes=True`
+# as shown below; it won't be loaded correctly, but the LLM part of the model that
+# we are exporting will be loaded correctly.
+model = transformers.AutoModelForImageTextToText.from_pretrained(MODEL_PATH, ignore_mismatched_sizes=True)
+
+tokenizer.save_pretrained(LLM_EXPORT_PATH)
+model.language_model.save_pretrained(LLM_EXPORT_PATH)
+```
+
+Now you can convert the exported LLM to GGUF with the normal converter in the root of the llama cpp project.
+```bash
+$ LLM_GGUF_PATH=$LLM_EXPORT_PATH/granite_llm.gguf
+...
+$ python convert_hf_to_gguf.py --outfile $LLM_GGUF_PATH $LLM_EXPORT_PATH
+```
+
+
+### 4. Quantization
+If you want to quantize the LLM, you can do so with `llama-quantize` as you would any other LLM. For example:
+```bash
+$ ./build/bin/llama-quantize $LLM_EXPORT_PATH/granite_llm.gguf $LLM_EXPORT_PATH/granite_llm_q4_k_m.gguf Q4_K_M
+$ LLM_GGUF_PATH=$LLM_EXPORT_PATH/granite_llm_q4_k_m.gguf
+```
+
+Note that currently you cannot quantize the visual encoder because granite vision models use SigLIP as the visual encoder, which has tensor dimensions that are not divisible by 32.
+
+
+### 5. Running the Model in Llama cpp
+Build llama cpp normally; you should have a target binary named `llama-llava-cli`, which you can pass two binaries to. As an example, we pass the the llama.cpp banner.
+
+```bash
+$ ./build/bin/llama-llava-cli -m $LLM_GGUF_PATH \
+    --mmproj $VISUAL_GGUF_PATH \
+    --image ./media/llama0-banner.png \
+    -c 16384 \
+    -p "<|system|>\nA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n<|user|>\n\<image>\nWhat does the text in this image say?\n<|assistant|>\n" \
+    --temp 0
+```
+
+Sample output: `The text in the image reads "LLAMA C++ Can it run DOOM Llama?"`
--- a/examples/llava/README-minicpmo2.6.md
+++ b/examples/llava/README-minicpmo2.6.md
@@ -0,0 +1,48 @@
+## MiniCPM-o 2.6
+Currently, this readme only supports minicpm-omni's image capabilities, and we will update the full-mode support as soon as possible.
+
+### Prepare models and code
+
+Download [MiniCPM-o-2_6](https://huggingface.co/openbmb/MiniCPM-o-2_6) PyTorch model from huggingface to "MiniCPM-o-2_6" folder.
+
+
+### Build llama.cpp
+Readme modification time: 20250206
+
+If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
+
+Clone llama.cpp:
+```bash
+git clone https://github.com/ggerganov/llama.cpp
+cd llama.cpp
+```
+
+Build llama.cpp using `CMake`:
+```bash
+cmake -B build
+cmake --build build --config Release
+```
+
+
+### Usage of MiniCPM-o 2.6
+
+Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf) by us)
+
+```bash
+python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-o-2_6
+python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-o-2_6 --minicpmv-projector ../MiniCPM-o-2_6/minicpmv.projector --output-dir ../MiniCPM-o-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 4
+python ./convert_hf_to_gguf.py ../MiniCPM-o-2_6/model
+
+# quantize int4 version
+./build/bin/llama-quantize ../MiniCPM-o-2_6/model/ggml-model-f16.gguf ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M
+```
+
+
+Inference on Linux or Mac
+```bash
+# run f16 version
+./build/bin/llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
+
+# run quantized int4 version
+./build/bin/llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg  -p "What is in the image?"
+```
--- a/examples/llava/README-minicpmv2.5.md
+++ b/examples/llava/README-minicpmv2.5.md
@@ -4,13 +4,26 @@

 Download [MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5) PyTorch model from huggingface to "MiniCPM-Llama3-V-2_5" folder.

+
+### Build llama.cpp
+Readme modification time: 20250206
+
+If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
+
 Clone llama.cpp:
 ```bash
-git clone https://github.com/ggerganov/llama.cpp
+git clone https://github.com/ggml-org/llama.cpp
 cd llama.cpp
 ```

-### Usage
+Build llama.cpp using `CMake`:
+```bash
+cmake -B build
+cmake --build build --config Release
+```
+
+
+### Usage of MiniCPM-Llama3-V 2.5

 Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) by us)

@@ -20,80 +33,15 @@ python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-
 python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model

 # quantize int4 version
-./llama-quantize ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf Q4_K_M
+./build/bin/llama-quantize ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf Q4_K_M
 ```

-Build for Linux or Mac
-
-```bash
-make
-make llama-minicpmv-cli
-```

 Inference on Linux or Mac
-```
+```bash
 # run f16 version
-./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
+./build/bin/llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"

 # run quantized int4 version
-./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg  -p "What is in the image?"
-
-# or run in interactive mode
-./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i
-```
-
-### Android
-
-#### Build on Android device using Termux
-We found that build on Android device would bring better runtime performance, so we recommend to build on device.
-
-[Termux](https://github.com/termux/termux-app#installation) is a terminal app on Android device (no root required).
-
-Install tools in Termux:
-```
-apt update && apt upgrade -y
-apt install git make cmake
-```
-
-It's recommended to move your model inside the `~/` directory for best performance:
-```
-cd storage/downloads
-mv model.gguf ~/
-```
-
-#### Building the Project using Android NDK
-Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.
-
-Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux:
-
-```bash
-mkdir build-android
-cd build-android
-export NDK=/your_ndk_path
-cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
-make
-```
-
-Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice).
-
-Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission:
-
-(Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`)
-```
-$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/
-$cd /data/data/com.termux/files/home/bin
-$chmod +x ./*
-```
-
-Download models and push them to `/sdcard/llama.cpp/`, then move it to `/data/data/com.termux/files/home/model/`
-
-```
-$mv /sdcard/llama.cpp/ggml-model-Q4_K_M.gguf /data/data/com.termux/files/home/model/
-$mv /sdcard/llama.cpp/mmproj-model-f16.gguf /data/data/com.termux/files/home/model/
-```
-
-Now, you can start chatting:
-```
-$cd /data/data/com.termux/files/home/bin
-$./llama-minicpmv-cli -m ../model/ggml-model-Q4_K_M.gguf --mmproj ../model/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg  -p "What is in the image?"
+./build/bin/llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg  -p "What is in the image?"
 ```
--- a/examples/llava/README-minicpmv2.6.md
+++ b/examples/llava/README-minicpmv2.6.md
@@ -4,13 +4,25 @@

 Download [MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6) PyTorch model from huggingface to "MiniCPM-V-2_6" folder.

+
+### Build llama.cpp
+Readme modification time: 20250206
+
+If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
+
 Clone llama.cpp:
 ```bash
-git clone git@github.com:OpenBMB/llama.cpp.git
+git clone https://github.com/ggerganov/llama.cpp
 cd llama.cpp
-git checkout minicpmv-main
 ```

+Build llama.cpp using `CMake`:
+```bash
+cmake -B build
+cmake --build build --config Release
+```
+
+
 ### Usage of MiniCPM-V 2.6

 Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) by us)
@@ -21,87 +33,15 @@ python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-
 python ./convert_hf_to_gguf.py ../MiniCPM-V-2_6/model

 # quantize int4 version
-./llama-quantize ../MiniCPM-V-2_6/model/ggml-model-f16.gguf ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M
+./build/bin/llama-quantize ../MiniCPM-V-2_6/model/ggml-model-f16.gguf ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M
 ```

-Build for Linux or Mac
-
-```bash
-make
-make llama-minicpmv-cli
-```

 Inference on Linux or Mac
-```
+```bash
 # run f16 version
-./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
+./build/bin/llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"

 # run quantized int4 version
-./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg  -p "What is in the image?"
-
-# or run in interactive mode
-./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i
-```
-
-### Video
-Install FFmpeg
-```
-brew install ffmpeg
-brew install pkg-config
-```
-
-### Android
-
-#### Build on Android device using Termux
-We found that build on Android device would bring better runtime performance, so we recommend to build on device.
-
-[Termux](https://github.com/termux/termux-app#installation) is a terminal app on Android device (no root required).
-
-Install tools in Termux:
-```
-apt update && apt upgrade -y
-apt install git make cmake
-```
-
-It's recommended to move your model inside the `~/` directory for best performance:
-```
-cd storage/downloads
-mv model.gguf ~/
-```
-
-#### Building the Project using Android NDK
-Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.
-
-Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux:
-
-```bash
-mkdir build-android
-cd build-android
-export NDK=/your_ndk_path
-cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
-make
-```
-
-Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice).
-
-Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission:
-
-(Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`)
-```
-$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/
-$cd /data/data/com.termux/files/home/bin
-$chmod +x ./*
-```
-
-Download models and push them to `/sdcard/llama.cpp/`, then move it to `/data/data/com.termux/files/home/model/`
-
-```
-$mv /sdcard/llama.cpp/ggml-model-Q4_K_M.gguf /data/data/com.termux/files/home/model/
-$mv /sdcard/llama.cpp/mmproj-model-f16.gguf /data/data/com.termux/files/home/model/
-```
-
-Now, you can start chatting:
-```
-$cd /data/data/com.termux/files/home/bin
-$./llama-minicpmv-cli -m ../model/ggml-model-Q4_K_M.gguf --mmproj ../model/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg  -p "What is in the image?"
+./build/bin/llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg  -p "What is in the image?"
 ```
--- a/examples/llava/README-quantize.md
+++ b/examples/llava/README-quantize.md
@@ -0,0 +1,44 @@
+# Quantizing CLIP Visual Projector
+
+This is the tool for quantizing the CLIP visual projector model. Quantization reduces the precision of the model's weights, which can significantly decrease the model size and improve inference speed, often with minimal impact on performance.
+
+## Usage
+
+To quantize a CLIP visual projector model, use the following command:
+
+```sh
+./bin/llama-llava-clip-quantize-cli /path/to/ggml-model-f32.gguf /path/to/ggml-model-quantized.gguf <type>
+```
+
+After the quantization, the visual projector can be used freely with the existing LLAVA cli (LLAVA, Qwen2VL, etc).
+
+### Arguments
+
+- `/path/to/ggml-model-f32.gguf`: The path to the input model file in FP32 or FP16 format.
+- `/path/to/ggml-model-quantized.gguf`: The path where the quantized model will be saved.
+- `<type>`: The quantization type to apply. This should be an integer corresponding to one of the quantization types defined in the `enum ggml_type`.
+
+### Quantization Types
+
+The following quantization types are supported, based on the `enum ggml_type` definition:
+
+- `2` - `q4_0`: 4-bit quantization with a single scale value.
+- `3` - `q4_1`: 4-bit quantization with a separate scale value for each block.
+- `6` - `q5_0`: 5-bit quantization with a single scale value.
+- `7` - `q5_1`: 5-bit quantization with a separate scale value for each block.
+- `8` - `q8_0`: 8-bit quantization with a single scale value.
+
+### Example
+
+To quantize a model using the `q4_0` quantization type, you would run:
+
+```sh
+./bin/llama-llava-clip-quantize-cli /path/to/ggml-model-f32.gguf /path/to/ggml-model-quantized.gguf 2
+```
+
+This command will generate a quantized model at `/path/to/ggml-model-quantized.gguf` using the `q4_0` quantization method.
+
+## Notes
+
+- Quantization can lead to a loss in model accuracy, depending on the chosen quantization type. It is recommended to evaluate the quantized model's performance on your specific task to ensure it meets your requirements.
+- The quantized model will typically be smaller in size and faster to run, making it more suitable for deployment in resource-constrained environments.
--- a/examples/llava/README.md
+++ b/examples/llava/README.md
@@ -101,8 +101,27 @@ python ./examples/convert_legacy_llama.py ../llava-v1.6-vicuna-7b/ --skip-unknow
 ```

 **note** llava-1.6 needs more context than llava-1.5, at least 3000 is needed (just run it at -c 4096)
+
 **note** llava-1.6 greatly benefits from batched prompt processing (defaults work)

+**note** if the language model in step `6)` is incompatible with the legacy conversion script, the easiest way handle the LLM model conversion is to load the model in transformers, and export only the LLM from the llava next model.
+
+```python
+import os
+import transformers
+
+model_path = ...
+llm_export_path = ...
+
+tokenizer = transformers.AutoTokenizer.from_pretrained(model_path)
+model = transformers.AutoModelForImageTextToText.from_pretrained(model_path)
+
+tokenizer.save_pretrained(llm_export_path)
+model.language_model.save_pretrained(llm_export_path)
+```
+
+Then, you can convert the LLM using the `convert_hf_to_gguf.py` script, which handles more LLM architectures.
+
 ## llava-cli templating and llava-1.6 prompting

 llava-1.5 models all use the same vicuna prompt, here you can just add your image question like `-p "Provide a full description."`
--- a/examples/llava/clip-quantize-cli.cpp
+++ b/examples/llava/clip-quantize-cli.cpp
@@ -0,0 +1,59 @@
+#include "arg.h"
+#include "base64.hpp"
+#include "log.h"
+#include "common.h"
+#include "sampling.h"
+#include "clip.h"
+#include "llava.h"
+#include "llama.h"
+#include "ggml.h"
+
+static void print_usage(int argc, char ** argv) {
+    (void) argc;
+
+    fprintf(stderr, "usage: %s /path/to/ggml-model-f32.gguf /path/to/ggml-model-quantized.gguf type\n", argv[0]);
+    fprintf(stderr, "  type = 2 - q4_0\n");
+    fprintf(stderr, "  type = 3 - q4_1\n");
+    fprintf(stderr, "  type = 6 - q5_0\n");
+    fprintf(stderr, "  type = 7 - q5_1\n");
+    fprintf(stderr, "  type = 8 - q8_0\n");
+}
+
+int main(int argc, char ** argv) {
+    if (argc != 4) {
+        print_usage(argc, argv);
+        return 1;
+    }
+
+    const std::string fname_inp = argv[1];
+    const std::string fname_out = argv[2];
+
+    const int itype = atoi(argv[3]);
+
+    const int64_t t_main_start_us = ggml_time_us();
+
+    int64_t t_quantize_us = 0;
+
+    // load the model
+    {
+        const int64_t t_start_us = ggml_time_us();
+
+        if (!clip_model_quantize(fname_inp.c_str(), fname_out.c_str(), itype)) {
+            fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
+            return 1;
+        }
+
+        t_quantize_us = ggml_time_us() - t_start_us;
+    }
+
+    // report timing
+    {
+        const int64_t t_main_end_us = ggml_time_us();
+
+        printf("\n");
+        printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us / 1000.0f);
+        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us) / 1000.0f);
+    }
+
+    return 0;
+}
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
--- a/examples/llava/clip.h
+++ b/examples/llava/clip.h
@@ -39,8 +39,15 @@ struct clip_image_f32_batch {
    size_t size;
 };

-CLIP_API struct clip_ctx * clip_model_load    (const char * fname, int verbosity);
-CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity);
+struct clip_context_params {
+    bool use_gpu;
+    int verbosity;
+};
+
+// deprecated, use clip_init
+CLIP_API struct clip_ctx * clip_model_load(const char * fname, int verbosity);
+
+CLIP_API struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params);

 CLIP_API void clip_free(struct clip_ctx * ctx);

@@ -55,6 +62,7 @@ CLIP_API int32_t clip_hidden_size(const struct clip_ctx * ctx);
 CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);

 CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
+CLIP_API size_t get_clip_image_grid_size(const struct clip_ctx * ctx);

 CLIP_API int clip_n_patches        (const struct clip_ctx * ctx);
 CLIP_API int clip_n_patches_by_img (const struct clip_ctx * ctx, struct clip_image_f32 * img);
@@ -73,6 +81,12 @@ CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
 CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch  * batch);
 CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);

+/**
+ * Build image from pixels decoded by other libraries instead of stb_image.h for better performance.
+ * The memory layout is RGBRGBRGB..., input buffer length must be 3*nx*ny bytes
+ */
+CLIP_API void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img);
+
 CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);

 /** interpret bytes as an image file with length bytes_length, and use the result to populate img */
@@ -89,10 +103,14 @@ CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, cons
 CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);

 CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
+CLIP_API bool clip_is_glm(const struct clip_ctx * ctx);
 CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);

+CLIP_API int get_deepest_feature_layer(const struct clip_ctx * ctx);
+
 CLIP_API bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);

+
 #ifdef __cplusplus
 }
 #endif
--- a/examples/llava/convert_image_encoder_to_gguf.py
+++ b/examples/llava/convert_image_encoder_to_gguf.py
@@ -6,7 +6,7 @@ import re
 import torch
 import numpy as np
 from gguf import *
-from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel
+from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel, SiglipVisionModel

 TEXT = "clip.text"
 VISION = "clip.vision"
@@ -37,6 +37,18 @@ def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_llava: b


 def get_tensor_name(name: str) -> str:
+    # Standardize the transformers llava next keys for
+    # image newline / mm projector with the classes in haotian-liu LLaVA
+    if name == "image_newline":
+        return "model.image_newline"
+    if name.startswith("multi_modal_projector"):
+        name = name.replace("multi_modal_projector", "mm")
+        if "linear_1" in name:
+            name = name.replace("linear_1", "0")
+        if "linear_2" in name:
+            name = name.replace("linear_2", "2")
+        return name
+
    if "projection" in name:
        return name
    if "mm_projector" in name:
@@ -77,14 +89,21 @@ def bytes_to_unicode():
 ap = argparse.ArgumentParser()
 ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True)
 ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16")
+ap.add_argument('--bigendian', action="store_true", default=False, help="Model is executed on big-endian machine")
 ap.add_argument("--text-only", action="store_true", required=False,
                help="Save a text-only model. It can't be used to encode images")
 ap.add_argument("--vision-only", action="store_true", required=False,
                help="Save a vision-only model. It can't be used to encode texts")
 ap.add_argument("--clip-model-is-vision", action="store_true", required=False,
                help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
-ap.add_argument("--clip-model-is-openclip", action="store_true", required=False,
+
+# Selectable visual encoders that are compatible with this script
+encoder_group = ap.add_mutually_exclusive_group()
+encoder_group.add_argument("--clip-model-is-openclip", action="store_true", required=False,
                help="The clip model is from openclip (for ViT-SO400M type))")
+encoder_group.add_argument("--clip-model-is-siglip", action="store_true", required=False,
+                help="the visual encoder is Siglip.")
+
 ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
 ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2"], default="mlp")
 ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
@@ -109,7 +128,12 @@ if args.use_f32:
 # output in the same directory as the model if output_dir is None
 dir_model = args.model_dir

-if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip:
+if (
+    args.clip_model_is_vision or
+    not os.path.exists(dir_model + "/vocab.json") or
+    args.clip_model_is_openclip or
+    args.clip_model_is_siglip
+):
    vocab = None
    tokens = None
 else:
@@ -137,7 +161,10 @@ ftype = 1
 if args.use_f32:
    ftype = 0

-if args.clip_model_is_vision or args.clip_model_is_openclip:
+if args.clip_model_is_siglip:
+    model = SiglipVisionModel.from_pretrained(dir_model)
+    processor = None
+elif args.clip_model_is_vision or args.clip_model_is_openclip:
    model = CLIPVisionModel.from_pretrained(dir_model)
    processor = None
 else:
@@ -165,7 +192,7 @@ output_dir = args.output_dir if args.output_dir is not None else dir_model
 os.makedirs(output_dir, exist_ok=True)
 output_prefix = os.path.basename(output_dir).replace("ggml_", "")
 fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf")
-fout = GGUFWriter(path=fname_out, arch="clip")
+fout = GGUFWriter(path=fname_out, arch="clip", endianess=GGUFEndian.LITTLE if not args.bigendian else GGUFEndian.BIG)

 fout.add_bool("clip.has_text_encoder", has_text_encoder)
 fout.add_bool("clip.has_vision_encoder", has_vision_encoder)
@@ -187,26 +214,71 @@ else:
 if has_text_encoder:
    assert t_hparams is not None
    assert tokens is not None
+    if args.clip_model_is_siglip:
+        text_projection_dim = 0
+    else:
+        text_projection_dim = t_hparams.get("projection_dim", config["projection_dim"])
    # text_model hparams
    fout.add_uint32(k(KEY_CONTEXT_LENGTH, TEXT), t_hparams["max_position_embeddings"])
    fout.add_uint32(k(KEY_EMBEDDING_LENGTH, TEXT), t_hparams["hidden_size"])
    fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, TEXT), t_hparams["intermediate_size"])
-    fout.add_uint32("clip.text.projection_dim", t_hparams.get("projection_dim", config["projection_dim"]))
+    fout.add_uint32("clip.text.projection_dim", text_projection_dim)
    fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, TEXT), t_hparams["num_attention_heads"])
    fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, TEXT), t_hparams["layer_norm_eps"])
    fout.add_uint32(k(KEY_BLOCK_COUNT, TEXT), t_hparams["num_hidden_layers"])
    fout.add_token_list(tokens)

+
+
+def get_non_negative_vision_feature_layers(v_hparams):
+    """
+    Determine the vision feature layer(s) for the llava model, which are indices into the
+    hidden states of the visual encoder. Note that the hidden states array generally takes the
+    form:
+
+        [<emb input>, <output of enc block 0>, ... <output of enc block num_hidden_layers>]
+
+    so feature indices should be offset as n+1 to get the output of encoder block n.
+    We convert all vision feature layers to non-negative so that -1 can be used in
+    the model as an unset value. If no vision feature layer is found, we leave it unset.
+    """
+    num_hidden_layers = v_hparams["num_hidden_layers"]
+    to_non_negative = lambda layer_idx: layer_idx  if layer_idx >= 0 else num_hidden_layers + layer_idx + 1
+    feature_layers_key = None
+    # Key used for llava models in transformers
+    if "vision_feature_layer" in config:
+        feature_layers_key = "vision_feature_layer"
+    # Key used for llava models in the original format
+    elif "mm_vision_select_layer" in config:
+        feature_layers_key = "mm_vision_select_layer"
+    if feature_layers_key is not None:
+        feature_layers = config[feature_layers_key]
+        if isinstance(feature_layers, int):
+            feature_layers = [feature_layers]
+        return [to_non_negative(feature_layer) for feature_layer in feature_layers]
+
+# Determine if we have explicitly specified vision feature layers in our config
+feature_layers = get_non_negative_vision_feature_layers(v_hparams)
+
 if has_vision_encoder:
-    # vision_model hparams
+    # Siglip does not have a visual projector; set projection dim to 0
+    if args.clip_model_is_siglip:
+        visual_projection_dim = 0
+    else:
+        visual_projection_dim = v_hparams.get("projection_dim", config["projection_dim"])
+
+    # set vision_model hparams
    fout.add_uint32("clip.vision.image_size", v_hparams["image_size"])
    fout.add_uint32("clip.vision.patch_size", v_hparams["patch_size"])
    fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), v_hparams["hidden_size"])
    fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), v_hparams["intermediate_size"])
-    fout.add_uint32("clip.vision.projection_dim", v_hparams.get("projection_dim", config["projection_dim"]))
+    fout.add_uint32("clip.vision.projection_dim", visual_projection_dim)
    fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), v_hparams["num_attention_heads"])
    fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), v_hparams["layer_norm_eps"])
-    block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"]
+    if feature_layers:
+        block_count = max(feature_layers)
+    else:
+        block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"]
    fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count)
                            #     /**
                            #      "image_grid_pinpoints": [
@@ -258,7 +330,8 @@ if has_vision_encoder:
        fout.add_string("clip.vision.mm_patch_merge_type", v_hparams["mm_patch_merge_type"])
    if "mm_projector_type" in v_hparams:
        fout.add_string("clip.vision.mm_projector_type", v_hparams["mm_projector_type"])
-
+    if feature_layers:
+        fout.add_array("clip.vision.feature_layer", feature_layers)

    if processor is not None:
        image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean  # pyright: ignore[reportAttributeAccessIssue]
@@ -274,7 +347,13 @@ fout.add_bool("clip.use_gelu", use_gelu)


 if has_llava_projector:
-    model.vision_model.encoder.layers.pop(-1)
+    # By default, we drop the last layer for llava projector
+    # models unless we have explicitly set vision feature layers
+    if feature_layers is None:
+        model.vision_model.encoder.layers.pop(-1)
+    else:
+        model.vision_model.encoder.layers = model.vision_model.encoder.layers[:max(feature_layers)]
+
    projector = torch.load(args.llava_projector)
    for name, data in projector.items():
        name = get_tensor_name(name)
--- a/Show More
+++ b/Show More