Vulkan: Add device architecture enum and logic to recognize AMD generations

vulkan: subgroup size test
2026-04-23 16:37:33 +03:00 · 2025-03-08 08:04:45 +00:00 · 2025-02-26 15:44:42 +01:00
281 changed files with 6422 additions and 17745 deletions
--- a/.devops/cuda.Dockerfile
+++ b/.devops/cuda.Dockerfile
@@ -1,6 +1,6 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
-ARG CUDA_VERSION=12.4.0
+ARG CUDA_VERSION=12.6.0
 # Target the CUDA build image
 ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

--- a/.devops/llama-cpp-cuda.srpm.spec
+++ b/.devops/llama-cpp-cuda.srpm.spec
@@ -17,10 +17,10 @@ Version:        %( date "+%%Y%%m%%d" )
 Release:        1%{?dist}
 Summary:        CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
 License:        MIT
-Source0:        https://github.com/ggml-org/llama.cpp/archive/refs/heads/master.tar.gz
+Source0:        https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
 BuildRequires:  coreutils make gcc-c++ git cuda-toolkit
 Requires:       cuda-toolkit
-URL:            https://github.com/ggml-org/llama.cpp
+URL:            https://github.com/ggerganov/llama.cpp

 %define debug_package %{nil}
 %define source_date_epoch_from_changelog 0
--- a/.devops/llama-cpp.srpm.spec
+++ b/.devops/llama-cpp.srpm.spec
@@ -18,10 +18,10 @@ Version:        %( date "+%%Y%%m%%d" )
 Release:        1%{?dist}
 Summary:        CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
 License:        MIT
-Source0:        https://github.com/ggml-org/llama.cpp/archive/refs/heads/master.tar.gz
+Source0:        https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
 BuildRequires:  coreutils make gcc-c++ git libstdc++-devel
 Requires:       libstdc++
-URL:            https://github.com/ggml-org/llama.cpp
+URL:            https://github.com/ggerganov/llama.cpp

 %define debug_package %{nil}
 %define source_date_epoch_from_changelog 0
--- a/.devops/musa.Dockerfile
+++ b/.devops/musa.Dockerfile
@@ -1,6 +1,6 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
-ARG MUSA_VERSION=rc3.1.1
+ARG MUSA_VERSION=rc3.1.0
 # Target the MUSA build image
 ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@@ -133,12 +133,12 @@ effectiveStdenv.mkDerivation (finalAttrs: {
      --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
  '';

-  # With PR#6015 https://github.com/ggml-org/llama.cpp/pull/6015,
+  # With PR#6015 https://github.com/ggerganov/llama.cpp/pull/6015,
  # `default.metallib` may be compiled with Metal compiler from XCode
  # and we need to escape sandbox on MacOS to access Metal compiler.
  # `xcrun` is used find the path of the Metal compiler, which is varible
  # and not on $PATH
-  # see https://github.com/ggml-org/llama.cpp/pull/6118 for discussion
+  # see https://github.com/ggerganov/llama.cpp/pull/6118 for discussion
  __noChroot = effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders;

  nativeBuildInputs =
@@ -220,7 +220,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
    broken = (useMetalKit && !effectiveStdenv.isDarwin);

    description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
-    homepage = "https://github.com/ggml-org/llama.cpp/";
+    homepage = "https://github.com/ggerganov/llama.cpp/";
    license = lib.licenses.mit;

    # Accommodates `nix run` and `lib.getExe`
--- a/.devops/rocm.Dockerfile
+++ b/.devops/rocm.Dockerfile
@@ -11,7 +11,7 @@ ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-co
 FROM ${BASE_ROCM_DEV_CONTAINER} AS build

 # Unless otherwise specified, we make a fat build.
-# List from https://github.com/ggml-org/llama.cpp/pull/1087#issuecomment-1682807878
+# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
 # This is mostly tied to rocBLAS supported archs.
 # gfx803, gfx900, gfx1032, gfx1101, gfx1102,not officialy supported
 # gfx906 is deprecated
--- a/.github/ISSUE_TEMPLATE/020-enhancement.yml
+++ b/.github/ISSUE_TEMPLATE/020-enhancement.yml
@@ -6,7 +6,7 @@ body:
  - type: markdown
    attributes:
      value: |
-        [Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggml-org/llama.cpp/discussions/categories/ideas)
+        [Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggerganov/llama.cpp/discussions/categories/ideas)

  - type: checkboxes
    id: prerequisites
@@ -16,11 +16,11 @@ body:
      options:
        - label: I am running the latest code. Mention the version if possible as well.
          required: true
-        - label: I carefully followed the [README.md](https://github.com/ggml-org/llama.cpp/blob/master/README.md).
+        - label: I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
          required: true
        - label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed).
          required: true
-        - label: I reviewed the [Discussions](https://github.com/ggml-org/llama.cpp/discussions), and have a new and useful enhancement to share.
+        - label: I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new and useful enhancement to share.
          required: true

  - type: textarea
--- a/.github/ISSUE_TEMPLATE/030-research.yml
+++ b/.github/ISSUE_TEMPLATE/030-research.yml
@@ -6,7 +6,7 @@ body:
  - type: markdown
    attributes:
      value: |
-        Don't forget to check for any [duplicate research issue tickets](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)
+        Don't forget to check for any [duplicate research issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)

  - type: checkboxes
    id: research-stage
--- a/.github/ISSUE_TEMPLATE/040-refactor.yml
+++ b/.github/ISSUE_TEMPLATE/040-refactor.yml
@@ -6,8 +6,8 @@ body:
  - type: markdown
    attributes:
      value: |
-        Don't forget to [check for existing refactor issue tickets](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered.
-        Also you may want to check [Pull request refactor label as well](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too.
+        Don't forget to [check for existing refactor issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered.
+        Also you may want to check [Pull request refactor label as well](https://github.com/ggerganov/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too.

  - type: textarea
    id: background-description
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -1,11 +1,11 @@
 blank_issues_enabled: true
 contact_links:
  - name: Got an idea?
-    url: https://github.com/ggml-org/llama.cpp/discussions/categories/ideas
+    url: https://github.com/ggerganov/llama.cpp/discussions/categories/ideas
    about: Pop it there. It may then become an enhancement ticket.
  - name: Got a question?
-    url: https://github.com/ggml-org/llama.cpp/discussions/categories/q-a
+    url: https://github.com/ggerganov/llama.cpp/discussions/categories/q-a
    about: Ask a question there!
  - name: Want to contribute?
-    url: https://github.com/ggml-org/llama.cpp/wiki/contribute
+    url: https://github.com/ggerganov/llama.cpp/wiki/contribute
    about: Head to the contribution guide page of the wiki for areas you can help with
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -1 +1 @@
-*Make sure to read the [contributing guidelines](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md) before submitting a PR*
+*Make sure to read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md) before submitting a PR*
--- a/.github/workflows/bench.yml.disabled
+++ b/.github/workflows/bench.yml.disabled
@@ -1,5 +1,5 @@
 # TODO: there have been some issues with the workflow, so disabling for now
-#       https://github.com/ggml-org/llama.cpp/issues/7893
+#       https://github.com/ggerganov/llama.cpp/issues/7893
 #
 # Benchmark
 name: Benchmark
@@ -57,7 +57,17 @@ jobs:

    if: |
      inputs.gpu-series == 'Standard_NC4as_T4_v3'
+      || (
+        github.event_name == 'schedule'
+        && github.ref_name == 'master'
+        && github.repository_owner == 'ggerganov'
+      )
      || github.event_name == 'pull_request_target'
+      || (
+        github.event_name == 'push'
+        && github.event.ref == 'refs/heads/master'
+        && github.repository_owner == 'ggerganov'
+      )
    steps:
      - name: Clone
        id: checkout
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -129,7 +129,7 @@ jobs:
        run: |
          sysctl -a
          # Metal is disabled due to intermittent failures with Github runners not having a GPU:
-          # https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
+          # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
          cmake -B build \
            -DCMAKE_BUILD_RPATH="@loader_path" \
            -DLLAMA_FATAL_WARNINGS=ON \
@@ -173,15 +173,7 @@ jobs:
          name: llama-bin-macos-x64.zip

  ubuntu-cpu-cmake:
-    strategy:
-      matrix:
-        include:
-          - build: 'x64'
-            os: ubuntu-22.04
-          - build: 'arm64'
-            os: ubuntu-22.04-arm
-
-    runs-on: ${{ matrix.os }}
+    runs-on: ubuntu-22.04

    steps:
      - name: Clone
@@ -247,14 +239,14 @@ jobs:
        run: |
          cp LICENSE ./build/bin/
          cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
-          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/*
+          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/*

      - name: Upload artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        uses: actions/upload-artifact@v4
        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip
-          name: llama-bin-ubuntu-${{ matrix.build }}.zip
+          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip
+          name: llama-bin-ubuntu-x64.zip

  ubuntu-latest-cmake-sanitizer:
    runs-on: ubuntu-latest
@@ -382,8 +374,6 @@ jobs:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0

      - name: ccache
        uses: hendrikmuhs/ccache-action@v1.2.16
@@ -411,35 +401,7 @@ jobs:
        run: |
          cd build
          # This is using llvmpipe and runs slower than other backends
-          ctest -L main --verbose --timeout 2700
-
-      - name: Determine tag name
-        id: tag
-        shell: bash
-        run: |
-          BUILD_NUMBER="$(git rev-list --count HEAD)"
-          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
-          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
-            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
-          else
-            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
-            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
-          fi
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        run: |
-          cp LICENSE ./build/bin/
-          cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
-          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip ./build/bin/*
-
-      - name: Upload artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip
-          name: llama-bin-ubuntu-vulkan-x64.zip
+          ctest -L main --verbose --timeout 1800

  ubuntu-22-cmake-hip:
    runs-on: ubuntu-22.04
@@ -481,7 +443,7 @@ jobs:

  ubuntu-22-cmake-musa:
    runs-on: ubuntu-22.04
-    container: mthreads/musa:rc3.1.1-devel-ubuntu22.04
+    container: mthreads/musa:rc3.1.0-devel-ubuntu22.04

    steps:
      - name: Clone
@@ -1383,10 +1345,8 @@ jobs:

    needs:
      - ubuntu-cpu-cmake
-      - ubuntu-22-cmake-vulkan
      - windows-latest-cmake
      - windows-2019-cmake-cuda
-      - windows-latest-cmake-sycl
      - windows-latest-cmake-hip-release
      - macOS-latest-cmake-arm64
      - macOS-latest-cmake-x64
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -51,8 +51,6 @@ jobs:

      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3
-        with:
-          image: tonistiigi/binfmt:qemu-v7.0.0-28

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
--- a/.github/workflows/labeler.yml
+++ b/.github/workflows/labeler.yml
@@ -11,7 +11,7 @@ jobs:
    steps:
    - uses: actions/checkout@v4
      with:
-        repository: "ggml-org/llama.cpp"
+        repository: "ggerganov/llama.cpp"
    - uses: actions/labeler@v5
      with:
        configuration-path: '.github/labeler.yml'
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -161,8 +161,6 @@ jobs:
      - name: Tests
        id: server_integration_tests
        if: ${{ matrix.sanitizer == '' }}
-        env:
-          GITHUB_ACTIONS: "true"
        run: |
          cd examples/server/tests
          ./tests.sh
--- a/.gitignore
+++ b/.gitignore
@@ -45,8 +45,6 @@ lcov-report/
 tags
 .build/
 build*
-release
-debug
 !build-info.cmake
 !build-info.cpp.in
 !build-info.sh
@@ -100,7 +98,6 @@ examples/server/*.css.hpp
 examples/server/*.html.hpp
 examples/server/*.js.hpp
 examples/server/*.mjs.hpp
-examples/server/*.gz.hpp
 !build_64.sh
 !examples/*.bat
 !examples/*/*.kts
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,12 +1,10 @@
 # Pull requests (for contributors)

- llama.cpp uses the ggml tensor library for model evaluation. If you are unfamiliar with ggml, consider taking a look at the [examples in the ggml repository](https://github.com/ggml-org/ggml/tree/master/examples/). [simple](https://github.com/ggml-org/ggml/tree/master/examples/simple) shows the bare minimum for using ggml. [gpt-2](https://github.com/ggml-org/ggml/tree/master/examples/gpt-2) has minimal implementations for language model inference using GPT-2. [mnist](https://github.com/ggml-org/ggml/tree/master/examples/mnist) demonstrates how to train and evaluate a simple image classifier
 - Test your changes:
    - Execute [the full CI locally on your machine](ci/README.md) before publishing
    - Verify that the perplexity and the performance are not affected negatively by your changes (use `llama-perplexity` and `llama-bench`)
    - If you modified the `ggml` source, run the `test-backend-ops` tool to check whether different backend implementations of the `ggml` operators produce consistent results (this requires access to at least two different `ggml` backends)
    - If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops`
- Create separate PRs for each feature or fix. Avoid combining unrelated changes in a single PR
 - Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
 - If your PR becomes stale, don't hesitate to ping the maintainers in the comments

@@ -14,7 +12,7 @@

 - Squash-merge PRs
 - Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
- Optionally pick a `<module>` from here: https://github.com/ggml-org/llama.cpp/wiki/Modules
+- Optionally pick a `<module>` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules
 - Consider adding yourself to [CODEOWNERS](CODEOWNERS)

 # Coding guidelines
@@ -39,17 +37,17 @@

    _(NOTE: this guideline is yet to be applied to the `llama.cpp` codebase. New code should follow this guideline.)_

- Try to follow the existing patterns in the code (indentation, spaces, etc.). In case of doubt use `clang-format` (from clang-tools v15+) to format the added code
+- Try to follow the existing patterns in the code (indentation, spaces, etc.). In case of doubt use `clang-format` to format the added code
 - For anything not covered in the current guidelines, refer to the [C++ Core Guidelines](https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines)
 - Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggml-org/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$
+- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$

 ![matmul](media/matmul.png)

 # Naming guidelines

 - Use `snake_case` for function, variable and type names
- Naming usually optimizes for longest common prefix (see https://github.com/ggml-org/ggml/pull/302#discussion_r1243240963)
+- Naming usually optimizes for longest common prefix (see https://github.com/ggerganov/ggml/pull/302#discussion_r1243240963)

    ```cpp
    // not OK
@@ -124,4 +122,4 @@

 The Github issues, PRs and discussions contain a lot of information that can be useful to get familiar with the codebase. For convenience, some of the more important information is referenced from Github projects:

-https://github.com/ggml-org/llama.cpp/projects
+https://github.com/ggerganov/llama.cpp/projects
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
 ifndef LLAMA_MAKEFILE
-$(error The Makefile build is deprecated. Use the CMake build instead. For more details, see https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md)
+$(error The Makefile build is deprecated. Use the CMake build instead. For more details, see https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
 endif

 # Define the default target now so that it is always the first target
@@ -463,7 +463,7 @@ endif
 ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))'
 	# The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves.
 	# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
-	# https://github.com/ggml-org/llama.cpp/issues/2922
+	# https://github.com/ggerganov/llama.cpp/issues/2922
 	MK_CFLAGS   += -Xassembler -muse-unaligned-vector-move
 	MK_CXXFLAGS += -Xassembler -muse-unaligned-vector-move

@@ -680,10 +680,6 @@ ifdef GGML_CUDA_CCBIN
 	MK_NVCCFLAGS += -ccbin $(GGML_CUDA_CCBIN)
 endif # GGML_CUDA_CCBIN

-ifdef GGML_CUDA_NO_FA
-	MK_NVCCFLAGS += -DGGML_CUDA_NO_FA
-endif # GGML_CUDA_NO_FA
-
 ifdef GGML_CUDA_FA_ALL_QUANTS
 	MK_NVCCFLAGS += -DGGML_CUDA_FA_ALL_QUANTS
 endif # GGML_CUDA_FA_ALL_QUANTS
@@ -804,10 +800,6 @@ ifdef GGML_CUDA_NO_PEER_COPY
 	HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY
 endif # GGML_CUDA_NO_PEER_COPY

-ifdef GGML_CUDA_NO_FA
-	HIPFLAGS += -DGGML_CUDA_NO_FA
-endif # GGML_CUDA_NO_FA
-
 	OBJ_GGML_EXT += ggml/src/ggml-cuda/ggml-cuda.o
 	OBJ_GGML_EXT += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
 	OBJ_GGML_EXT += $(OBJ_CUDA_TMPL)
@@ -855,7 +847,7 @@ ifdef GGML_MUSA
 	CXX := $(MUSA_PATH)/bin/clang++
 	MCC := $(CCACHE) $(MUSA_PATH)/bin/mcc

-	MUSAFLAGS  = -fsigned-char -x musa -mtgpu
+	MUSAFLAGS  = -x musa -mtgpu
 	MUSAFLAGS += $(foreach arch,$(subst ;, ,$(MUSA_ARCHITECTURES)),--cuda-gpu-arch=mp_$(arch))

 ifdef GGML_CUDA_FORCE_MMQ
@@ -884,10 +876,6 @@ ifdef GGML_CUDA_NO_PEER_COPY
 	MUSAFLAGS += -DGGML_CUDA_NO_PEER_COPY
 endif # GGML_CUDA_NO_PEER_COPY

-ifdef GGML_CUDA_NO_FA
-	MUSAFLAGS += -DGGML_CUDA_NO_FA
-endif # GGML_CUDA_NO_FA
-
 ifdef GGML_CUDA_FA_ALL_QUANTS
 	MUSAFLAGS += -DGGML_CUDA_FA_ALL_QUANTS
 endif # GGML_CUDA_FA_ALL_QUANTS
@@ -1090,8 +1078,8 @@ endif
 ifdef REMOVE_WARNING
 $(info !!! REMOVAL WARNING !!!)
 $(info The following LLAMA_ options have been removed and are no longer supported)
-$(info   - LLAMA_DISABLE_LOGS   (https://github.com/ggml-org/llama.cpp/pull/9418))
-$(info   - LLAMA_SERVER_VERBOSE (https://github.com/ggml-org/llama.cpp/pull/9418))
+$(info   - LLAMA_DISABLE_LOGS   (https://github.com/ggerganov/llama.cpp/pull/9418))
+$(info   - LLAMA_SERVER_VERBOSE (https://github.com/ggerganov/llama.cpp/pull/9418))
 $(info )
 endif

@@ -1376,7 +1364,7 @@ llama-server: \
 	examples/server/index.html.hpp \
 	examples/server/loading.html.hpp \
 	common/chat.cpp \
-	common/chat.h \
+	common/chat.hpp \
 	common/chat-template.hpp \
 	common/json.hpp \
 	common/minja.hpp \
--- a/README.md
+++ b/README.md
@@ -3,33 +3,26 @@
 ![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)

 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
-[![Server](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml)
+[![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)

-[Roadmap](https://github.com/users/ggml-org/projects/7) / [Project status](https://github.com/ggml-org/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml)
+[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)

 Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++

-> [!IMPORTANT]
-> New `llama.cpp` package location: [ggml-org/llama.cpp](https://github.com/ggml-org/llama.cpp/pkgs/container/llama.cpp)
->
-> Update your container URLs to: `ghcr.io/ggml-org/llama.cpp`
->
-> More info: https://github.com/ggml-org/llama.cpp/discussions/11801
-
 ## Recent API changes

- [Changelog for `libllama` API](https://github.com/ggml-org/llama.cpp/issues/9289)
- [Changelog for `llama-server` REST API](https://github.com/ggml-org/llama.cpp/issues/9291)
+- [Changelog for `libllama` API](https://github.com/ggerganov/llama.cpp/issues/9289)
+- [Changelog for `llama-server` REST API](https://github.com/ggerganov/llama.cpp/issues/9291)

 ## Hot topics

- **How to use [MTLResidencySet](https://developer.apple.com/documentation/metal/mtlresidencyset?language=objc) to keep the GPU memory active?** https://github.com/ggml-org/llama.cpp/pull/11427
+- **How to use [MTLResidencySet](https://developer.apple.com/documentation/metal/mtlresidencyset?language=objc) to keep the GPU memory active?** https://github.com/ggerganov/llama.cpp/pull/11427
 - **VS Code extension for FIM completions:** https://github.com/ggml-org/llama.vscode
- Universal tool call support in `llama-server`: https://github.com/ggml-org/llama.cpp/pull/9639
+- Universal tool call support in `llama-server`: https://github.com/ggerganov/llama.cpp/pull/9639
 - Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
- Introducing GGUF-my-LoRA https://github.com/ggml-org/llama.cpp/discussions/10123
- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggml-org/llama.cpp/discussions/9669
- Hugging Face GGUF editor: [discussion](https://github.com/ggml-org/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
+- Introducing GGUF-my-LoRA https://github.com/ggerganov/llama.cpp/discussions/10123
+- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669
+- Hugging Face GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)

 ----

@@ -46,7 +39,7 @@ range of hardware - locally and in the cloud.
 - Vulkan and SYCL backend support
 - CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity

-The `llama.cpp` project is the main playground for developing new features for the [ggml](https://github.com/ggml-org/ggml) library.
+The `llama.cpp` project is the main playground for developing new features for the [ggml](https://github.com/ggerganov/ggml) library.

 <details>
 <summary>Models</summary>
@@ -66,23 +59,23 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon)
 - [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
 - [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
- [X] [BERT](https://github.com/ggml-org/llama.cpp/pull/5423)
+- [X] [BERT](https://github.com/ggerganov/llama.cpp/pull/5423)
 - [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
 - [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft)
 - [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila)
- [X] [Starcoder models](https://github.com/ggml-org/llama.cpp/pull/3187)
+- [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187)
 - [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim)
- [X] [MPT](https://github.com/ggml-org/llama.cpp/pull/3417)
- [X] [Bloom](https://github.com/ggml-org/llama.cpp/pull/3553)
+- [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417)
+- [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553)
 - [x] [Yi models](https://huggingface.co/models?search=01-ai/Yi)
 - [X] [StableLM models](https://huggingface.co/stabilityai)
 - [x] [Deepseek models](https://huggingface.co/models?search=deepseek-ai/deepseek)
 - [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen)
- [x] [PLaMo-13B](https://github.com/ggml-org/llama.cpp/pull/3557)
+- [x] [PLaMo-13B](https://github.com/ggerganov/llama.cpp/pull/3557)
 - [x] [Phi models](https://huggingface.co/models?search=microsoft/phi)
- [x] [PhiMoE](https://github.com/ggml-org/llama.cpp/pull/11003)
+- [x] [PhiMoE](https://github.com/ggerganov/llama.cpp/pull/11003)
 - [x] [GPT-2](https://huggingface.co/gpt2)
- [x] [Orion 14B](https://github.com/ggml-org/llama.cpp/pull/5118)
+- [x] [Orion 14B](https://github.com/ggerganov/llama.cpp/pull/5118)
 - [x] [InternLM2](https://huggingface.co/models?search=internlm2)
 - [x] [CodeShell](https://github.com/WisdomShell/codeshell)
 - [x] [Gemma](https://ai.google.dev/gemma)
@@ -153,7 +146,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
 - Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
 - Flutter: [xuegao-tzx/Fllama](https://github.com/xuegao-tzx/Fllama)
- PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggml-org/llama.cpp/pull/6326)
+- PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326)
 - Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp)
 - Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift)
 - Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama)
@@ -219,7 +212,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly
 - [llama-swap](https://github.com/mostlygeek/llama-swap) - transparent proxy that adds automatic model switching with llama-server
 - [Kalavai](https://github.com/kalavai-net/kalavai-client) - Crowdsource end to end LLM deployment at any scale
- [llmaz](https://github.com/InftyAI/llmaz) - ☸️ Easy, advanced inference platform for large language models on Kubernetes.
+
 </details>

 <details>
@@ -242,7 +235,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [HIP](docs/build.md#hip) | AMD GPU |
 | [Vulkan](docs/build.md#vulkan) | GPU |
 | [CANN](docs/build.md#cann) | Ascend NPU |
-| [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |

 ## Building the project

@@ -252,7 +244,7 @@ The project also includes many example programs and tools using the `llama` libr
 - Clone this repository and build locally, see [how to build](docs/build.md)
 - On MacOS or Linux, install `llama.cpp` via [brew, flox or nix](docs/install.md)
 - Use a Docker image, see [documentation for Docker](docs/docker.md)
- Download pre-built binaries from [releases](https://github.com/ggml-org/llama.cpp/releases)
+- Download pre-built binaries from [releases](https://github.com/ggerganov/llama.cpp/releases)

 ## Obtaining and quantizing models

@@ -265,14 +257,14 @@ You can either manually download the GGUF file or directly use any `llama.cpp`-c

 After downloading a model, use the CLI tools to run it locally - see below.

-`llama.cpp` requires the model to be stored in the [GGUF](https://github.com/ggml-org/ggml/blob/master/docs/gguf.md) file format. Models in other data formats can be converted to GGUF using the `convert_*.py` Python scripts in this repo.
+`llama.cpp` requires the model to be stored in the [GGUF](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) file format. Models in other data formats can be converted to GGUF using the `convert_*.py` Python scripts in this repo.

 The Hugging Face platform provides a variety of online tools for converting, quantizing and hosting models with `llama.cpp`:

 - Use the [GGUF-my-repo space](https://huggingface.co/spaces/ggml-org/gguf-my-repo) to convert to GGUF format and quantize model weights to smaller sizes
- Use the [GGUF-my-LoRA space](https://huggingface.co/spaces/ggml-org/gguf-my-lora) to convert LoRA adapters to GGUF format (more info: https://github.com/ggml-org/llama.cpp/discussions/10123)
- Use the [GGUF-editor space](https://huggingface.co/spaces/CISCai/gguf-editor) to edit GGUF meta data in the browser (more info: https://github.com/ggml-org/llama.cpp/discussions/9268)
- Use the [Inference Endpoints](https://ui.endpoints.huggingface.co/) to directly host `llama.cpp` in the cloud (more info: https://github.com/ggml-org/llama.cpp/discussions/9669)
+- Use the [GGUF-my-LoRA space](https://huggingface.co/spaces/ggml-org/gguf-my-lora) to convert LoRA adapters to GGUF format (more info: https://github.com/ggerganov/llama.cpp/discussions/10123)
+- Use the [GGUF-editor space](https://huggingface.co/spaces/CISCai/gguf-editor) to edit GGUF meta data in the browser (more info: https://github.com/ggerganov/llama.cpp/discussions/9268)
+- Use the [Inference Endpoints](https://ui.endpoints.huggingface.co/) to directly host `llama.cpp` in the cloud (more info: https://github.com/ggerganov/llama.cpp/discussions/9669)

 To learn more about model quantization, [read this documentation](examples/quantize/README.md)

@@ -495,9 +487,9 @@ To learn more about model quantization, [read this documentation](examples/quant
 - Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch
 - Collaborators will be invited based on contributions
 - Any help with managing issues, PRs and projects is very appreciated!
- See [good first issues](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
+- See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
 - Read the [CONTRIBUTING.md](CONTRIBUTING.md) for more information
- Make sure to read this: [Inference at the edge](https://github.com/ggml-org/llama.cpp/discussions/205)
+- Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205)
 - A bit of backstory for those who are interested: [Changelog podcast](https://changelog.com/podcast/532)

 ## Other documentation
@@ -512,7 +504,7 @@ To learn more about model quantization, [read this documentation](examples/quant
 - [Running on Docker](docs/docker.md)
 - [Build on Android](docs/android.md)
 - [Performance troubleshooting](docs/development/token_generation_performance_tips.md)
- [GGML tips & tricks](https://github.com/ggml-org/llama.cpp/wiki/GGML-Tips-&-Tricks)
+- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)

 #### Seminal papers and background on the models

@@ -526,18 +518,5 @@ If your issue is with model generation quality, then please at least scan the fo
    - [Aligning language models to follow instructions](https://openai.com/research/instruction-following)
    - [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155)

-## Completions
-Command-line completion is available for some environments.
+#### References

-#### Bash Completion
-```bash
-$ build/bin/llama-cli --completion-bash > ~/.llama-completion.bash
-$ source ~/.llama-completion.bash
-```
-Optionally this can be added to your `.bashrc` or `.bash_profile` to load it
-automatically. For example:
-```console
-$ echo "source ~/.llama-completion.bash" >> ~/.bashrc
-```
-
-## References
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -62,6 +62,6 @@ Beware that none of the topics under [Using llama.cpp securely](#using-llamacpp-
 <!-- normal version -->
 However, If you have discovered a security vulnerability in this project, please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released.

-Please disclose it as a private [security advisory](https://github.com/ggml-org/llama.cpp/security/advisories/new).
+Please disclose it as a private [security advisory](https://github.com/ggerganov/llama.cpp/security/advisories/new).

 A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure.
--- a/ci/README.md
+++ b/ci/README.md
@@ -1,11 +1,11 @@
 # CI

-In addition to [Github Actions](https://github.com/ggml-org/llama.cpp/actions) `llama.cpp` uses a custom CI framework:
+In addition to [Github Actions](https://github.com/ggerganov/llama.cpp/actions) `llama.cpp` uses a custom CI framework:

 https://github.com/ggml-org/ci

 It monitors the `master` branch for new commits and runs the
-[ci/run.sh](https://github.com/ggml-org/llama.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us
+[ci/run.sh](https://github.com/ggerganov/llama.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us
 to execute heavier workloads compared to just using Github Actions. Also with time, the cloud instances will be scaled
 to cover various hardware architectures, including GPU and Apple Silicon instances.

--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -57,7 +57,8 @@ add_library(${TARGET} STATIC
    arg.h
    base64.hpp
    chat.cpp
-    chat.h
+    chat.hpp
+    chat-template.hpp
    common.cpp
    common.h
    console.cpp
@@ -67,8 +68,7 @@ add_library(${TARGET} STATIC
    llguidance.cpp
    log.cpp
    log.h
-    minja/chat-template.hpp
-    minja/minja.hpp
+    minja.hpp
    ngram-cache.cpp
    ngram-cache.h
    sampling.cpp
@@ -96,22 +96,6 @@ if (LLAMA_LLGUIDANCE)
    include(ExternalProject)
    set(LLGUIDANCE_SRC ${CMAKE_BINARY_DIR}/llguidance/source)
    set(LLGUIDANCE_PATH ${LLGUIDANCE_SRC}/target/release)
-
-    # Set the correct library file extension based on platform
-    if (WIN32)
-        set(LLGUIDANCE_LIB_NAME "llguidance.lib")
-        # Add Windows-specific libraries
-        set(LLGUIDANCE_PLATFORM_LIBS
-            ws2_32    # Windows Sockets API
-            userenv   # For GetUserProfileDirectoryW
-            ntdll     # For NT functions
-            bcrypt    # For BCryptGenRandom
-        )
-    else()
-        set(LLGUIDANCE_LIB_NAME "libllguidance.a")
-        set(LLGUIDANCE_PLATFORM_LIBS "")
-    endif()
-
    ExternalProject_Add(llguidance_ext
        GIT_REPOSITORY https://github.com/guidance-ai/llguidance
        # v0.6.12:
@@ -122,18 +106,17 @@ if (LLAMA_LLGUIDANCE)
        CONFIGURE_COMMAND ""
        BUILD_COMMAND cargo build --release
        INSTALL_COMMAND ""
-        BUILD_BYPRODUCTS ${LLGUIDANCE_PATH}/${LLGUIDANCE_LIB_NAME} ${LLGUIDANCE_PATH}/llguidance.h
+        BUILD_BYPRODUCTS ${LLGUIDANCE_PATH}/libllguidance.a ${LLGUIDANCE_PATH}/llguidance.h
        UPDATE_COMMAND ""
    )
    target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_LLGUIDANCE)

    add_library(llguidance STATIC IMPORTED)
-    set_target_properties(llguidance PROPERTIES IMPORTED_LOCATION ${LLGUIDANCE_PATH}/${LLGUIDANCE_LIB_NAME})
+    set_target_properties(llguidance PROPERTIES IMPORTED_LOCATION ${LLGUIDANCE_PATH}/libllguidance.a)
    add_dependencies(llguidance llguidance_ext)

    target_include_directories(${TARGET} PRIVATE ${LLGUIDANCE_PATH})
-    # Add platform libraries to the main target
-    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance ${LLGUIDANCE_PLATFORM_LIBS})
+    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance)
 endif ()

 target_include_directories(${TARGET} PUBLIC .)
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2,7 +2,6 @@

 #include "log.h"
 #include "sampling.h"
-#include "chat.h"

 #include <algorithm>
 #include <climits>
@@ -366,112 +365,6 @@ static void common_params_print_usage(common_params_context & ctx_arg) {
    print_options(specific_options);
 }

-static void common_params_print_completion(common_params_context & ctx_arg) {
-    std::vector<common_arg *> common_options;
-    std::vector<common_arg *> sparam_options;
-    std::vector<common_arg *> specific_options;
-
-    for (auto & opt : ctx_arg.options) {
-        if (opt.is_sparam) {
-            sparam_options.push_back(&opt);
-        } else if (opt.in_example(ctx_arg.ex)) {
-            specific_options.push_back(&opt);
-        } else {
-            common_options.push_back(&opt);
-        }
-    }
-
-    printf("_llama_completions() {\n");
-    printf("    local cur prev opts\n");
-    printf("    COMPREPLY=()\n");
-    printf("    cur=\"${COMP_WORDS[COMP_CWORD]}\"\n");
-    printf("    prev=\"${COMP_WORDS[COMP_CWORD-1]}\"\n\n");
-
-    printf("    opts=\"");
-    auto print_options = [](const std::vector<common_arg *> & options) {
-        for (const common_arg * opt : options) {
-            for (const char * arg : opt->args) {
-                printf("%s ", arg);
-            }
-        }
-    };
-
-    print_options(common_options);
-    print_options(sparam_options);
-    print_options(specific_options);
-    printf("\"\n\n");
-
-    printf("    case \"$prev\" in\n");
-    printf("        --model)\n");
-    printf("            COMPREPLY=( $(compgen -f -X '!*.gguf' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
-    printf("            return 0\n");
-    printf("            ;;\n");
-    printf("        --grammar-file)\n");
-    printf("            COMPREPLY=( $(compgen -f -X '!*.gbnf' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
-    printf("            return 0\n");
-    printf("            ;;\n");
-    printf("        --chat-template-file)\n");
-    printf("            COMPREPLY=( $(compgen -f -X '!*.jinja' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
-    printf("            return 0\n");
-    printf("            ;;\n");
-    printf("        *)\n");
-    printf("            COMPREPLY=( $(compgen -W \"${opts}\" -- \"$cur\") )\n");
-    printf("            return 0\n");
-    printf("            ;;\n");
-    printf("    esac\n");
-    printf("}\n\n");
-
-    std::set<std::string> executables = {
-        "llama-batched",
-        "llama-batched-bench",
-        "llama-bench",
-        "llama-cli",
-        "llama-convert-llama2c-to-ggml",
-        "llama-cvector-generator",
-        "llama-embedding",
-        "llama-eval-callback",
-        "llama-export-lora",
-        "llama-gbnf-validator",
-        "llama-gen-docs",
-        "llama-gguf",
-        "llama-gguf-hash",
-        "llama-gguf-split",
-        "llama-gritlm",
-        "llama-imatrix",
-        "llama-infill",
-        "llama-llava-cli",
-        "llama-llava-clip-quantize-cli",
-        "llama-lookahead",
-        "llama-lookup",
-        "llama-lookup-create",
-        "llama-lookup-merge",
-        "llama-lookup-stats",
-        "llama-minicpmv-cli",
-        "llama-parallel",
-        "llama-passkey",
-        "llama-perplexity",
-        "llama-q8dot",
-        "llama-quantize",
-        "llama-quantize-stats",
-        "llama-qwen2vl-cli",
-        "llama-retrieval",
-        "llama-run",
-        "llama-save-load-state",
-        "llama-server",
-        "llama-simple",
-        "llama-simple-chat",
-        "llama-speculative",
-        "llama-speculative-simple",
-        "llama-tokenize",
-        "llama-tts",
-        "llama-vdot"
-    };
-
-    for (const auto& exe : executables) {
-        printf("complete -F _llama_completions %s\n", exe.c_str());
-    }
-}
-
 static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & value) {
    std::vector<ggml_backend_dev_t> devices;
    auto dev_names = string_split<std::string>(value, ',');
@@ -533,10 +426,6 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
            }
            exit(0);
        }
-        if (ctx_arg.params.completion) {
-            common_params_print_completion(ctx_arg);
-            exit(0);
-        }
    } catch (const std::invalid_argument & ex) {
        fprintf(stderr, "%s\n", ex.what());
        ctx_arg.params = params_org;
@@ -605,13 +494,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            exit(0);
        }
    ));
-    add_opt(common_arg(
-        {"--completion-bash"},
-        "print source-able bash completion script for llama.cpp",
-        [](common_params & params) {
-            params.completion = true;
-        }
-    ));
    add_opt(common_arg(
        {"--verbose-prompt"},
        string_format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"),
@@ -792,7 +674,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ));
    add_opt(common_arg(
        {"--no-context-shift"},
-        string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
+        string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
        [](common_params & params) {
            params.ctx_shift = false;
        }
@@ -813,18 +695,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_env("LLAMA_ARG_FLASH_ATTN"));
    add_opt(common_arg(
        {"-p", "--prompt"}, "PROMPT",
-        "prompt to start generation with; for system message, use -sys",
+        ex == LLAMA_EXAMPLE_MAIN
+            ? "prompt to start generation with\nif -cnv is set, this will be used as system prompt"
+            : "prompt to start generation with",
        [](common_params & params, const std::string & value) {
            params.prompt = value;
        }
    ).set_excludes({LLAMA_EXAMPLE_SERVER}));
-    add_opt(common_arg(
-        {"-sys", "--system-prompt"}, "PROMPT",
-        "system prompt to use with model (if applicable, depending on chat template)",
-        [](common_params & params, const std::string & value) {
-            params.system_prompt = value;
-        }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
    add_opt(common_arg(
        {"--no-perf"},
        string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
@@ -1069,13 +946,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.sampling.min_p = std::stof(value);
        }
    ).set_sparam());
-    add_opt(common_arg(
-        {"--top-nsigma"}, "N",
-        string_format("top-n-sigma sampling (default: %.1f, -1.0 = disabled)", params.sampling.top_n_sigma),
-        [](common_params & params, const std::string & value) {
-            params.sampling.top_n_sigma = std::stof(value);
-        }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}).set_sparam());
    add_opt(common_arg(
        {"--xtc-probability"}, "N",
        string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability),
@@ -1575,7 +1445,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        "- isolate: only spawn threads on CPUs on the node that execution started on\n"
        "- numactl: use the CPU map provided by numactl\n"
        "if run without this previously, it is recommended to drop the system page cache before using this\n"
-        "see https://github.com/ggml-org/llama.cpp/issues/1437",
+        "see https://github.com/ggerganov/llama.cpp/issues/1437",
        [](common_params & params, const std::string & value) {
            /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
            else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
@@ -2105,17 +1975,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.use_jinja = true;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
-    add_opt(common_arg(
-        {"--reasoning-format"}, "FORMAT",
-        "reasoning format (default: deepseek; allowed values: deepseek, none)\n"
-        "controls whether thought tags are extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only).\n"
-        "only supported for non-streamed responses",
-        [](common_params & params, const std::string & value) {
-            /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
-            else if (value == "none") {     params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
-            else { std::invalid_argument("invalid value"); }
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
    add_opt(common_arg(
        {"--chat-template"}, "JINJA_TEMPLATE",
        string_format(
@@ -2253,7 +2112,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_env("LLAMA_LOG_VERBOSITY"));
    add_opt(common_arg(
        {"--log-prefix"},
-        "Enable prefix in log messages",
+        "Enable prefix in log messages",
        [](common_params &) {
            common_log_set_prefix(common_log_main(), true);
        }
@@ -2452,13 +2311,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.vocoder.use_guide_tokens = true;
        }
    ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
-    add_opt(common_arg(
-        {"--tts-speaker-file"}, "FNAME",
-        "speaker file path for audio generation",
-        [](common_params & params, const std::string & value) {
-            params.vocoder.speaker_file = value;
-        }
-    ).set_examples({LLAMA_EXAMPLE_TTS}));

    // model-specific
    add_opt(common_arg(
@@ -2514,53 +2366,5 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));

-    add_opt(common_arg(
-        {"--fim-qwen-1.5b-default"},
-        string_format("use default Qwen 2.5 Coder 1.5B (note: can download weights from the internet)"),
-        [](common_params & params) {
-            params.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
-            params.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
-            params.port = 8012;
-            params.n_gpu_layers = 99;
-            params.flash_attn = true;
-            params.n_ubatch = 1024;
-            params.n_batch = 1024;
-            params.n_ctx = 0;
-            params.n_cache_reuse = 256;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
-
-    add_opt(common_arg(
-        {"--fim-qwen-3b-default"},
-        string_format("use default Qwen 2.5 Coder 3B (note: can download weights from the internet)"),
-        [](common_params & params) {
-            params.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
-            params.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
-            params.port = 8012;
-            params.n_gpu_layers = 99;
-            params.flash_attn = true;
-            params.n_ubatch = 1024;
-            params.n_batch = 1024;
-            params.n_ctx = 0;
-            params.n_cache_reuse = 256;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
-
-    add_opt(common_arg(
-        {"--fim-qwen-7b-default"},
-        string_format("use default Qwen 2.5 Coder 7B (note: can download weights from the internet)"),
-        [](common_params & params) {
-            params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
-            params.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
-            params.port = 8012;
-            params.n_gpu_layers = 99;
-            params.flash_attn = true;
-            params.n_ubatch = 1024;
-            params.n_batch = 1024;
-            params.n_ctx = 0;
-            params.n_cache_reuse = 256;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
-
    return ctx_arg;
 }
--- a/common/minja/chat-template.hpp
+++ b/common/minja/chat-template.hpp
@@ -249,30 +249,16 @@ class chat_template {
                    inputs.add_generation_prompt = false;
                    full = apply(inputs);
                }
-                auto eos_pos_last = full.rfind(eos_token_);
-                if (eos_pos_last == prefix.size() - eos_token_.size() ||
-                      (full[full.size() - 1] == '\n' && (eos_pos_last == full.size() - eos_token_.size() - 1))) {
-                    full = full.substr(0, eos_pos_last);
-                }
-                size_t common_prefix_length = 0;
-                for (size_t i = 0; i < prefix.size() && i < full.size(); ++i) {
-                    if (prefix[i] != full[i]) {
-                        break;
+
+                if (full.find(prefix) != 0) {
+                    if (prefix.rfind(eos_token_) == prefix.size() - eos_token_.size()) {
+                        prefix = prefix.substr(0, prefix.size() - eos_token_.size());
                    }
-                    if (prefix[i] == '<') {
-                        // DeepSeek R1's template (as of 20250209) adds a trailing <think> if add_generation_prompt,
-                        // but it removes thinking tags for past messages.
-                        // The prefix and full strings diverge at <think> vs. <｜tool▁calls▁begin｜>, we avoid consuming the leading <.
-                        continue;
-                    }
-                    common_prefix_length = i + 1;
                }
-                auto example = full.substr(common_prefix_length);
-                if (example.find("tool_name") == std::string::npos && example.find("some_value") == std::string::npos) {
+                if (full.find(prefix) != 0) {
                    fprintf(stderr, "Failed to infer a tool call example (possible template bug)\n");
-                } else {
-                    tool_call_example_ = example;
                }
+                tool_call_example_ = full.substr(prefix.size());
            }
        } catch (const std::exception & e) {
            fprintf(stderr, "Failed to generate tool call example: %s\n", e.what());
@@ -377,7 +363,7 @@ class chat_template {
            if (polyfill_tools) {
                adjusted_messages = add_system(inputs.messages,
                    "You can call any of the following tools to satisfy the user's requests: " + minja::Value(inputs.tools).dump(2, /* to_json= */ true) +
-                    (!polyfill_tool_call_example || tool_call_example_.empty() ? "" : "\n\nExample tool call syntax:\n\n" + tool_call_example_ + "\n\n"));
+                    (!polyfill_tool_call_example || tool_call_example_.empty() ? "" : "\n\nExample tool call syntax:\n\n" + tool_call_example_));
            } else {
                adjusted_messages = inputs.messages;
            }
--- a/common/chat.cpp
+++ b/common/chat.cpp
--- a/common/chat.h
+++ b/common/chat.h
@@ -1,134 +0,0 @@
-// Chat support (incl. tool call grammar constraining & output parsing) w/ generic & custom template handlers.
-
-#pragma once
-
-#include "common.h"
-#include <string>
-#include <vector>
-
-struct common_chat_templates;
-
-struct common_chat_tool_call {
-    std::string name;
-    std::string arguments;
-    std::string id;
-};
-
-struct common_chat_msg_content_part {
-    std::string type;
-    std::string text;
-};
-
-struct common_chat_msg {
-    std::string role;
-    std::string content;
-    std::vector<common_chat_msg_content_part> content_parts = {};
-    std::vector<common_chat_tool_call> tool_calls = {};
-    std::string reasoning_content;
-    std::string tool_name;
-    std::string tool_call_id;
-};
-
-struct common_chat_tool {
-    std::string name;
-    std::string description;
-    std::string parameters;
-};
-
-enum common_chat_tool_choice {
-    COMMON_CHAT_TOOL_CHOICE_AUTO,
-    COMMON_CHAT_TOOL_CHOICE_REQUIRED,
-    COMMON_CHAT_TOOL_CHOICE_NONE,
-};
-
-enum common_chat_format {
-    COMMON_CHAT_FORMAT_CONTENT_ONLY,
-    COMMON_CHAT_FORMAT_GENERIC,
-    COMMON_CHAT_FORMAT_MISTRAL_NEMO,
-    COMMON_CHAT_FORMAT_LLAMA_3_X,
-    COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
-    COMMON_CHAT_FORMAT_DEEPSEEK_R1,
-    COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING,
-    COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
-    COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
-    COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
-    COMMON_CHAT_FORMAT_HERMES_2_PRO,
-    COMMON_CHAT_FORMAT_COMMAND_R7B,
-    COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING,
-
-    COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
-};
-
-struct common_chat_templates_inputs {
-    std::vector<common_chat_msg> messages;
-    std::string grammar;
-    std::string json_schema;
-    bool add_generation_prompt = true;
-    bool use_jinja = true;
-    // Parameters below only supported when use_jinja is true
-    std::vector<common_chat_tool> tools;
-    common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
-    bool parallel_tool_calls = false;
-    bool extract_reasoning     = true;
-};
-
-struct common_chat_params {
-    common_chat_format                  format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
-    std::string                         prompt;
-    std::string                         grammar;
-    bool                                grammar_lazy = false;
-    std::vector<common_grammar_trigger> grammar_triggers;
-    std::vector<std::string>            preserved_tokens;
-    std::vector<std::string>            additional_stops;
-};
-
-// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
-bool common_chat_verify_template(const std::string & tmpl, bool use_jinja);
-
-void common_chat_templates_free(struct common_chat_templates * tmpls);
-
-struct common_chat_templates_deleter { void operator()(common_chat_templates * tmpls) { common_chat_templates_free(tmpls); } };
-
-typedef std::unique_ptr<struct common_chat_templates, common_chat_templates_deleter> common_chat_templates_ptr;
-
-common_chat_templates_ptr common_chat_templates_init(
-                                    const struct llama_model * model,
-                                           const std::string & chat_template_override,
-                                           const std::string & bos_token_override = "",
-                                           const std::string & eos_token_override = "");
-
-bool         common_chat_templates_was_explicit(const struct common_chat_templates * tmpls);
-const char * common_chat_templates_source(const struct common_chat_templates * tmpls, const char * variant = nullptr);
-
-
-struct common_chat_params      common_chat_templates_apply(
-    const struct common_chat_templates * tmpls,
-    const struct common_chat_templates_inputs & inputs);
-
-// Format single message, while taking into account the position of that message in chat history
-std::string common_chat_format_single(
-        const struct common_chat_templates * tmpls,
-        const std::vector<common_chat_msg> & past_msg,
-        const common_chat_msg & new_msg,
-        bool add_ass,
-        bool use_jinja);
-
-// Returns an example of formatted chat
-std::string common_chat_format_example(
-    const struct common_chat_templates * tmpls,
-    bool use_jinja);
-
-std::string               common_chat_format_name(common_chat_format format);
-common_chat_msg           common_chat_parse(      const std::string & input, common_chat_format format);
-
-common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
-
-// Parses a JSON array of messages in OpenAI's chat completion API format.
-// T can be std::string containing JSON or nlohmann::ordered_json
-template <class T> std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const T & messages);
-template <class T> T common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msgs, bool concat_typed_text = false);
-
-// Parses a JSON array of tools in OpenAI's chat completion tool call API format.
-// T can be std::string containing JSON or nlohmann::ordered_json
-template <class T> std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const T & tools);
-template <class T> T common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools);
--- a/common/chat.hpp
+++ b/common/chat.hpp
@@ -0,0 +1,52 @@
+// Chat support (incl. tool call grammar constraining & output parsing) w/ generic & custom template handlers.
+
+#pragma once
+
+#include "common.h"
+#include <json.hpp>
+#include <optional>
+#include <string>
+#include <vector>
+
+using json = nlohmann::ordered_json;
+
+struct common_chat_inputs { // inputs consumed by common_chat_params_init()
+    json messages; // chat messages, e.g. a JSON array of {"role": ..., "content": ...} objects
+    json tools;
+    json tool_choice;
+    json json_schema;
+    bool parallel_tool_calls = false; // defaulted so a default-constructed instance never carries an indeterminate value
+    bool stream = false;              // defaulted for the same reason
+    std::string grammar;
+    bool add_generation_prompt = true;
+};
+
+enum common_chat_format { // format tag accepted by common_chat_format_name() and common_chat_parse()
+    COMMON_CHAT_FORMAT_CONTENT_ONLY,
+    COMMON_CHAT_FORMAT_GENERIC,
+    COMMON_CHAT_FORMAT_MISTRAL_NEMO,
+    COMMON_CHAT_FORMAT_LLAMA_3_X,
+    COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
+    COMMON_CHAT_FORMAT_DEEPSEEK_R1,
+    COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
+    COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
+    COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
+    COMMON_CHAT_FORMAT_HERMES_2_PRO,
+    COMMON_CHAT_FORMAT_COMMAND_R7B,
+
+    COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats (must stay the last enumerator)
+};
+
+struct common_chat_params { // value returned by common_chat_params_init()
+    common_chat_format                  format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
+    json                                prompt; // NOTE(review): common.cpp returns this where a std::string is expected — presumably holds the rendered prompt text; confirm
+    std::string                         grammar;
+    bool                                grammar_lazy = false;
+    std::vector<common_grammar_trigger> grammar_triggers;
+    std::vector<std::string>            preserved_tokens;
+    std::vector<std::string>            additional_stops;
+};
+
+struct common_chat_params common_chat_params_init(const common_chat_template & tmpl, const struct common_chat_inputs & params);
+std::string               common_chat_format_name(common_chat_format format);
+common_chat_msg           common_chat_parse(      const std::string & input, common_chat_format format);
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -12,6 +12,8 @@
 #include "json.hpp"
 #include "json-schema-to-grammar.h"
 #include "llama.h"
+#include "chat.hpp"
+#include "chat-template.hpp"

 #include <algorithm>
 #include <cinttypes>
@@ -1766,6 +1768,174 @@ std::string common_detokenize(const struct llama_vocab * vocab, const std::vecto
    return text;
 }

+//
+// Chat template utils
+//
+
+bool common_chat_verify_template(const std::string & tmpl, bool use_jinja) { // true if `tmpl` can be applied to a one-message test chat
+    if (use_jinja) {
+        try {
+            auto chat_template = common_chat_template(tmpl, "<s>", "</s>"); // fixed BOS/EOS strings, used only for this check
+            common_chat_inputs inputs;
+            inputs.messages = json::array({{
+                {"role", "user"},
+                {"content", "test"},
+            }});
+            common_chat_params_init(chat_template, inputs); // result discarded: only whether it throws matters
+            return true;
+        } catch (const std::exception & e) {
+            LOG_ERR("%s: failed to apply template: %s\n", __func__, e.what());
+            return false;
+        }
+    }
+    llama_chat_message chat[] = {{"user", "test"}};
+    const int res = llama_chat_apply_template(tmpl.c_str(), chat, 1, true, nullptr, 0); // null buffer: only the return code is inspected
+    return res >= 0;
+}
+
+std::string common_chat_apply_template(
+        const common_chat_template & tmpl,
+        const std::vector<common_chat_msg> & msgs,
+        bool add_ass,
+        bool use_jinja) { // add_ass: forwarded as add_generation_prompt (jinja) or to llama_chat_apply_template (legacy)
+    if (use_jinja) {
+        auto messages = json::array();
+        for (const auto & msg : msgs) {
+            messages.push_back({{"role", msg.role}, {"content", msg.content}}); // only role and content are forwarded here
+        }
+        common_chat_inputs inputs;
+        inputs.messages = messages;
+        inputs.add_generation_prompt = add_ass;
+        return common_chat_params_init(tmpl, inputs).prompt; // json prompt is converted to std::string on return
+    }
+
+    int alloc_size = 0;
+    std::vector<llama_chat_message> chat;
+    for (const auto & msg : msgs) {
+        chat.push_back({msg.role.c_str(), msg.content.c_str()}); // borrows pointers into msgs, which outlives `chat`
+        alloc_size += (msg.role.size() + msg.content.size()) * 1.25; // initial guess: raw text size plus 25%
+    }
+
+    std::vector<char> buf(alloc_size);
+
+    // first attempt with the estimated buffer; the return value is compared against buf.size() below
+    int32_t res = llama_chat_apply_template(tmpl.source().c_str(), chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+
+    // error: chat template is not supported
+    if (res < 0) {
+        // if the custom "tmpl" is not supported, we throw an error
+        // this is deliberately redundant, since we cannot be sure the user validated the custom template with common_chat_verify_template()
+        throw std::runtime_error("this custom template is not supported");
+    }
+
+    // if it turns out that our buffer is too small, we resize it and apply the template again
+    if ((size_t) res > buf.size()) {
+        buf.resize(res);
+        res = llama_chat_apply_template(tmpl.source().c_str(), chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+    }
+
+    std::string formatted_chat(buf.data(), res);
+    return formatted_chat;
+}
+
+std::string common_chat_format_single(
+        const common_chat_template & tmpl,
+        const std::vector<common_chat_msg> & past_msg,
+        const common_chat_msg & new_msg,
+        bool add_ass,
+        bool use_jinja) { // returns the text that new_msg adds on top of the formatted past_msg history
+    std::ostringstream ss;
+    auto fmt_past_msg = past_msg.empty() ? "" : common_chat_apply_template(tmpl, past_msg, false, use_jinja); // history is rendered without a generation prompt
+    std::vector<common_chat_msg> chat_new(past_msg);
+    // if the formatted past_msg ends with a newline, we must preserve it in the formatted version
+    if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
+        ss << "\n";
+    };
+    // format the history again with new_msg appended
+    chat_new.push_back(new_msg);
+    auto fmt_new_msg = common_chat_apply_template(tmpl, chat_new, add_ass, use_jinja);
+    // keep only the part that follows the formatted history
+    ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size()); // NOTE(review): substr throws std::out_of_range if fmt_past_msg is longer than fmt_new_msg
+    return ss.str();
+}
+
+std::string common_chat_format_example(const common_chat_template & tmpl, bool use_jinja) { // renders a fixed 4-message sample chat with `tmpl`
+    std::vector<common_chat_msg> msgs = {
+        {"system",    "You are a helpful assistant", {}},
+        {"user",      "Hello", {}},
+        {"assistant", "Hi there", {}},
+        {"user",      "How are you?", {}},
+    };
+    return common_chat_apply_template(tmpl, msgs, true, use_jinja); // add_ass = true
+}
+
+#define CHATML_TEMPLATE_SRC \
+    "{%- for message in messages -%}\n" \
+    "  {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>\n' -}}\n" \
+    "{%- endfor -%}\n" \
+    "{%- if add_generation_prompt -%}\n" \
+    "  {{- '<|im_start|>assistant\n' -}}\n" \
+    "{%- endif -%}"
+
+common_chat_templates common_chat_templates_from_model(const struct llama_model * model, const std::string & chat_template_override)
+{ // builds the default / tool_use template pair from the override string or from the model's templates
+    std::string default_template_src;
+    std::string template_tool_use_src;
+
+    bool has_explicit_template = !chat_template_override.empty();
+    if (chat_template_override.empty()) {
+        auto str = llama_model_chat_template(model, /* name */ nullptr); // unnamed lookup feeds default_template_src
+        if (str) {
+            default_template_src = str;
+            has_explicit_template = true;
+        }
+        str = llama_model_chat_template(model, /* name */ "tool_use");
+        if (str) {
+            template_tool_use_src = str;
+            has_explicit_template = true;
+        }
+    } else {
+        default_template_src = chat_template_override; // the override replaces the default only; tool_use stays empty
+    }
+    if (default_template_src.empty() || default_template_src == "chatml") { // no concrete default: use tool_use if present, else built-in chatml
+        if (!template_tool_use_src.empty()) {
+            default_template_src = template_tool_use_src;
+        } else {
+            default_template_src = CHATML_TEMPLATE_SRC;
+        }
+    }
+    auto vocab = llama_model_get_vocab(model);
+    const auto get_token = [&](llama_token token, const char * name, const char * jinja_variable_name) {
+        if (token == LLAMA_TOKEN_NULL) { // token missing from vocab: warn only if a template source mentions the variable
+            if (default_template_src.find(jinja_variable_name) != std::string::npos
+                || template_tool_use_src.find(jinja_variable_name) != std::string::npos) {
+                LOG_WRN("%s: warning: vocab does not have a %s token, jinja template won't work as intended.\n", __func__, name);
+            }
+            return std::string(); // empty string stands in for the missing token
+        } else {
+            return common_token_to_piece(vocab, token, true);
+        }
+    };
+    auto token_bos = get_token(llama_vocab_bos(vocab), "BOS", "bos_token");
+    auto token_eos = get_token(llama_vocab_eos(vocab), "EOS", "eos_token");
+    try {
+        return {
+            has_explicit_template,
+            std::make_unique<minja::chat_template>(default_template_src, token_bos, token_eos),
+            template_tool_use_src.empty()
+                ? nullptr
+                : std::make_unique<minja::chat_template>(template_tool_use_src, token_bos, token_eos),
+        };
+    } catch (const std::exception & e) { // constructing a template threw: log and fall back to the built-in chatml source
+        LOG_ERR("%s: failed to parse chat template: %s\n", __func__, e.what());
+        return {
+            has_explicit_template, // NOTE(review): passed through unchanged even though the chatml fallback is used; confirm this is intended
+            std::make_unique<minja::chat_template>(CHATML_TEMPLATE_SRC, token_bos, token_eos),
+            nullptr,
+        };
+    }
+}
+
 //
 // KV cache utils
 //
--- a/common/common.h
+++ b/common/common.h
@@ -140,7 +140,6 @@ struct common_params_sampling {
    int32_t dry_allowed_length = 2;     // tokens extending repetitions beyond this receive penalty
    int32_t dry_penalty_last_n = -1;    // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
    int32_t mirostat           = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
-    float   top_n_sigma        = -1.00f;// -1.0 = disabled
    float   mirostat_tau       = 5.00f; // target entropy
    float   mirostat_eta       = 0.10f; // learning rate
    bool    ignore_eos         = false;
@@ -178,10 +177,10 @@ struct common_params_speculative {

    int32_t n_ctx        =     0; // draft context size
    int32_t n_max        =    16; // maximum number of tokens to draft during speculative decoding
-    int32_t n_min        =     0; // minimum number of draft tokens to use for speculative decoding
+    int32_t n_min        =     5; // minimum number of draft tokens to use for speculative decoding
    int32_t n_gpu_layers =    -1; // number of layers to store in VRAM for the draft model (-1 - use default)
    float   p_split      =  0.1f; // speculative decoding split probability
-    float   p_min        = 0.75f; // minimum speculative decoding probability (greedy)
+    float   p_min        =  0.9f; // minimum speculative decoding probability (greedy)

    struct cpu_params cpuparams;
    struct cpu_params cpuparams_batch;
@@ -200,16 +199,9 @@ struct common_params_vocoder {
    std::string model     = ""; // model path                                                // NOLINT
    std::string model_url = ""; // model url to download                                     // NOLINT

-    std::string speaker_file = ""; // speaker file path                                      // NOLINT
-
    bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy            // NOLINT
 };

-enum common_reasoning_format {
-    COMMON_REASONING_FORMAT_NONE,
-    COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`
-};
-
 struct common_params {
    int32_t n_predict             =    -1; // new tokens to predict
    int32_t n_ctx                 =  4096; // context size
@@ -263,7 +255,6 @@ struct common_params {
    std::string hf_repo              = ""; // HF repo                                                       // NOLINT
    std::string hf_file              = ""; // HF file                                                       // NOLINT
    std::string prompt               = "";                                                                  // NOLINT
-    std::string system_prompt        = "";                                                                  // NOLINT
    std::string prompt_file          = ""; // store the external prompt file name                           // NOLINT
    std::string path_prompt_cache    = ""; // path to file for saving/loading prompt eval state             // NOLINT
    std::string input_prefix         = ""; // string to prefix user inputs with                             // NOLINT
@@ -301,7 +292,6 @@ struct common_params {
    bool   kl_divergence    = false; // compute KL divergence

    bool usage             = false; // print usage
-    bool completion        = false; // print source-able completion script
    bool use_color         = false; // use color to distinguish generations and inputs
    bool special           = false; // enable special token output
    bool interactive       = false; // interactive mode
@@ -356,7 +346,6 @@ struct common_params {
    std::string chat_template = "";                                                                         // NOLINT
    bool use_jinja = false;                                                                                 // NOLINT
    bool enable_chat_template = true;
-    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;

    std::vector<std::string> api_keys;

@@ -435,13 +424,13 @@ bool set_process_priority(enum ggml_sched_priority prio);
 //

 #ifdef __GNUC__
-#    if defined(__MINGW32__) && !defined(__clang__)
-#        define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
-#    else
-#        define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
-#    endif
+#ifdef __MINGW32__
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
 #else
-#    define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+#endif
+#else
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
 #endif

 LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
@@ -619,6 +608,62 @@ std::string common_detokenize(
        const std::vector<llama_token> & tokens,
                                  bool   special = true);

+//
+// Chat template utils
+//
+
+struct common_tool_call { // element type of common_chat_msg::tool_calls
+    std::string name;
+    std::string arguments;
+    std::string id;
+};
+
+// same as llama_chat_message, but uses std::string
+struct common_chat_msg {
+    std::string role;
+    std::string content;
+    std::vector<common_tool_call> tool_calls;
+    std::string tool_plan = "";
+};
+
+// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
+bool common_chat_verify_template(const std::string & tmpl, bool use_jinja);
+
+namespace minja {
+    class chat_template;
+}
+
+typedef minja::chat_template common_chat_template;
+
+struct common_chat_templates {
+    bool has_explicit_template; // Model had a builtin template or a template override was specified.
+    std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
+    std::unique_ptr<common_chat_template> template_tool_use; // nullptr when no "tool_use" template is available
+};
+
+// CPP wrapper for llama_chat_apply_template
+// If the built-in template is not supported, we default to chatml
+// If the custom "tmpl" is not supported, we throw an error
+std::string common_chat_apply_template(
+        const common_chat_template & tmpl,
+        const std::vector<common_chat_msg> & chat,
+        bool add_ass,
+        bool use_jinja);
+
+// Format single message, while taking into account the position of that message in chat history
+std::string common_chat_format_single(
+        const common_chat_template & tmpl,
+        const std::vector<common_chat_msg> & past_msg,
+        const common_chat_msg & new_msg,
+        bool add_ass,
+        bool use_jinja);
+
+// Returns an example of formatted chat
+std::string common_chat_format_example(
+    const common_chat_template & tmpl, bool use_jinja);
+
+common_chat_templates common_chat_templates_from_model(const struct llama_model * model, const std::string & chat_template_override);
+
 //
 // KV cache utils
 //
--- a/common/log.cpp
+++ b/common/log.cpp
@@ -1,6 +1,5 @@
 #include "log.h"

-#include <chrono>
 #include <condition_variable>
 #include <cstdarg>
 #include <cstdio>
--- a/common/log.h
+++ b/common/log.h
@@ -15,7 +15,7 @@

 #ifndef __GNUC__
 #    define LOG_ATTRIBUTE_FORMAT(...)
-#elif defined(__MINGW32__) && !defined(__clang__)
+#elif defined(__MINGW32__)
 #    define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
 #else
 #    define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
--- a/common/minja/minja.hpp
+++ b/common/minja/minja.hpp
@@ -1385,13 +1385,6 @@ static std::string strip(const std::string & s) {
  return s.substr(start, end - start + 1);
 }

-static std::string capitalize(const std::string & s) {
-  if (s.empty()) return s;
-  auto result = s;
-  result[0] = std::toupper(result[0]);
-  return result;
-}
-
 static std::string html_escape(const std::string & s) {
  std::string result;
  result.reserve(s.size());
@@ -1469,9 +1462,6 @@ public:
          if (method->get_name() == "strip") {
            vargs.expectArgs("strip method", {0, 0}, {0, 0});
            return Value(strip(str));
-          } else if (method->get_name() == "capitalize") {
-            vargs.expectArgs("capitalize method", {0, 0}, {0, 0});
-            return Value(capitalize(str));
          } else if (method->get_name() == "endswith") {
            vargs.expectArgs("endswith method", {1, 1}, {0, 0});
            auto suffix = vargs.args[0].get<std::string>();
@@ -1802,7 +1792,7 @@ private:
        auto left = parseStringConcat();
        if (!left) throw std::runtime_error("Expected left side of 'logical compare' expression");

-        static std::regex compare_tok(R"(==|!=|<=?|>=?|in\b|is\b|not\s+in\b)");
+        static std::regex compare_tok(R"(==|!=|<=?|>=?|in\b|is\b|not[\r\n\s]+in\b)");
        static std::regex not_tok(R"(not\b)");
        std::string op_str;
        while (!(op_str = consumeToken(compare_tok)).empty()) {
@@ -2181,7 +2171,7 @@ private:
    using TemplateTokenIterator = TemplateTokenVector::const_iterator;

    std::vector<std::string> parseVarNames() {
-      static std::regex varnames_regex(R"(((?:\w+)(?:\s*,\s*(?:\w+))*)\s*)");
+      static std::regex varnames_regex(R"(((?:\w+)(?:[\r\n\s]*,[\r\n\s]*(?:\w+))*)[\r\n\s]*)");

      std::vector<std::string> group;
      if ((group = consumeTokenGroups(varnames_regex)).empty()) throw std::runtime_error("Expected variable names");
@@ -2204,13 +2194,13 @@ private:
    }

    TemplateTokenVector tokenize() {
-      static std::regex comment_tok(R"(\{#([-~]?)([\s\S]*?)([-~]?)#\})");
+      static std::regex comment_tok(R"(\{#([-~]?)([\s\S\r\n]*?)([-~]?)#\})");
      static std::regex expr_open_regex(R"(\{\{([-~])?)");
-      static std::regex block_open_regex(R"(^\{%([-~])?\s*)");
+      static std::regex block_open_regex(R"(^\{%([-~])?[\s\n\r]*)");
      static std::regex block_keyword_tok(R"((if|else|elif|endif|for|endfor|generation|endgeneration|set|endset|block|endblock|macro|endmacro|filter|endfilter|break|continue)\b)");
      static std::regex non_text_open_regex(R"(\{\{|\{%|\{#)");
-      static std::regex expr_close_regex(R"(\s*([-~])?\}\})");
-      static std::regex block_close_regex(R"(\s*([-~])?%\})");
+      static std::regex expr_close_regex(R"([\s\n\r]*([-~])?\}\})");
+      static std::regex block_close_regex(R"([\s\n\r]*([-~])?%\})");

      TemplateTokenVector tokens;
      std::vector<std::string> group;
@@ -2294,7 +2284,7 @@ private:
              auto post_space = parseBlockClose();
              tokens.push_back(std::make_unique<EndGenerationTemplateToken>(location, pre_space, post_space));
            } else if (keyword == "set") {
-              static std::regex namespaced_var_regex(R"((\w+)\s*\.\s*(\w+))");
+              static std::regex namespaced_var_regex(R"((\w+)[\s\n\r]*\.[\s\n\r]*(\w+))");

              std::string ns;
              std::vector<std::string> var_names;
@@ -2346,11 +2336,6 @@ private:
              throw std::runtime_error("Unexpected block: " + keyword);
            }
          } else if (std::regex_search(it, end, match, non_text_open_regex)) {
-            if (!match.position()) {
-                if (match[0] != "{#")
-                    throw std::runtime_error("Internal error: Expected a comment");
-                throw std::runtime_error("Missing end of comment tag");
-            }
            auto text_end = it + match.position();
            text = std::string(it, text_end);
            it = text_end;
@@ -2415,7 +2400,7 @@ private:

              auto text = text_token->text;
              if (post_space == SpaceHandling::Strip) {
-                static std::regex trailing_space_regex(R"(\s+$)");
+                static std::regex trailing_space_regex(R"((\s|\r|\n)+$)");
                text = std::regex_replace(text, trailing_space_regex, "");
              } else if (options.lstrip_blocks && it != end) {
                auto i = text.size();
@@ -2425,7 +2410,7 @@ private:
                }
              }
              if (pre_space == SpaceHandling::Strip) {
-                static std::regex leading_space_regex(R"(^\s+)");
+                static std::regex leading_space_regex(R"(^(\s|\r|\n)+)");
                text = std::regex_replace(text, leading_space_regex, "");
              } else if (options.trim_blocks && (it - 1) != begin && !dynamic_cast<ExpressionTemplateToken*>((*(it - 2)).get())) {
                if (text.length() > 0 && text[0] == '\n') {
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -134,11 +134,11 @@ std::string common_params_sampling::print() const {
    snprintf(result, sizeof(result),
            "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
            "\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n"
-            "\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, top_n_sigma = %.3f, temp = %.3f\n"
+            "\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, temp = %.3f\n"
            "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
            penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
            dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n,
-            top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, top_n_sigma, temp,
+            top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, temp,
            mirostat, mirostat_eta, mirostat_tau);

    return std::string(result);
@@ -151,6 +151,12 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co

    lparams.no_perf = params.no_perf;

+    std::vector<const char *> trigger_words;
+    trigger_words.reserve(params.grammar_trigger_words.size());
+    for (const auto & str : params.grammar_trigger_words) {
+        trigger_words.push_back(str.word.c_str());
+    }
+
    struct llama_sampler * grmr;
    if (params.grammar.compare(0, 11, "%llguidance") == 0) {
 #ifdef LLAMA_USE_LLGUIDANCE
@@ -159,12 +165,6 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
        GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
 #endif // LLAMA_USE_LLGUIDANCE
    } else {
-        std::vector<const char *> trigger_words;
-        trigger_words.reserve(params.grammar_trigger_words.size());
-        for (const auto & str : params.grammar_trigger_words) {
-            trigger_words.push_back(str.word.c_str());
-        }
-
        grmr = params.grammar_lazy
             ? llama_sampler_init_grammar_lazy(vocab, params.grammar.c_str(), "root",
                                               trigger_words.data(), trigger_words.size(),
@@ -188,51 +188,45 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                params.logit_bias.data()));

    if (params.mirostat == 0) {
-        if (params.top_n_sigma >= 0) {
-            llama_sampler_chain_add(result->chain, llama_sampler_init_top_k        (params.top_k));
-            llama_sampler_chain_add(result->chain, llama_sampler_init_temp         (params.temp));
-            llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma  (params.top_n_sigma));
-        } else {
-            for (const auto & cnstr : params.samplers) {
-                switch (cnstr) {
-                    case COMMON_SAMPLER_TYPE_DRY:
-                        {
-                            std::vector<const char *> c_breakers;
-                            c_breakers.reserve(params.dry_sequence_breakers.size());
-                            for (const auto & str : params.dry_sequence_breakers) {
-                                c_breakers.push_back(str.c_str());
-                            }
-
-                            llama_sampler_chain_add(result->chain, llama_sampler_init_dry      (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
+        for (const auto & cnstr : params.samplers) {
+            switch (cnstr) {
+                case COMMON_SAMPLER_TYPE_DRY:
+                    {
+                        std::vector<const char *> c_breakers;
+                        c_breakers.reserve(params.dry_sequence_breakers.size());
+                        for (const auto & str : params.dry_sequence_breakers) {
+                            c_breakers.push_back(str.c_str());
                        }
-                        break;
-                    case COMMON_SAMPLER_TYPE_TOP_K:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_k    (params.top_k));
-                        break;
-                    case COMMON_SAMPLER_TYPE_TOP_P:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_p    (params.top_p, params.min_keep));
-                        break;
-                    case COMMON_SAMPLER_TYPE_MIN_P:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_min_p    (params.min_p, params.min_keep));
-                        break;
-                    case COMMON_SAMPLER_TYPE_XTC:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_xtc      (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
-                        break;
-                    case COMMON_SAMPLER_TYPE_TYPICAL_P:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_typical  (params.typ_p, params.min_keep));
-                        break;
-                    case COMMON_SAMPLER_TYPE_TEMPERATURE:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
-                        break;
-                    case COMMON_SAMPLER_TYPE_INFILL:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_infill   (vocab));
-                        break;
-                    case COMMON_SAMPLER_TYPE_PENALTIES:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
-                        break;
-                    default:
-                        GGML_ASSERT(false && "unknown sampler type");
-                }
+
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_dry      (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
+                    }
+                    break;
+                case COMMON_SAMPLER_TYPE_TOP_K:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_k    (params.top_k));
+                    break;
+                case COMMON_SAMPLER_TYPE_TOP_P:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_p    (params.top_p, params.min_keep));
+                    break;
+                case COMMON_SAMPLER_TYPE_MIN_P:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_min_p    (params.min_p, params.min_keep));
+                    break;
+                case COMMON_SAMPLER_TYPE_XTC:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_xtc      (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
+                    break;
+                case COMMON_SAMPLER_TYPE_TYPICAL_P:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_typical  (params.typ_p, params.min_keep));
+                    break;
+                case COMMON_SAMPLER_TYPE_TEMPERATURE:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
+                    break;
+                case COMMON_SAMPLER_TYPE_INFILL:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_infill   (vocab));
+                    break;
+                case COMMON_SAMPLER_TYPE_PENALTIES:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
+                    break;
+                default:
+                    GGML_ASSERT(false && "unknown sampler type");
            }
        }
        llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -252,6 +252,11 @@ llama_tokens common_speculative_gen_draft(
        // add drafted token for each sequence
        const llama_token id = cur_p->data[0].id;

+        // only collect very high-confidence draft tokens
+        if (cur_p->data[0].p < params.p_min) {
+            break;
+        }
+
        common_sampler_accept(smpl, id, true);

        result.push_back(id);
@@ -260,11 +265,6 @@ llama_tokens common_speculative_gen_draft(
            break;
        }

-        // only collect very high-confidence draft tokens
-        if (cur_p->data[0].p < params.p_min) {
-            break;
-        }
-
        common_batch_add(batch, id, n_past + i + 1, { 0 }, true);

        // evaluate the drafted tokens on the draft model
--- a/common/speculative.h
+++ b/common/speculative.h
@@ -9,7 +9,7 @@ struct common_speculative_params {
    int n_draft = 16;  // max drafted tokens
    int n_reuse = 256;

-    float p_min = 0.75f; // min probability required to accept a token in the draft
+    float p_min = 0.9f; // min probability required to accept a token in the draft
 };

 struct common_speculative * common_speculative_init(struct llama_context * ctx_dft);
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -558,7 +558,7 @@ class Model:

    # NOTE: this function is generated by convert_hf_to_gguf_update.py
    #       do not modify it manually!
-    # ref:  https://github.com/ggml-org/llama.cpp/pull/6920
+    # ref:  https://github.com/ggerganov/llama.cpp/pull/6920
    # Marker: Start get_vocab_base_pre
    def get_vocab_base_pre(self, tokenizer) -> str:
        # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
@@ -699,9 +699,6 @@ class Model:
        if chkhsh == "b3f499bb4255f8ca19fccd664443283318f2fd2414d5e0b040fbdd0cc195d6c5":
            # ref: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
            res = "deepseek-r1-qwen"
-        if chkhsh == "ccc2ef013c104be7bae2965776d611e1d7a8a2a9c547dd93a682c9a9fc80352e":
-            # ref: https://huggingface.co/Xenova/gpt-4o
-            res = "gpt-4o"

        if res is None:
            logger.warning("\n")
@@ -711,7 +708,7 @@ class Model:
            logger.warning("**          - the model has not been added to convert_hf_to_gguf_update.py yet")
            logger.warning("**          - the pre-tokenization config has changed upstream")
            logger.warning("**          Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
-            logger.warning("** ref:     https://github.com/ggml-org/llama.cpp/pull/6920")
+            logger.warning("** ref:     https://github.com/ggerganov/llama.cpp/pull/6920")
            logger.warning("**")
            logger.warning(f"** chkhsh:  {chkhsh}")
            logger.warning("**************************************************************************************")
@@ -2515,8 +2512,7 @@ class Phi3MiniModel(Model):
        rms_eps = self.find_hparam(["rms_norm_eps"])
        max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
        orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
-        rot_pct = self.hparams.get("partial_rotary_factor", 1.0)
-        rope_dims = int(rot_pct * n_embd) // n_head
+        rope_dims = n_embd // n_head

        self.gguf_writer.add_context_length(max_pos_embds)
        self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds)
@@ -2540,8 +2536,7 @@ class Phi3MiniModel(Model):
        n_head = self.find_hparam(["num_attention_heads", "n_head"])
        max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
        orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
-        rot_pct = self.hparams.get("partial_rotary_factor", 1.0)
-        rope_dims = int(rot_pct * n_embd) // n_head
+        rope_dims = n_embd // n_head

        # write rope scaling for long context (128k) model
        rope_scaling = self.find_hparam(['rope_scaling'], True)
@@ -2570,7 +2565,7 @@ class Phi3MiniModel(Model):
            raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')

        if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
-            raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}. long_factors = {len(long_factors)}, short_factors = {len(short_factors)}.')
+            raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')

        yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
        yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
@@ -2840,7 +2835,7 @@ class InternLM2Model(Model):
        if chat_eos_token_id is not None:
            # For the chat model, we replace the eos with '<|im_end|>'.
            # TODO: this is a hack, should be fixed
-            #       https://github.com/ggml-org/llama.cpp/pull/6745#issuecomment-2067687048
+            #       https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048
            special_vocab.special_token_ids["eos"] = chat_eos_token_id
            logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}"
                           " in chat mode so that the conversation can end normally.")
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -8,7 +8,7 @@
 # provide the necessary information to llama.cpp via the GGUF header in order to implement
 # the same pre-tokenizer.
 #
-# ref: https://github.com/ggml-org/llama.cpp/pull/6920
+# ref: https://github.com/ggerganov/llama.cpp/pull/6920
 #
 # Instructions:
 #
@@ -109,7 +109,6 @@ models = [
    {"name": "megrez",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Infinigence/Megrez-3B-Instruct"},
    {"name": "deepseek-v3",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-V3"},
    {"name": "deepseek-r1-qwen", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"},
-    {"name": "gpt-4o",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Xenova/gpt-4o", },
 ]


@@ -132,10 +131,6 @@ def download_model(model):

    files = ["config.json", "tokenizer.json", "tokenizer_config.json"]

-    if name == "gpt-4o":
-        # Xenova/gpt-4o is tokenizer-only, it does not contain config.json
-        files = ["tokenizer.json", "tokenizer_config.json"]
-
    if tokt == TOKENIZER_TYPE.SPM:
        files.append("tokenizer.model")

@@ -251,7 +246,7 @@ src_func = f"""
            logger.warning("**          - the model has not been added to convert_hf_to_gguf_update.py yet")
            logger.warning("**          - the pre-tokenization config has changed upstream")
            logger.warning("**          Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
-            logger.warning("** ref:     https://github.com/ggml-org/llama.cpp/pull/6920")
+            logger.warning("** ref:     https://github.com/ggerganov/llama.cpp/pull/6920")
            logger.warning("**")
            logger.warning(f"** chkhsh:  {{chkhsh}}")
            logger.warning("**************************************************************************************")
--- a/convert_lora_to_gguf.py
+++ b/convert_lora_to_gguf.py
@@ -395,7 +395,7 @@ if __name__ == '__main__':
                        logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
                        if ".embed_tokens.weight" in name or ".lm_head.weight" in name:
                            logger.error("Embeddings is present in the adapter. This can be due to new tokens added during fine tuning")
-                            logger.error("Please refer to https://github.com/ggml-org/llama.cpp/pull/9948")
+                            logger.error("Please refer to https://github.com/ggerganov/llama.cpp/pull/9948")
                        sys.exit(1)

                    if base_name in tensor_map:
@@ -419,7 +419,7 @@ if __name__ == '__main__':
                # some archs may have the same tensor for lm_head and output (tie word embeddings)
                # in this case, adapters targeting lm_head will fail when using llama-export-lora
                # therefore, we ignore them for now
-                # see: https://github.com/ggml-org/llama.cpp/issues/9065
+                # see: https://github.com/ggerganov/llama.cpp/issues/9065
                if name == "lm_head.weight" and len(dest) == 0:
                    raise ValueError("lm_head is present in adapter, but is ignored in base model")
                for dest_name, dest_data in dest:
--- a/docs/android.md
+++ b/docs/android.md
@@ -12,7 +12,7 @@ $ apt update && apt upgrade -y
 $ apt install git cmake
 ```

-Then, follow the [build instructions](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md), specifically for CMake.
+Then, follow the [build instructions](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md), specifically for CMake.

 Once the binaries are built, download your model of choice (e.g., from Hugging Face). It's recommended to place it in the `~/` directory for best performance:

--- a/docs/backend/OPENCL.md
+++ b/docs/backend/OPENCL.md
@@ -1,205 +0,0 @@
-# llama.cpp for OpenCL
-
- [Background](#background)
- [OS](#os)
- [Hardware](#hardware)
- [DataType Supports](#datatype-supports)
- [Model Preparation](#model-preparation)
- [CMake Options](#cmake-options)
- [Android](#android)
- [Windows 11 Arm64](#windows-11-arm64)
- [Known Issue](#known-issues)
- [TODO](#todo)
-
-## Background
-
-OpenCL (Open Computing Language) is an open, royalty-free standard for cross-platform, parallel programming of diverse accelerators found in supercomputers, cloud servers, personal computers, mobile devices and embedded platforms. OpenCL specifies a programming language (based on C99) for programming these devices and application programming interfaces (APIs) to control the platform and execute programs on the compute devices. Similar to CUDA, OpenCL has been widely used to program GPUs and is supported by most GPU vendors.
-
-### Llama.cpp + OpenCL
-
-The llama.cpp OpenCL backend is designed to enable llama.cpp on **Qualcomm Adreno GPU** firstly via OpenCL. Thanks to the portabilty of OpenCL, the OpenCL backend can also run on certain Intel GPUs although the performance is not optimal.
-
-## OS
-
-| OS      | Status  | Verified                                       |
-|---------|---------|------------------------------------------------|
-| Android | Support | Snapdragon 8 Gen 3, Snapdragon 8 Elite         |
-| Windows | Support | Windows 11 Arm64 with Snapdragon X Elite       |
-| Linux   | Support | Ubuntu 22.04 WSL2 with Intel 12700H            |
-
-## Hardware
-
-### Adreno GPU
-
-**Verified devices**
-
-| Adreno GPU                           | Status  |
-|:------------------------------------:|:-------:|
-| Adreno 750 (Snapdragon 8 Gen 3)      | Support |
-| Adreno 830 (Snapdragon 8 Elite)      | Support |
-| Adreno X85 (Snapdragon X Elite)      | Support |
-
-## DataType Supports
-
-| DataType               | Status                     |
-|:----------------------:|:--------------------------:|
-| Q4_0                   | Support                    |
-| Q6_K                   | Support, but not optimized |
-
-## Model Preparation
-
-You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model prepration.
-
-Currently we support `Q4_0` quantization and have optimize for it. To achieve best performance on Adreno GPU, add `--pure` to `llama-quantize`. For example,
-
-```sh
-./llama-quantize --pure ggml-model-qwen2.5-3b-f16.gguf ggml-model-qwen-3b-Q4_0.gguf Q4_0
-```
-
-Since `Q6_K` is also supported, `Q4_0` quantization without `--pure` will also work. However, the performance will be worse compared to pure `Q4_0` quantization.
-
-## CMake Options
-
-The OpenCL backend has the following CMake options that control the behavior of the backend.
-
-| CMake options                     | Default value  | Description                               |
-|:---------------------------------:|:--------------:|:------------------------------------------|
-| `GGML_OPENCL_EMBED_KERNELS`       | `ON`           | Embed OpenCL kernels into the executable. |
-| `GGML_OPENCL_USE_ADRENO_KERNELS`  | `ON`           | Use kernels optimized for Adreno.         |
-
-## Android
-
-Ubuntu 22.04 is used for targeting Android. Make sure the following tools are accessible from command line,
-
-* Git
-* CMake 3.29
-* Ninja
-* Python3
-
-### I. Setup Environment
-
-1. **Install NDK**
-
-```sh
-cd ~
-wget https://dl.google.com/android/repository/commandlinetools-linux-8512546_latest.zip && \
-unzip commandlinetools-linux-8512546_latest.zip && \
-mkdir -p ~/android-sdk/cmdline-tools && \
-mv cmdline-tools latest && \
-mv latest ~/android-sdk/cmdline-tools/ && \
-rm -rf commandlinetools-linux-8512546_latest.zip
-
-yes | ~/android-sdk/cmdline-tools/latest/bin/sdkmanager "ndk;26.3.11579264"
-```
-
-2. **Install OpenCL Headers and Library**
-
-```sh
-mkdir -p ~/dev/llm
-cd ~/dev/llm
-
-git clone https://github.com/KhronosGroup/OpenCL-Headers && \
-cd OpenCL-Headers && \
-cp -r CL ~/android-sdk/ndk/26.3.11579264/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
-
-cd ~/dev/llm
-
-git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader && \
-cd OpenCL-ICD-Loader && \
-mkdir build_ndk26 && cd build_ndk26 && \
-cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release \
-  -DCMAKE_TOOLCHAIN_FILE=$HOME/android-sdk/ndk/26.3.11579264/build/cmake/android.toolchain.cmake \
-  -DOPENCL_ICD_LOADER_HEADERS_DIR=$HOME/android-sdk/ndk/26.3.11579264/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include \
-  -DANDROID_ABI=arm64-v8a \
-  -DANDROID_PLATFORM=24 \
-  -DANDROID_STL=c++_shared && \
-ninja && \
-cp libOpenCL.so ~/android-sdk/ndk/26.3.11579264/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
-```
-
-### II. Build llama.cpp
-
-```sh
-cd ~/dev/llm
-
-git clone https://github.com/ggml-org/llama.cpp && \
-cd llama.cpp && \
-mkdir build-android && cd build-android
-
-cmake .. -G Ninja \
-  -DCMAKE_TOOLCHAIN_FILE=$HOME/android-sdk/ndk/26.3.11579264/build/cmake/android.toolchain.cmake \
-  -DANDROID_ABI=arm64-v8a \
-  -DANDROID_PLATFORM=android-28 \
-  -DBUILD_SHARED_LIBS=OFF \
-  -DGGML_OPENCL=ON
-
-ninja
-```
-
-## Windows 11 Arm64
-
-A Snapdragon X Elite device with Windows 11 Arm64 is used. Make sure the following tools are accessible from command line,
-
-* Git
-* CMake 3.29
-* Clang 19
-* Ninja
-* Visual Studio 2022
-
-Powershell is used for the following instructions.
-
-### I. Setup Environment
-
-1. **Install OpenCL Headers and Library**
-
-```powershell
-mkdir -p ~/dev/llm
-
-cd ~/dev/llm
-git clone https://github.com/KhronosGroup/OpenCL-Headers && cd OpenCL-Headers
-mkdir build && cd build
-cmake .. -G Ninja `
-  -DBUILD_TESTING=OFF `
-  -DOPENCL_HEADERS_BUILD_TESTING=OFF `
-  -DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
-  -DCMAKE_INSTALL_PREFIX="$HOME/dev/llm/opencl"
-cmake --build . --target install
-
-cd ~/dev/llm
-git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader && cd OpenCL-ICD-Loader
-mkdir build && cd build
-cmake .. -G Ninja `
-  -DCMAKE_BUILD_TYPE=Release `
-  -DCMAKE_PREFIX_PATH="$HOME/dev/llm/opencl" `
-  -DCMAKE_INSTALL_PREFIX="$HOME/dev/llm/opencl"
-cmake --build . --target install
-```
-
-### II. Build llama.cpp
-
-```powershell
-
-mkdir -p ~/dev/llm
-cd ~/dev/llm
-
-git clone https://github.com/ggml-org/llama.cpp && cd llama.cpp
-mkdir build && cd build
-
-cmake .. -G Ninja `
-  -DCMAKE_TOOLCHAIN_FILE="$HOME/dev/llm/llama.cpp/cmake/arm64-windows-llvm.cmake" `
-  -DCMAKE_BUILD_TYPE=Release `
-  -DCMAKE_PREFIX_PATH="$HOME/dev/llm/opencl" `
-  -DBUILD_SHARED_LIBS=OFF `
-  -DGGML_OPENCL=ON
-ninja
-```
-
-## Known Issues
-
- Qwen2.5 0.5B model produces gibberish output with Adreno kernels.
-
-## TODO
-
- Fix Qwen2.5 0.5B
- Optimization for Q6_K
- Support and optimization for Q4_K
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -36,22 +36,12 @@ The following release is verified with good quality:

 |Commit ID|Tag|Release|Verified  Platform| Update date|
 |-|-|-|-|-|
-|3bcd40b3c593d14261fb2abfabad3c0fb5b9e318|b4040 |[llama-b4040-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b4040/llama-b4040-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1| 2024-11-19|
-|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1||
+|3bcd40b3c593d14261fb2abfabad3c0fb5b9e318|b4040 |[llama-b4040-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b4040/llama-b4040-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1| 2024-11-19|
+|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1||


 ## News

- 2025.2
-  - Optimize MUL_MAT Q4_0 on Intel GPU for all dGPUs and built-in GPUs since MTL. Increase the performance of LLM (llama-2-7b.Q4_0.gguf) 21%-87% on Intel GPUs (MTL, ARL-H, Arc, Flex, PVC).
-    |GPU|Base tokens/s|Increased tokens/s|Percent|
-    |-|-|-|-|
-    |PVC 1550|39|73|+87%|
-    |Flex 170|39|50|+28%|
-    |Arc770|42|55|+30%|
-    |MTL|13|16|+23%|
-    |ARL-H|14|17|+21%|
-
 - 2024.11
  - Use syclcompat to improve the performance on some platforms. This requires to use oneAPI 2025.0 or newer.

@@ -68,7 +58,7 @@ The following release is verified with good quality:
 - 2024.3
  - Release binary files of Windows.
  - A blog is published: **Run LLM on all Intel GPUs Using llama.cpp**: [intel.com](https://www.intel.com/content/www/us/en/developer/articles/technical/run-llm-on-all-gpus-using-llama-cpp-artical.html) or [medium.com](https://medium.com/@jianyu_neo/run-llm-on-all-intel-gpus-using-llama-cpp-fd2e2dcbd9bd).
-  - New base line is ready: [tag b2437](https://github.com/ggml-org/llama.cpp/tree/b2437).
+  - New base line is ready: [tag b2437](https://github.com/ggerganov/llama.cpp/tree/b2437).
  - Support multiple cards: **--split-mode**: [none|layer]; not support [row], it's on developing.
  - Support to assign main GPU by **--main-gpu**, replace $GGML_SYCL_DEVICE.
  - Support detecting all GPUs with level-zero and same top **Max compute units**.
@@ -107,8 +97,8 @@ SYCL backend supports Intel GPU Family:
 | Intel Data Center Max Series  | Support | Max 1550, 1100                        |
 | Intel Data Center Flex Series | Support | Flex 170                              |
 | Intel Arc Series              | Support | Arc 770, 730M, Arc A750               |
-| Intel built-in Arc GPU        | Support | built-in Arc GPU in Meteor Lake, Arrow Lake    |
-| Intel iGPU                    | Support | iGPU in 13700k,iGPU in 13400, i5-1250P, i7-1260P, i7-1165G7 |
+| Intel built-in Arc GPU        | Support | built-in Arc GPU in Meteor Lake       |
+| Intel iGPU                    | Support | iGPU in 13700k, i5-1250P, i7-1260P, i7-1165G7 |

 *Notes:*

@@ -670,10 +660,8 @@ use 1 SYCL GPUs: [0] with Max compute units:512
 | Name              | Value            | Function                                                                                                                  |
 |-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------|
 | GGML_SYCL_DEBUG   | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG                                                                             |
-| GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features based on Intel GPU type, to compare the performance increase |
 | ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |

-
 ## Known Issues

 - `Split-mode:[row]` is not supported.
--- a/docs/build.md
+++ b/docs/build.md
@@ -3,7 +3,7 @@
 **To get the Code:**

 ```bash
-git clone https://github.com/ggml-org/llama.cpp
+git clone https://github.com/ggerganov/llama.cpp
 cd llama.cpp
 ```

@@ -46,7 +46,7 @@ cmake --build build --config Release
  ```

 - Building for Windows (x86, x64 and arm64) with MSVC or clang as compilers:
-    - Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/vs/community/). In the installer, select at least the following options (this also automatically installs the required additional tools like CMake,...):
+    - Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/vs/community/). In the installer, select at least the following options (this also automatically installs the required additional tools like CMake,...):
    - Tab Workload: Desktop-development with C++
    - Tab Components (select quickly via search): C++-_CMake_ Tools for Windows, _Git_ for Windows, C++-_Clang_ Compiler for Windows, MS-Build Support for LLVM-Toolset (clang)
    - Please remember to always use a Developer Command Prompt / PowerShell for VS2022 for git, build, test
@@ -206,14 +206,6 @@ This provides GPU acceleration using the MUSA cores of your Moore Threads MTT GP
  cmake --build build --config Release
  ```

-  For static build:
-
-  ```bash
-  cmake -B build -DGGML_MUSA=ON \
-    -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-  cmake --build build --config Release
-  ```
-
 The environment variable [`MUSA_VISIBLE_DEVICES`](https://docs.mthreads.com/musa-sdk/musa-sdk-doc-online/programming_guide/Z%E9%99%84%E5%BD%95/) can be used to specify which GPU(s) will be used.

 The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted.
--- a/docs/cuda-fedora.md
+++ b/docs/cuda-fedora.md
@@ -248,7 +248,7 @@ You have successfully set up CUDA on Fedora within a toolbox environment using t

 - **Building `llama.cpp`:**

-  - With CUDA installed, you can follow these [build instructions for `llama.cpp`](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md) to compile it with CUDA support.
+  - With CUDA installed, you can follow these [build instructions for `llama.cpp`](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md) to compile it with CUDA support.
  - Ensure that any CUDA-specific build flags or paths are correctly set in your build configuration.

 - **Using the Toolbox Environment:**
--- a/docs/development/HOWTO-add-model.md
+++ b/docs/development/HOWTO-add-model.md
@@ -104,16 +104,16 @@ Note: to debug the inference graph: you can use [llama-eval-callback](/examples/

 ## GGUF specification

-https://github.com/ggml-org/ggml/blob/master/docs/gguf.md
+https://github.com/ggerganov/ggml/blob/master/docs/gguf.md

 ## Resources

- YaRN RoPE scaling https://github.com/ggml-org/llama.cpp/pull/2268
- support Baichuan serial models https://github.com/ggml-org/llama.cpp/pull/3009
- support attention bias https://github.com/ggml-org/llama.cpp/pull/4283
- Mixtral support https://github.com/ggml-org/llama.cpp/pull/4406
- BERT embeddings https://github.com/ggml-org/llama.cpp/pull/5423
- Grok-1 support https://github.com/ggml-org/llama.cpp/pull/6204
- Command R Plus support https://github.com/ggml-org/llama.cpp/pull/6491
- support arch DBRX https://github.com/ggml-org/llama.cpp/pull/6515
- How to convert HuggingFace model to GGUF format https://github.com/ggml-org/llama.cpp/discussions/2948
+- YaRN RoPE scaling https://github.com/ggerganov/llama.cpp/pull/2268
+- support Baichuan serial models https://github.com/ggerganov/llama.cpp/pull/3009
+- support attention bias https://github.com/ggerganov/llama.cpp/pull/4283
+- Mixtral support https://github.com/ggerganov/llama.cpp/pull/4406
+- BERT embeddings https://github.com/ggerganov/llama.cpp/pull/5423
+- Grok-1 support https://github.com/ggerganov/llama.cpp/pull/6204
+- Command R Plus support https://github.com/ggerganov/llama.cpp/pull/6491
+- support arch DBRX https://github.com/ggerganov/llama.cpp/pull/6515
+- How to convert HuggingFace model to GGUF format https://github.com/ggerganov/llama.cpp/discussions/2948
--- a/docs/docker.md
+++ b/docs/docker.md
@@ -7,21 +7,21 @@
 ## Images
 We have three Docker images available for this project:

-1. `ghcr.io/ggml-org/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
-2. `ghcr.io/ggml-org/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
-3. `ghcr.io/ggml-org/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`)
+1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
+2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
+3. `ghcr.io/ggerganov/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`)

 Additionally, there the following images, similar to the above:

- `ghcr.io/ggml-org/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:server-cuda`: Same as `server` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggml-org/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggml-org/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggml-org/llama.cpp:full-musa`: Same as `full` but compiled with MUSA support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:light-musa`: Same as `light` but compiled with MUSA support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:server-musa`: Same as `server` but compiled with MUSA support. (platforms: `linux/amd64`)
+- `ghcr.io/ggerganov/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`)
+- `ghcr.io/ggerganov/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`)
+- `ghcr.io/ggerganov/llama.cpp:server-cuda`: Same as `server` but compiled with CUDA support. (platforms: `linux/amd64`)
+- `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
+- `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
+- `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
+- `ghcr.io/ggerganov/llama.cpp:full-musa`: Same as `full` but compiled with MUSA support. (platforms: `linux/amd64`)
+- `ghcr.io/ggerganov/llama.cpp:light-musa`: Same as `light` but compiled with MUSA support. (platforms: `linux/amd64`)
+- `ghcr.io/ggerganov/llama.cpp:server-musa`: Same as `server` but compiled with MUSA support. (platforms: `linux/amd64`)

 The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA, ROCm or MUSA library, you'll need to build the images locally for now).

@@ -32,25 +32,25 @@ The easiest way to download the models, convert them to ggml and optimize them i
 Replace `/path/to/models` below with the actual path where you downloaded the models.

 ```bash
-docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --all-in-one "/models/" 7B
+docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B
 ```

 On completion, you are ready to play!

 ```bash
-docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
+docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
 ```

 or with a light image:

 ```bash
-docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
+docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
 ```

 or with a server image:

 ```bash
-docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggml-org/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512
+docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggerganov/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512
 ```

 ## Docker With CUDA
@@ -69,7 +69,7 @@ You may want to pass in some different `ARGS`, depending on the CUDA environment

 The defaults are:

- `CUDA_VERSION` set to `12.4.0`
+- `CUDA_VERSION` set to `12.6.0`
 - `CUDA_DOCKER_ARCH` set to the cmake build default, which includes all the supported architectures

 The resulting images, are essentially the same as the non-CUDA images:
@@ -104,7 +104,7 @@ You may want to pass in some different `ARGS`, depending on the MUSA environment

 The defaults are:

- `MUSA_VERSION` set to `rc3.1.1`
+- `MUSA_VERSION` set to `rc3.1.0`

 The resulting images, are essentially the same as the non-MUSA images:

--- a/docs/function-calling.md
+++ b/docs/function-calling.md
@@ -1,390 +0,0 @@
-# Function Calling
-
-[chat.h](../common/chat.h) (https://github.com/ggml-org/llama.cpp/pull/9639) adds support for [OpenAI-style function calling](https://platform.openai.com/docs/guides/function-calling) and is used in:
- `llama-server` when started w/ `--jinja` flag
- `llama-cli` (WIP: https://github.com/ggml-org/llama.cpp/pull/11556)
-
-## Universal support w/ Native & Generic handlers
-
-Function calling is supported for all models (see https://github.com/ggml-org/llama.cpp/pull/9639):
-
- Native tool call formats supported:
-  - Llama 3.1 / 3.3 (including builtin tools support - tool names for `wolfram_alpha`, `web_search` / `brave_search`, `code_interpreter`), Llama 3.2
-  - Functionary v3.1 / v3.2
-  - Hermes 2/3, Qwen 2.5
-  - Qwen 2.5 Coder (WIP: https://github.com/ggml-org/llama.cpp/pull/12034)
-  - Mistral Nemo
-  - Firefunction v2
-  - Command R7B
-  - DeepSeek R1 (WIP / seems reluctant to call any tools?)
-
- Generic tool call is supported when the template isn't recognized by native format handlers (you'll see `Chat format: Generic` in the logs).
-  - Use `--chat-template-file` to override the template when appropriate (see examples below)
-  - Generic support may consume more tokens and be less efficient than a model's native format.
-
-<details>
-<summary>Show some common templates and which format handler they use</summary>
-
-| Template | Format |
-|----------|--------|
-| Almawave-Velvet-14B.jinja | Hermes 2 Pro |
-| AtlaAI-Selene-1-Mini-Llama-3.1-8B.jinja | Llama 3.x |
-| CohereForAI-aya-expanse-8b.jinja | Generic |
-| CohereForAI-c4ai-command-r-plus-default.jinja | Generic |
-| CohereForAI-c4ai-command-r-plus-rag.jinja | Generic |
-| CohereForAI-c4ai-command-r-plus-tool_use.jinja | Generic |
-| CohereForAI-c4ai-command-r7b-12-2024-default.jinja | Command R7B (extract reasoning) |
-| CohereForAI-c4ai-command-r7b-12-2024-rag.jinja | Command R7B (extract reasoning) |
-| CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja | Command R7B (extract reasoning) |
-| CohereForAI-c4ai-command-r7b-12-2024.jinja | Generic |
-| DavieLion-Llama-3.2-1B-SPIN-iter3.jinja | Generic |
-| Delta-Vector-Rei-12B.jinja | Mistral Nemo |
-| EpistemeAI-Mistral-Nemo-Instruct-12B-Philosophy-Math.jinja | Mistral Nemo |
-| FlofloB-83k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit.jinja | Hermes 2 Pro |
-| FlofloB-test_continued_pretraining_Phi-3-mini-4k-instruct_Unsloth_merged_16bit.jinja | Generic |
-| HelpingAI-HAI-SER.jinja | Generic |
-| HuggingFaceTB-SmolLM2-1.7B-Instruct.jinja | Generic |
-| HuggingFaceTB-SmolLM2-135M-Instruct.jinja | Generic |
-| HuggingFaceTB-SmolLM2-360M-Instruct.jinja | Generic |
-| INSAIT-Institute-BgGPT-Gemma-2-27B-IT-v1.0.jinja | Generic |
-| Ihor-Text2Graph-R1-Qwen2.5-0.5b.jinja | Hermes 2 Pro |
-| Infinigence-Megrez-3B-Instruct.jinja | Generic |
-| Josephgflowers-TinyLlama_v1.1_math_code-world-test-1.jinja | Generic |
-| LGAI-EXAONE-EXAONE-3.5-2.4B-Instruct.jinja | Generic |
-| LGAI-EXAONE-EXAONE-3.5-7.8B-Instruct.jinja | Generic |
-| LatitudeGames-Wayfarer-12B.jinja | Generic |
-| Magpie-Align-Llama-3-8B-Magpie-Align-v0.1.jinja | Generic |
-| Magpie-Align-Llama-3.1-8B-Magpie-Align-v0.1.jinja | Generic |
-| MaziyarPanahi-calme-3.2-instruct-78b.jinja | Generic |
-| MiniMaxAI-MiniMax-Text-01.jinja | Generic |
-| MiniMaxAI-MiniMax-VL-01.jinja | Generic |
-| NaniDAO-deepseek-r1-qwen-2.5-32B-ablated.jinja | DeepSeek R1 (extract reasoning) |
-| NexaAIDev-Octopus-v2.jinja | Generic |
-| NousResearch-Hermes-2-Pro-Llama-3-8B-default.jinja | Generic |
-| NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja | Hermes 2 Pro |
-| NousResearch-Hermes-2-Pro-Mistral-7B-default.jinja | Generic |
-| NousResearch-Hermes-2-Pro-Mistral-7B-tool_use.jinja | Hermes 2 Pro |
-| NousResearch-Hermes-3-Llama-3.1-70B-default.jinja | Generic |
-| NousResearch-Hermes-3-Llama-3.1-70B-tool_use.jinja | Hermes 2 Pro |
-| NovaSky-AI-Sky-T1-32B-Flash.jinja | Hermes 2 Pro |
-| NovaSky-AI-Sky-T1-32B-Preview.jinja | Hermes 2 Pro |
-| OnlyCheeini-greesychat-turbo.jinja | Generic |
-| Orenguteng-Llama-3.1-8B-Lexi-Uncensored-V2.jinja | Llama 3.x |
-| OrionStarAI-Orion-14B-Chat.jinja | Generic |
-| PowerInfer-SmallThinker-3B-Preview.jinja | Generic |
-| PrimeIntellect-INTELLECT-1-Instruct.jinja | Generic |
-| Qwen-QVQ-72B-Preview.jinja | Generic |
-| Qwen-QwQ-32B-Preview.jinja | Hermes 2 Pro |
-| Qwen-Qwen1.5-7B-Chat.jinja | Generic |
-| Qwen-Qwen2-7B-Instruct.jinja | Generic |
-| Qwen-Qwen2-VL-72B-Instruct.jinja | Generic |
-| Qwen-Qwen2-VL-7B-Instruct.jinja | Generic |
-| Qwen-Qwen2.5-0.5B.jinja | Hermes 2 Pro |
-| Qwen-Qwen2.5-1.5B-Instruct.jinja | Hermes 2 Pro |
-| Qwen-Qwen2.5-14B-Instruct-1M.jinja | Hermes 2 Pro |
-| Qwen-Qwen2.5-14B.jinja | Hermes 2 Pro |
-| Qwen-Qwen2.5-32B-Instruct.jinja | Hermes 2 Pro |
-| Qwen-Qwen2.5-32B.jinja | Hermes 2 Pro |
-| Qwen-Qwen2.5-3B-Instruct.jinja | Hermes 2 Pro |
-| Qwen-Qwen2.5-72B-Instruct.jinja | Hermes 2 Pro |
-| Qwen-Qwen2.5-7B-Instruct-1M.jinja | Hermes 2 Pro |
-| Qwen-Qwen2.5-7B-Instruct.jinja | Hermes 2 Pro |
-| Qwen-Qwen2.5-7B.jinja | Hermes 2 Pro |
-| Qwen-Qwen2.5-Coder-32B-Instruct.jinja | Hermes 2 Pro |
-| Qwen-Qwen2.5-Coder-7B-Instruct.jinja | Hermes 2 Pro |
-| Qwen-Qwen2.5-Math-1.5B.jinja | Hermes 2 Pro |
-| Qwen-Qwen2.5-Math-7B-Instruct.jinja | Hermes 2 Pro |
-| Qwen-Qwen2.5-VL-3B-Instruct.jinja | Hermes 2 Pro |
-| Qwen-Qwen2.5-VL-72B-Instruct.jinja | Hermes 2 Pro |
-| Qwen-Qwen2.5-VL-7B-Instruct.jinja | Hermes 2 Pro |
-| RWKV-Red-Team-ARWKV-7B-Preview-0.1.jinja | Hermes 2 Pro |
-| SakanaAI-TinySwallow-1.5B-Instruct.jinja | Hermes 2 Pro |
-| SakanaAI-TinySwallow-1.5B.jinja | Hermes 2 Pro |
-| Sao10K-70B-L3.3-Cirrus-x1.jinja | Llama 3.x |
-| SentientAGI-Dobby-Mini-Leashed-Llama-3.1-8B.jinja | Llama 3.x |
-| SentientAGI-Dobby-Mini-Unhinged-Llama-3.1-8B.jinja | Llama 3.x |
-| Steelskull-L3.3-Damascus-R1.jinja | Llama 3.x |
-| Steelskull-L3.3-MS-Nevoria-70b.jinja | Llama 3.x |
-| Steelskull-L3.3-Nevoria-R1-70b.jinja | Llama 3.x |
-| THUDM-glm-4-9b-chat.jinja | Generic |
-| THUDM-glm-edge-1.5b-chat.jinja | Generic |
-| Tarek07-Progenitor-V1.1-LLaMa-70B.jinja | Llama 3.x |
-| TheBloke-FusionNet_34Bx2_MoE-AWQ.jinja | Generic |
-| TinyLlama-TinyLlama-1.1B-Chat-v1.0.jinja | Generic |
-| UCLA-AGI-Mistral7B-PairRM-SPPO-Iter3.jinja | Generic |
-| ValiantLabs-Llama3.1-8B-Enigma.jinja | Llama 3.x |
-| abacusai-Fewshot-Metamath-OrcaVicuna-Mistral.jinja | Generic |
-| ai21labs-AI21-Jamba-1.5-Large.jinja | Generic |
-| allenai-Llama-3.1-Tulu-3-405B-SFT.jinja | Generic |
-| allenai-Llama-3.1-Tulu-3-405B.jinja | Generic |
-| allenai-Llama-3.1-Tulu-3-8B.jinja | Generic |
-| arcee-ai-Virtuoso-Lite.jinja | Hermes 2 Pro |
-| arcee-ai-Virtuoso-Medium-v2.jinja | Hermes 2 Pro |
-| arcee-ai-Virtuoso-Small-v2.jinja | Hermes 2 Pro |
-| avemio-GRAG-NEMO-12B-ORPO-HESSIAN-AI.jinja | Generic |
-| bespokelabs-Bespoke-Stratos-7B.jinja | Hermes 2 Pro |
-| bfuzzy1-acheron-m1a-llama.jinja | Generic |
-| bofenghuang-vigogne-2-70b-chat.jinja | Generic |
-| bytedance-research-UI-TARS-72B-DPO.jinja | Generic |
-| bytedance-research-UI-TARS-7B-DPO.jinja | Generic |
-| bytedance-research-UI-TARS-7B-SFT.jinja | Generic |
-| carsenk-phi3.5_mini_exp_825_uncensored.jinja | Generic |
-| cyberagent-DeepSeek-R1-Distill-Qwen-14B-Japanese.jinja | DeepSeek R1 (extract reasoning) |
-| cyberagent-DeepSeek-R1-Distill-Qwen-32B-Japanese.jinja | DeepSeek R1 (extract reasoning) |
-| databricks-dbrx-instruct.jinja | Generic |
-| deepseek-ai-DeepSeek-Coder-V2-Instruct.jinja | Generic |
-| deepseek-ai-DeepSeek-Coder-V2-Lite-Base.jinja | Generic |
-| deepseek-ai-DeepSeek-Coder-V2-Lite-Instruct.jinja | Generic |
-| deepseek-ai-DeepSeek-R1-Distill-Llama-70B.jinja | DeepSeek R1 (extract reasoning) |
-| deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja | DeepSeek R1 (extract reasoning) |
-| deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B.jinja | DeepSeek R1 (extract reasoning) |
-| deepseek-ai-DeepSeek-R1-Distill-Qwen-14B.jinja | DeepSeek R1 (extract reasoning) |
-| deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja | DeepSeek R1 (extract reasoning) |
-| deepseek-ai-DeepSeek-R1-Distill-Qwen-7B.jinja | DeepSeek R1 (extract reasoning) |
-| deepseek-ai-DeepSeek-R1-Zero.jinja | DeepSeek R1 (extract reasoning) |
-| deepseek-ai-DeepSeek-R1.jinja | DeepSeek R1 (extract reasoning) |
-| deepseek-ai-DeepSeek-V2-Lite.jinja | Generic |
-| deepseek-ai-DeepSeek-V2.5.jinja | DeepSeek R1 (extract reasoning) |
-| deepseek-ai-DeepSeek-V3.jinja | DeepSeek R1 (extract reasoning) |
-| deepseek-ai-deepseek-coder-33b-instruct.jinja | Generic |
-| deepseek-ai-deepseek-coder-6.7b-instruct.jinja | Generic |
-| deepseek-ai-deepseek-coder-7b-instruct-v1.5.jinja | Generic |
-| deepseek-ai-deepseek-llm-67b-chat.jinja | Generic |
-| deepseek-ai-deepseek-llm-7b-chat.jinja | Generic |
-| dicta-il-dictalm2.0-instruct.jinja | Generic |
-| ehristoforu-Falcon3-8B-Franken-Basestruct.jinja | Hermes 2 Pro |
-| fireworks-ai-llama-3-firefunction-v2.jinja | FireFunction v2 |
-| godlikehhd-alpaca_data_sampled_ifd_new_5200.jinja | Hermes 2 Pro |
-| godlikehhd-alpaca_data_score_max_0.7_2600.jinja | Hermes 2 Pro |
-| google-gemma-2-27b-it.jinja | Generic |
-| google-gemma-2-2b-it.jinja | Generic |
-| google-gemma-2-2b-jpn-it.jinja | Generic |
-| google-gemma-7b-it.jinja | Generic |
-| huihui-ai-DeepSeek-R1-Distill-Llama-70B-abliterated.jinja | DeepSeek R1 (extract reasoning) |
-| huihui-ai-DeepSeek-R1-Distill-Llama-8B-abliterated.jinja | DeepSeek R1 (extract reasoning) |
-| huihui-ai-DeepSeek-R1-Distill-Qwen-14B-abliterated-v2.jinja | DeepSeek R1 (extract reasoning) |
-| huihui-ai-DeepSeek-R1-Distill-Qwen-32B-abliterated.jinja | DeepSeek R1 (extract reasoning) |
-| huihui-ai-DeepSeek-R1-Distill-Qwen-7B-abliterated-v2.jinja | DeepSeek R1 (extract reasoning) |
-| huihui-ai-Qwen2.5-14B-Instruct-1M-abliterated.jinja | Hermes 2 Pro |
-| ibm-granite-granite-3.1-8b-instruct.jinja | Generic |
-| indischepartij-MiniCPM-3B-OpenHermes-2.5-v2.jinja | Generic |
-| inflatebot-MN-12B-Mag-Mell-R1.jinja | Generic |
-| jinaai-ReaderLM-v2.jinja | Generic |
-| kms7530-chemeng_qwen-math-7b_24_1_100_1_nonmath.jinja | Hermes 2 Pro |
-| knifeayumu-Cydonia-v1.3-Magnum-v4-22B.jinja | Mistral Nemo |
-| langgptai-qwen1.5-7b-chat-sa-v0.1.jinja | Generic |
-| lightblue-DeepSeek-R1-Distill-Qwen-7B-Japanese.jinja | DeepSeek R1 (extract reasoning) |
-| mattshumer-Reflection-Llama-3.1-70B.jinja | Generic |
-| meetkai-functionary-medium-v3.1.jinja | Functionary v3.1 Llama 3.1 |
-| meetkai-functionary-medium-v3.2.jinja | Functionary v3.2 |
-| meta-llama-Llama-2-7b-chat-hf.jinja | Generic |
-| meta-llama-Llama-3.1-8B-Instruct.jinja | Llama 3.x |
-| meta-llama-Llama-3.2-11B-Vision-Instruct.jinja | Llama 3.x |
-| meta-llama-Llama-3.2-1B-Instruct.jinja | Llama 3.x |
-| meta-llama-Llama-3.2-3B-Instruct.jinja | Llama 3.x |
-| meta-llama-Llama-3.3-70B-Instruct.jinja | Llama 3.x |
-| meta-llama-Meta-Llama-3-8B-Instruct.jinja | Generic |
-| meta-llama-Meta-Llama-3.1-8B-Instruct.jinja | Llama 3.x |
-| microsoft-Phi-3-medium-4k-instruct.jinja | Generic |
-| microsoft-Phi-3-mini-4k-instruct.jinja | Generic |
-| microsoft-Phi-3-small-8k-instruct.jinja | Generic |
-| microsoft-Phi-3.5-mini-instruct.jinja | Generic |
-| microsoft-Phi-3.5-vision-instruct.jinja | Generic |
-| microsoft-phi-4.jinja | Generic |
-| migtissera-Tess-3-Mistral-Nemo-12B.jinja | Generic |
-| ministral-Ministral-3b-instruct.jinja | Generic |
-| mistralai-Codestral-22B-v0.1.jinja | Generic |
-| mistralai-Mistral-7B-Instruct-v0.1.jinja | Generic |
-| mistralai-Mistral-7B-Instruct-v0.2.jinja | Generic |
-| mistralai-Mistral-7B-Instruct-v0.3.jinja | Mistral Nemo |
-| mistralai-Mistral-Large-Instruct-2407.jinja | Mistral Nemo |
-| mistralai-Mistral-Large-Instruct-2411.jinja | Generic |
-| mistralai-Mistral-Nemo-Instruct-2407.jinja | Mistral Nemo |
-| mistralai-Mistral-Small-24B-Instruct-2501.jinja | Generic |
-| mistralai-Mixtral-8x7B-Instruct-v0.1.jinja | Generic |
-| mkurman-Qwen2.5-14B-DeepSeek-R1-1M.jinja | Hermes 2 Pro |
-| mlabonne-AlphaMonarch-7B.jinja | Generic |
-| mlx-community-Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1-float32.jinja | Hermes 2 Pro |
-| mlx-community-Qwen2.5-VL-7B-Instruct-8bit.jinja | Hermes 2 Pro |
-| mobiuslabsgmbh-DeepSeek-R1-ReDistill-Qwen-1.5B-v1.1.jinja | DeepSeek R1 (extract reasoning) |
-| netcat420-MFANNv0.20.jinja | Generic |
-| netcat420-MFANNv0.24.jinja | Generic |
-| netease-youdao-Confucius-o1-14B.jinja | Hermes 2 Pro |
-| nvidia-AceMath-7B-RM.jinja | Hermes 2 Pro |
-| nvidia-Eagle2-1B.jinja | Hermes 2 Pro |
-| nvidia-Eagle2-9B.jinja | Hermes 2 Pro |
-| nvidia-Llama-3.1-Nemotron-70B-Instruct-HF.jinja | Llama 3.x |
-| onnx-community-DeepSeek-R1-Distill-Qwen-1.5B-ONNX.jinja | DeepSeek R1 (extract reasoning) |
-| open-thoughts-OpenThinker-7B.jinja | Hermes 2 Pro |
-| openchat-openchat-3.5-0106.jinja | Generic |
-| pankajmathur-orca_mini_v6_8b.jinja | Generic |
-| princeton-nlp-Mistral-7B-Base-SFT-RDPO.jinja | Generic |
-| princeton-nlp-Mistral-7B-Instruct-DPO.jinja | Generic |
-| princeton-nlp-Mistral-7B-Instruct-RDPO.jinja | Generic |
-| prithivMLmods-Bellatrix-Tiny-1.5B-R1.jinja | Hermes 2 Pro |
-| prithivMLmods-Bellatrix-Tiny-1B-R1.jinja | Llama 3.x |
-| prithivMLmods-Bellatrix-Tiny-1B-v3.jinja | Generic |
-| prithivMLmods-Bellatrix-Tiny-3B-R1.jinja | Llama 3.x |
-| prithivMLmods-Blaze-14B-xElite.jinja | Generic |
-| prithivMLmods-Calcium-Opus-14B-Elite2-R1.jinja | Hermes 2 Pro |
-| prithivMLmods-Calme-Ties-78B.jinja | Generic |
-| prithivMLmods-Calme-Ties2-78B.jinja | Generic |
-| prithivMLmods-Calme-Ties3-78B.jinja | Generic |
-| prithivMLmods-ChemQwen2-vL.jinja | Generic |
-| prithivMLmods-GWQ2b.jinja | Generic |
-| prithivMLmods-LatexMind-2B-Codec.jinja | Generic |
-| prithivMLmods-Llama-3.2-6B-AlgoCode.jinja | Llama 3.x |
-| prithivMLmods-Megatron-Opus-14B-Exp.jinja | Hermes 2 Pro |
-| prithivMLmods-Megatron-Opus-14B-Stock.jinja | Hermes 2 Pro |
-| prithivMLmods-Megatron-Opus-7B-Exp.jinja | Hermes 2 Pro |
-| prithivMLmods-Omni-Reasoner-Merged.jinja | Hermes 2 Pro |
-| prithivMLmods-Omni-Reasoner4-Merged.jinja | Hermes 2 Pro |
-| prithivMLmods-Primal-Opus-14B-Optimus-v1.jinja | Hermes 2 Pro |
-| prithivMLmods-QwQ-Math-IO-500M.jinja | Hermes 2 Pro |
-| prithivMLmods-Qwen-7B-Distill-Reasoner.jinja | DeepSeek R1 (extract reasoning) |
-| prithivMLmods-Qwen2.5-1.5B-DeepSeek-R1-Instruct.jinja | Hermes 2 Pro |
-| prithivMLmods-Qwen2.5-14B-DeepSeek-R1-1M.jinja | Hermes 2 Pro |
-| prithivMLmods-Qwen2.5-32B-DeepSeek-R1-Instruct.jinja | Hermes 2 Pro |
-| prithivMLmods-Qwen2.5-7B-DeepSeek-R1-1M.jinja | Hermes 2 Pro |
-| prithivMLmods-Triangulum-v2-10B.jinja | Hermes 2 Pro |
-| qingy2024-Falcon3-2x10B-MoE-Instruct.jinja | Hermes 2 Pro |
-| rubenroy-Zurich-14B-GCv2-5m.jinja | Hermes 2 Pro |
-| rubenroy-Zurich-7B-GCv2-5m.jinja | Hermes 2 Pro |
-| silma-ai-SILMA-Kashif-2B-Instruct-v1.0.jinja | Generic |
-| simplescaling-s1-32B.jinja | Hermes 2 Pro |
-| sometimesanotion-Lamarck-14B-v0.7.jinja | Hermes 2 Pro |
-| sonthenguyen-zephyr-sft-bnb-4bit-DPO-mtbr-180steps.jinja | Generic |
-| sthenno-tempesthenno-icy-0130.jinja | Generic |
-| sumink-qwft.jinja | Hermes 2 Pro |
-| teknium-OpenHermes-2.5-Mistral-7B.jinja | Generic |
-| thirdeyeai-elevate360m.jinja | Generic |
-| tiiuae-Falcon3-10B-Instruct.jinja | Hermes 2 Pro |
-| unsloth-DeepSeek-R1-Distill-Llama-8B-unsloth-bnb-4bit.jinja | DeepSeek R1 (extract reasoning) |
-| unsloth-DeepSeek-R1-Distill-Llama-8B.jinja | DeepSeek R1 (extract reasoning) |
-| unsloth-DeepSeek-R1.jinja | DeepSeek R1 (extract reasoning) |
-| unsloth-Mistral-Small-24B-Instruct-2501-unsloth-bnb-4bit.jinja | Generic |
-| upstage-solar-pro-preview-instruct.jinja | Generic |
-| whyhow-ai-PatientSeek.jinja | Generic |
-| xwen-team-Xwen-72B-Chat.jinja | Hermes 2 Pro |
-| xwen-team-Xwen-7B-Chat.jinja | Hermes 2 Pro |
-
-This table can be generated with:
-
-```bash
-./build/bin/test-chat ../minja/build/tests/*.jinja 2>/dev/null
-```
-
-</details>
-
-# Usage - need tool-aware Jinja template
-
-First, start a server with any model, but make sure it has a tools-enabled template: you can verify this by inspecting the `chat_template` or `chat_template_tool_use` properties in `http://localhost:8080/props`).
-
-Here are some models known to work (w/ chat template override when needed):
-
-```shell
-# Native support:
-
-llama-server --jinja -fa -hf bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M
-llama-server --jinja -fa -hf bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q6_K_L
-llama-server --jinja -fa -hf bartowski/functionary-small-v3.2-GGUF:Q4_K_M
-llama-server --jinja -fa -hf bartowski/Llama-3.3-70B-Instruct-GGUF:Q4_K_M
-
-# Native support for DeepSeek R1 works best w/ our own template (official template buggy)
-
-llama-server --jinja -fa -hf bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q6_K_L \
--chat-template-file models/templates/llama-cpp-deepseek-r1.jinja
-
-llama-server --jinja -fa -hf bartowski/DeepSeek-R1-Distill-Qwen-32B-GGUF:Q4_K_M \
--chat-template-file models/templates/llama-cpp-deepseek-r1.jinja
-
-# Native support requires the right template for these GGUFs:
-
-llama-server --jinja -fa -hf bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M \
--chat-template-file <( python scripts/get_chat_template.py NousResearch/Hermes-2-Pro-Llama-3-8B tool_use )
-
-llama-server --jinja -fa -hf bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M \
--chat-template-file <( python scripts/get_chat_template.py NousResearch/Hermes-3-Llama-3.1-8B tool_use )
-
-llama-server --jinja -fa -hf bartowski/firefunction-v2-GGUF -hff firefunction-v2-IQ1_M.gguf \
--chat-template-file <( python scripts/get_chat_template.py fireworks-ai/llama-3-firefunction-v2 tool_use )
-
-llama-server --jinja -fa -hf bartowski/c4ai-command-r7b-12-2024-GGUF:Q6_K_L \
--chat-template-file <( python scripts/get_chat_template.py CohereForAI/c4ai-command-r7b-12-2024 tool_use )
-
-# Generic format support
-llama-server --jinja -fa -hf bartowski/phi-4-GGUF:Q4_0
-llama-server --jinja -fa -hf bartowski/gemma-2-2b-it-GGUF:Q8_0
-llama-server --jinja -fa -hf bartowski/c4ai-command-r-v01-GGUF:Q2_K
-```
-
-> [!TIP]
-> If there is no official `tool_use` Jinja template, you may want to set `--chat-template chatml` to use a default that works with many models (YMMV!), or write your own (e.g. we provide a custom [llama-cpp-deepseek-r1.jinja](../models/templates/llama-cpp-deepseek-r1.jinja) for DeepSeek R1 distills)
-
-Test in CLI (or with any library / software that can use OpenAI-compatible API backends):
-
-```bash
-curl http://localhost:8080/v1/chat/completions -d '{
-"model": "gpt-3.5-turbo",
-"tools": [
-    {
-    "type":"function",
-    "function":{
-        "name":"python",
-        "description":"Runs code in an ipython interpreter and returns the result of the execution after 60 seconds.",
-        "parameters":{
-        "type":"object",
-        "properties":{
-            "code":{
-            "type":"string",
-            "description":"The code to run in the ipython interpreter."
-            }
-        },
-        "required":["code"]
-        }
-    }
-    }
-],
-"messages": [
-    {
-    "role": "user",
-    "content": "Print a hello world message with python."
-    }
-]
-}'
-```
-
-<details>
-<summary>Show output</summary>
-
-```json
-{
-"choices": [
-    {
-    "finish_reason": "tool",
-    "index": 0,
-    "message": {
-        "content": null,
-        "tool_calls": [
-        {
-            "name": "python",
-            "arguments": "{\"code\":\" \\nprint(\\\"Hello, World!\\\")\"}"
-        }
-        ],
-        "role": "assistant"
-    }
-    }
-],
-"created": 1727287211,
-"model": "gpt-3.5-turbo",
-"object": "chat.completion",
-"usage": {
-    "completion_tokens": 16,
-    "prompt_tokens": 44,
-    "total_tokens": 60
-},
-"id": "chatcmpl-Htbgh9feMmGM0LEH2hmQvwsCxq3c6Ni8"
-}
-```
-
-</details>
--- a/docs/install.md
+++ b/docs/install.md
@@ -7,7 +7,7 @@ On Mac and Linux, the homebrew package manager can be used via
 ```sh
 brew install llama.cpp
 ```
-The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggml-org/llama.cpp/discussions/7668
+The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggerganov/llama.cpp/discussions/7668

 ## Nix

--- a/docs/llguidance.md
+++ b/docs/llguidance.md
@@ -13,15 +13,13 @@ cmake -B build -DLLAMA_LLGUIDANCE=ON
 make -C build -j
 ```

-For Windows use `cmake --build build --config Release` instead of `make`.
-
 This requires the Rust compiler and the `cargo` tool to be [installed](https://www.rust-lang.org/tools/install).

 ## Interface

 There are no new command-line arguments or modifications to `common_params`. When enabled, grammars starting with `%llguidance` are passed to LLGuidance instead of the [current](../grammars/README.md) llama.cpp grammars. Additionally, JSON Schema requests (e.g., using the `-j` argument in `llama-cli`) are also passed to LLGuidance.

-For your existing GBNF grammars, you can use [gbnf_to_lark.py script](https://github.com/guidance-ai/llguidance/blob/main/python/llguidance/gbnf_to_lark.py) to convert them to LLGuidance Lark-like format.
+For your existing GBNF grammars, you can use [gbnf_to_lark.py script](https://github.com/guidance-ai/llguidance/blob/main/scripts/gbnf_to_lark.py) to convert them to LLGuidance Lark-like format.

 ## Performance

--- a/examples/cvector-generator/README.md
+++ b/examples/cvector-generator/README.md
@@ -3,9 +3,9 @@
 This example demonstrates how to generate a control vector using gguf models.

 Related PRs:
- [Add support for control vectors](https://github.com/ggml-org/llama.cpp/pull/5970)
- (Issue) [Generate control vector using llama.cpp](https://github.com/ggml-org/llama.cpp/issues/6880)
- [Add cvector-generator example](https://github.com/ggml-org/llama.cpp/pull/7514)
+- [Add support for control vectors](https://github.com/ggerganov/llama.cpp/pull/5970)
+- (Issue) [Generate control vector using llama.cpp](https://github.com/ggerganov/llama.cpp/issues/6880)
+- [Add cvector-generator example](https://github.com/ggerganov/llama.cpp/pull/7514)

 ## Examples

--- a/examples/imatrix/README.md
+++ b/examples/imatrix/README.md
@@ -1,7 +1,7 @@
 # llama.cpp/examples/imatrix

-Compute an importance matrix for a model and given text dataset. Can be used during quantization to enhance the quality of the quantized models.
-More information is available here: https://github.com/ggml-org/llama.cpp/pull/4861
+Compute an importance matrix for a model and given text dataset. Can be used during quantization to enhance the quality of the quantized models.
+More information is available here: https://github.com/ggerganov/llama.cpp/pull/4861

 ## Usage

--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -3,7 +3,6 @@
 #include "log.h"
 #include "llama.h"

-#include <chrono>
 #include <cmath>
 #include <cstdio>
 #include <cstring>
@@ -100,7 +99,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
    const float * data = is_host ? (const float *) src1->data : m_src1_data.data();

    // this has been adapted to the new format of storing merged experts in a single 3d tensor
-    // ref: https://github.com/ggml-org/llama.cpp/pull/6387
+    // ref: https://github.com/ggerganov/llama.cpp/pull/6387
    if (t->op == GGML_OP_MUL_MAT_ID) {
        //   ids  -> [n_experts_used, n_tokens]
        //   src1 -> [cols, n_expert_used, n_tokens]
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -876,8 +876,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
 struct test {
    static const std::string build_commit;
    static const int         build_number;
-    const std::string        cpu_info;
-    const std::string        gpu_info;
+    static const std::string cpu_info;
+    static const std::string gpu_info;
    std::string              model_filename;
    std::string              model_type;
    uint64_t                 model_size;
@@ -903,10 +903,7 @@ struct test {
    std::string              test_time;
    std::vector<uint64_t>    samples_ns;

-    test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) :
-        cpu_info(get_cpu_info()),
-        gpu_info(get_gpu_info()) {
-
+    test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) {
        model_filename = inst.model;
        char buf[128];
        llama_model_desc(lmodel, buf, sizeof(buf));
@@ -1061,6 +1058,8 @@ struct test {

 const std::string test::build_commit = LLAMA_COMMIT;
 const int         test::build_number = LLAMA_BUILD_NUMBER;
+const std::string test::cpu_info     = get_cpu_info();
+const std::string test::gpu_info     = get_gpu_info();

 struct printer {
    virtual ~printer() {}
--- a/examples/llama.android/llama/src/main/cpp/CMakeLists.txt
+++ b/examples/llama.android/llama/src/main/cpp/CMakeLists.txt
@@ -14,7 +14,7 @@ project("llama-android")
 #include(FetchContent)
 #FetchContent_Declare(
 #        llama
-#        GIT_REPOSITORY https://github.com/ggml-org/llama.cpp
+#        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
 #        GIT_TAG        master
 #)

--- a/examples/llama.swiftui/README.md
+++ b/examples/llama.swiftui/README.md
@@ -3,9 +3,9 @@
 Local inference of llama.cpp on an iPhone. This is a sample app that can be used as a starting
 point for more advanced projects.

-For usage instructions and performance stats, check the following discussion: https://github.com/ggml-org/llama.cpp/discussions/4508
+For usage instructions and performance stats, check the following discussion: https://github.com/ggerganov/llama.cpp/discussions/4508

-![image](https://github.com/ggml-org/llama.cpp/assets/1991296/2b40284f-8421-47a2-b634-74eece09a299)
+![image](https://github.com/ggerganov/llama.cpp/assets/1991296/2b40284f-8421-47a2-b634-74eece09a299)

 Video demonstration:

--- a/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift
+++ b/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift
@@ -124,26 +124,15 @@ struct ContentView: View {
                    }
                }
            }.sheet(isPresented: $showingHelp) {    // Sheet for help modal
-                NavigationView {
+                VStack(alignment: .leading) {
                    VStack(alignment: .leading) {
-                        VStack(alignment: .leading) {
-                            Text("1. Make sure the model is in GGUF Format")
-                                    .padding()
-                            Text("2. Copy the download link of the quantized model")
-                                    .padding()
-                        }
-                        Spacer()
+                        Text("1. Make sure the model is in GGUF Format")
+                               .padding()
+                        Text("2. Copy the download link of the quantized model")
+                               .padding()
                    }
-                    .navigationTitle("Help")
-                    .navigationBarTitleDisplayMode(.inline)
-                    .toolbar {
-                        ToolbarItem(placement: .navigationBarTrailing) {
-                            Button("Done") {
-                                showingHelp = false
-                            }
-                        }
-                    }
-                }
+                    Spacer()
+                   }
            }
        }
    }
--- a/examples/llama.vim
+++ b/examples/llama.vim
@@ -39,7 +39,7 @@
 "
 "   :call llama#init()
 "
-" more info: https://github.com/ggml-org/llama.cpp/pull/9787
+" more info: https://github.com/ggerganov/llama.cpp/pull/9787
 "

 " colors (adjust to your liking)
--- a/examples/llava/README-granitevision.md
+++ b/examples/llava/README-granitevision.md
@@ -1,190 +0,0 @@
-# Granite Vision
-
-Download the model and point your `GRANITE_MODEL` environment variable to the path.
-
-```bash
-$ git clone https://huggingface.co/ibm-granite/granite-vision-3.2-2b
-$ export GRANITE_MODEL=./granite-vision-3.2-2b
-```
-
-
-### 1. Running llava surgery v2.
-First, we need to run the llava surgery script as shown below:
-
-`python llava_surgery_v2.py -C -m $GRANITE_MODEL`
-
-You should see two new files (`llava.clip` and `llava.projector`) written into your model's directory, as shown below.
-
-```bash
-$ ls $GRANITE_MODEL | grep -i llava
-llava.clip
-llava.projector
-```
-
-We should see that the projector and visual encoder get split out into the llava files. Quick check to make sure they aren't empty:
-```python
-import os
-import torch
-
-MODEL_PATH = os.getenv("GRANITE_MODEL")
-if not MODEL_PATH:
-    raise ValueError("env var GRANITE_MODEL is unset!")
-
-encoder_tensors = torch.load(os.path.join(MODEL_PATH, "llava.clip"))
-projector_tensors = torch.load(os.path.join(MODEL_PATH, "llava.projector"))
-
-assert len(encoder_tensors) > 0
-assert len(projector_tensors) > 0
-```
-
-If you actually inspect the `.keys()` of the loaded tensors, you should see a lot of `vision_model` tensors in the `encoder_tensors`, and 5 tensors (`'multi_modal_projector.linear_1.bias'`, `'multi_modal_projector.linear_1.weight'`, `'multi_modal_projector.linear_2.bias'`, `'multi_modal_projector.linear_2.weight'`, `'image_newline'`) in the multimodal `projector_tensors`.
-
-
-### 2. Creating the Visual Component GGUF
-Next, create a new directory to hold the visual components, and copy the llava.clip/projector files, as shown below.
-
-```bash
-$ ENCODER_PATH=$PWD/visual_encoder
-$ mkdir $ENCODER_PATH
-
-$ cp $GRANITE_MODEL/llava.clip $ENCODER_PATH/pytorch_model.bin
-$ cp $GRANITE_MODEL/llava.projector $ENCODER_PATH/
-```
-
-Now, we need to write a config for the visual encoder. In order to convert the model, be sure to use the correct `image_grid_pinpoints`, as these may vary based on the model. You can find the `image_grid_pinpoints` in `$GRANITE_MODEL/config.json`.
-
-```json
-{
-    "_name_or_path": "siglip-model",
-    "architectures": [
-      "SiglipVisionModel"
-    ],
-    "image_grid_pinpoints": [
-        [384,384],
-        [384,768],
-        [384,1152],
-        [384,1536],
-        [384,1920],
-        [384,2304],
-        [384,2688],
-        [384,3072],
-        [384,3456],
-        [384,3840],
-        [768,384],
-        [768,768],
-        [768,1152],
-        [768,1536],
-        [768,1920],
-        [1152,384],
-        [1152,768],
-        [1152,1152],
-        [1536,384],
-        [1536,768],
-        [1920,384],
-        [1920,768],
-        [2304,384],
-        [2688,384],
-        [3072,384],
-        [3456,384],
-        [3840,384]
-    ],
-    "mm_patch_merge_type": "spatial_unpad",
-    "hidden_size": 1152,
-    "image_size": 384,
-    "intermediate_size": 4304,
-    "model_type": "siglip_vision_model",
-    "num_attention_heads": 16,
-    "num_hidden_layers": 27,
-    "patch_size": 14,
-    "layer_norm_eps": 1e-6,
-    "hidden_act": "gelu_pytorch_tanh",
-    "projection_dim": 0,
-    "vision_feature_layer": [-24, -20, -12, -1]
-}
-```
-
-At this point you should have something like this:
-```bash
-$ ls $ENCODER_PATH
-config.json             llava.projector         pytorch_model.bin
-```
-
-Now convert the components to GGUF; Note that we also override the image mean/std dev to `[.5,.5,.5]` since we use the SigLIP visual encoder - in the transformers model, you can find these numbers in the `preprocessor_config.json`.
-```bash
-$ python convert_image_encoder_to_gguf.py \
-    -m $ENCODER_PATH \
-    --llava-projector $ENCODER_PATH/llava.projector \
-    --output-dir $ENCODER_PATH \
-    --clip-model-is-vision \
-    --clip-model-is-siglip \
-    --image-mean 0.5 0.5 0.5 \
-    --image-std 0.5 0.5 0.5
-```
-
-This will create the first GGUF file at `$ENCODER_PATH/mmproj-model-f16.gguf`; we will refer to the absolute path of this file as the `$VISUAL_GGUF_PATH.`
-
-
-### 3. Creating the LLM GGUF.
-The granite vision model contains a granite LLM as its language model. For now, the easiest way to get the GGUF for LLM is by loading the composite model in `transformers` and exporting the LLM so that it can be directly converted with the normal conversion path.
-
-First, set the `LLM_EXPORT_PATH` to the path to export the `transformers` LLM to.
-```bash
-$ export LLM_EXPORT_PATH=$PWD/granite_vision_llm
-```
-
-```python
-import os
-import transformers
-
-MODEL_PATH = os.getenv("GRANITE_MODEL")
-if not MODEL_PATH:
-    raise ValueError("env var GRANITE_MODEL is unset!")
-
-LLM_EXPORT_PATH = os.getenv("LLM_EXPORT_PATH")
-if not LLM_EXPORT_PATH:
-    raise ValueError("env var LLM_EXPORT_PATH is unset!")
-
-tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_PATH)
-
-# NOTE: granite vision support was added to transformers very recently (4.49);
-# if you get size mismatches, your version is too old.
-# If you are running with an older version, set `ignore_mismatched_sizes=True`
-# as shown below; it won't be loaded correctly, but the LLM part of the model that
-# we are exporting will be loaded correctly.
-model = transformers.AutoModelForImageTextToText.from_pretrained(MODEL_PATH, ignore_mismatched_sizes=True)
-
-tokenizer.save_pretrained(LLM_EXPORT_PATH)
-model.language_model.save_pretrained(LLM_EXPORT_PATH)
-```
-
-Now you can convert the exported LLM to GGUF with the normal converter in the root of the llama cpp project.
-```bash
-$ LLM_GGUF_PATH=$LLM_EXPORT_PATH/granite_llm.gguf
-...
-$ python convert_hf_to_gguf.py --outfile $LLM_GGUF_PATH $LLM_EXPORT_PATH
-```
-
-
-### 4. Quantization
-If you want to quantize the LLM, you can do so with `llama-quantize` as you would any other LLM. For example:
-```bash
-$ ./build/bin/llama-quantize $LLM_EXPORT_PATH/granite_llm.gguf $LLM_EXPORT_PATH/granite_llm_q4_k_m.gguf Q4_K_M
-$ LLM_GGUF_PATH=$LLM_EXPORT_PATH/granite_llm_q4_k_m.gguf
-```
-
-Note that currently you cannot quantize the visual encoder because granite vision models use SigLIP as the visual encoder, which has tensor dimensions that are not divisible by 32.
-
-
-### 5. Running the Model in Llama cpp
-Build llama cpp normally; you should have a target binary named `llama-llava-cli`, which you can pass two binaries to. As an example, we pass the the llama.cpp banner.
-
-```bash
-$ ./build/bin/llama-llava-cli -m $LLM_GGUF_PATH \
-    --mmproj $VISUAL_GGUF_PATH \
-    --image ./media/llama0-banner.png \
-    -c 16384 \
-    -p "<|system|>\nA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n<|user|>\n\<image>\nWhat does the text in this image say?\n<|assistant|>\n" \
-    --temp 0
-```
-
-Sample output: `The text in the image reads "LLAMA C++ Can it run DOOM Llama?"`
--- a/examples/llava/README-minicpmo2.6.md
+++ b/examples/llava/README-minicpmo2.6.md
@@ -26,7 +26,7 @@ python ./convert_hf_to_gguf.py ../MiniCPM-o-2_6/model
 ```

 Build llama.cpp using `CMake`:
-https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md
+https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md

 ```bash
 cmake -B build
--- a/examples/llava/README-minicpmv2.5.md
+++ b/examples/llava/README-minicpmv2.5.md
@@ -6,7 +6,7 @@ Download [MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-

 Clone llama.cpp:
 ```bash
-git clone https://github.com/ggml-org/llama.cpp
+git clone https://github.com/ggerganov/llama.cpp
 cd llama.cpp
 ```

--- a/examples/llava/README.md
+++ b/examples/llava/README.md
@@ -101,27 +101,8 @@ python ./examples/convert_legacy_llama.py ../llava-v1.6-vicuna-7b/ --skip-unknow
 ```

 **note** llava-1.6 needs more context than llava-1.5, at least 3000 is needed (just run it at -c 4096)
-
 **note** llava-1.6 greatly benefits from batched prompt processing (defaults work)

-**note** if the language model in step `6)` is incompatible with the legacy conversion script, the easiest way handle the LLM model conversion is to load the model in transformers, and export only the LLM from the llava next model.
-
-```python
-import os
-import transformers
-
-model_path = ...
-llm_export_path = ...
-
-tokenizer = transformers.AutoTokenizer.from_pretrained(model_path)
-model = transformers.AutoModelForImageTextToText.from_pretrained(model_path)
-
-tokenizer.save_pretrained(llm_export_path)
-model.language_model.save_pretrained(llm_export_path)
-```
-
-Then, you can convert the LLM using the `convert_hf_to_gguf.py` script, which handles more LLM architectures.
-
 ## llava-cli templating and llava-1.6 prompting

 llava-1.5 models all use the same vicuna prompt, here you can just add your image question like `-p "Provide a full description."`
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -40,7 +40,6 @@
 #include <map>
 #include <regex>
 #include <stdexcept>
-#include <unordered_set>
 #include <vector>
 #include <sstream>
 #include <cinttypes>
@@ -121,7 +120,6 @@ static std::string format(const char * fmt, ...) {
 #define KEY_IMAGE_MEAN          "clip.vision.image_mean"
 #define KEY_IMAGE_STD           "clip.vision.image_std"
 #define KEY_PROJ_TYPE           "clip.projector_type"
-#define KEY_FEATURE_LAYER       "clip.vision.feature_layer"

 #define KEY_MM_PATCH_MERGE_TYPE   "clip.vision.mm_patch_merge_type"
 #define KEY_IMAGE_GRID_PINPOINTS  "clip.vision.image_grid_pinpoints"
@@ -446,9 +444,8 @@ struct clip_hparams {

    char mm_patch_merge_type[32] = "flat"; // spatial_unpad or flat (default)

-    std::vector<int32_t> image_grid_pinpoints;
+    int32_t image_grid_pinpoints[32];
    int32_t image_crop_resolution;
-    std::unordered_set<int32_t> vision_feature_layer;
 };

 struct clip_layer {
@@ -588,7 +585,6 @@ struct clip_ctx {
    struct clip_vision_model vision_model;
    projector_type proj_type = PROJECTOR_TYPE_MLP;

-    int32_t max_feature_layer;
    float image_mean[3];
    float image_std[3];
    bool use_gelu = false;
@@ -655,6 +651,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
    const int hidden_size          = hparams.hidden_size;
    const int n_head               = hparams.n_head;
    const int d_head               = hidden_size / n_head;
+    int n_layer                    = hparams.n_layer;
    const float eps                = hparams.eps;
    int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};

@@ -755,19 +752,13 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
        embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b);
    }

-    std::vector<struct ggml_tensor *> embedding_stack;
-    const auto & vision_feature_layer = hparams.vision_feature_layer;
-
    // loop over layers
-    for (int il = 0; il < ctx->max_feature_layer; il++) {
+    if (ctx->has_minicpmv_projector || ctx->has_glm_projector || ctx->has_qwen2vl_merger) {
+        n_layer += 1;
+    }
+    for (int il = 0; il < n_layer - 1; il++) {
        struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states

-        // If this is an embedding feature layer, save the output.
-        // NOTE: 0 index here refers to the input to the encoder.
-        if (vision_feature_layer.find(il) != vision_feature_layer.end()) {
-            embedding_stack.push_back(embeddings);
-        }
-
        //const size_t nb_q_w = model.layers[il].q_w->nb[0];

        // layernorm1
@@ -855,6 +846,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
        cur = ggml_add(ctx0, embeddings, cur);

        embeddings = cur;
+
    }

    // post-layernorm
@@ -865,19 +857,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
        embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
    }

-    // final layer is a vision feature layer
-    if (vision_feature_layer.find(ctx->max_feature_layer) != vision_feature_layer.end()) {
-        embedding_stack.push_back(embeddings);
-    }
-
-    // If feature layers are explicitly set, stack them (if we have multiple)
-    if (!embedding_stack.empty()) {
-        embeddings = embedding_stack[0];
-        for (size_t i = 1; i < embedding_stack.size(); i++) {
-            embeddings = ggml_concat(ctx0, embeddings, embedding_stack[i], 0);
-        }
-    }
-
    // llava projector
    if (ctx->has_llava_projector) {
        embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
@@ -1464,26 +1443,14 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            int idx = get_key_idx(ctx, KEY_IMAGE_GRID_PINPOINTS);
            int n = gguf_get_arr_n(ctx, idx);
            const int32_t * pinpoints = (const int32_t *)gguf_get_arr_data(ctx, idx);
-            for (int i = 0; i < n; ++i) {
-                hparams.image_grid_pinpoints.push_back(pinpoints[i]);
+            for (int i = 0; i < 32 && i < n && pinpoints[i] != 0; ++i) {
+                hparams.image_grid_pinpoints[i] = pinpoints[i];
            }
-        } catch (std::runtime_error & /*e*/) { }
-
-        // Load the vision feature layer indices if they are explicitly provided;
-        // if multiple vision feature layers are present, the values will be concatenated
-        // to form the final visual features.
-        // NOTE: gguf conversions should standardize the values of the vision feature layer to
-        // be non-negative, since we use -1 to mark values as unset here.
-        try {
-            int idx = get_key_idx(ctx, KEY_FEATURE_LAYER);
-            int n = gguf_get_arr_n(ctx, idx);
-
-            const int32_t * vision_feature_layer = (const int32_t *)gguf_get_arr_data(ctx, idx);
-
-            for (int i = 0; i < n; ++i) {
-                hparams.vision_feature_layer.insert(vision_feature_layer[i]);
-            }
-        } catch (std::runtime_error & /*e*/) { }
+            if (n < 32)
+                hparams.image_grid_pinpoints[n] = 0;
+        } catch (std::runtime_error & /*e*/) {
+            hparams.image_grid_pinpoints[0]=0;
+        }

        try {
            int idx = get_key_idx(ctx, KEY_MM_PATCH_MERGE_TYPE);
@@ -1509,9 +1476,6 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            new_clip->image_std[i]  = std_data[i];
        }

-        // Calculate the deepest feature layer based on hparams and projector type
-        new_clip->max_feature_layer = get_deepest_feature_layer(new_clip);
-
        if (verbosity >= 2) {
            LOG_INF("\n%s: vision model hparams\n", __func__);
            LOG_INF("image_size         %d\n", hparams.image_size);
@@ -1525,13 +1489,8 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            LOG_INF("v_image_mean       %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
            LOG_INF("v_image_std        %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
            LOG_INF("v_image_grid_pinpoints: ");
-            for (const auto & pp : hparams.image_grid_pinpoints) {
-                LOG_INF("%d ", pp);
-            }
-            LOG_INF("\n");
-            LOG_INF("v_vision_feature_layer: ");
-            for (const auto & feature_layer: hparams.vision_feature_layer) {
-                LOG_INF("%d ", feature_layer);
+            for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) {
+                LOG_INF("%d ", hparams.image_grid_pinpoints[i]);
            }
            LOG_INF("\n");
            LOG_INF("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
@@ -1770,11 +1729,11 @@ void clip_image_f32_batch_free(struct clip_image_f32_batch  * batch) {
    }
 }

-void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, clip_image_u8 * img) {
+static void build_clip_img_from_data(const stbi_uc * data, int nx, int ny, clip_image_u8 * img) {
    img->nx = nx;
    img->ny = ny;
    img->buf.resize(3 * nx * ny);
-    memcpy(img->buf.data(), rgb_pixels, img->buf.size());
+    memcpy(img->buf.data(), data, img->buf.size());
 }

 bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
@@ -1784,7 +1743,7 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
        LOG_ERR("%s: failed to load image '%s'\n", __func__, fname);
        return false;
    }
-    clip_build_img_from_pixels(data, nx, ny, img);
+    build_clip_img_from_data(data, nx, ny, img);
    stbi_image_free(data);
    return true;
 }
@@ -1796,7 +1755,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
        LOG_ERR("%s: failed to decode image bytes\n", __func__);
        return false;
    }
-    clip_build_img_from_pixels(data, nx, ny, img);
+    build_clip_img_from_data(data, nx, ny, img);
    stbi_image_free(data);
    return true;
 }
@@ -2276,10 +2235,10 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
            }
        }
    } else {
-        if (!params.image_grid_pinpoints.empty()) {
+        if (params.image_grid_pinpoints[0] != 0) {
            // "spatial_unpad" with "anyres" processing for llava-1.6
            std::vector<std::pair<int, int>> possible_resolutions;
-            for (size_t i = 0; i < params.image_grid_pinpoints.size(); i+=2) {
+            for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) {
                possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]});
            }
            std::pair<int, int> best_resolution = select_best_resolution({img->nx, img->ny}, possible_resolutions);
@@ -2445,14 +2404,7 @@ const char * clip_patch_merge_type(const struct clip_ctx * ctx) {
 }

 const int32_t * clip_image_grid(const struct clip_ctx * ctx) {
-    if (ctx->vision_model.hparams.image_grid_pinpoints.size()) {
-        return &ctx->vision_model.hparams.image_grid_pinpoints.front();
-    }
-    return nullptr;
-}
-
-size_t get_clip_image_grid_size(const struct clip_ctx * ctx) {
-    return ctx->vision_model.hparams.image_grid_pinpoints.size();
+    return ctx->vision_model.hparams.image_grid_pinpoints;
 }

 int clip_n_patches(const struct clip_ctx * ctx) {
@@ -2760,13 +2712,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima

            if (!ctx->has_glm_projector) {
                struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
-                // The patches vector is used to get rows to index into the embeds with;
-                // we should skip dim 0 only if we have CLS to avoid going out of bounds
-                // when retrieving the rows.
-                int patch_offset = ctx->has_class_embedding ? 1 : 0;
                int* patches_data = (int*)malloc(ggml_nbytes(patches));
                for (int i = 0; i < num_patches; i++) {
-                    patches_data[i] = i + patch_offset;
+                    patches_data[i] = i + 1;
                }
                ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
                free(patches_data);
@@ -2977,28 +2925,6 @@ bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
    return ctx->has_qwen2vl_merger;
 }

-// Determine the number of encoder layers to iterate over
-int get_deepest_feature_layer(const struct clip_ctx * ctx) {
-    // Get the index of the second to last layer; this is the
-    // default for models that have a llava projector
-    const auto & hparams = ctx->vision_model.hparams;
-    int n_layer = hparams.n_layer - 1;
-    int deepest_feature_layer = -1;
-
-    // Handle other projectors; incrementing here indicates that we
-    // should use the last encoder layer for the vision features.
-    if (ctx->has_minicpmv_projector || ctx->has_glm_projector || ctx->has_qwen2vl_merger) {
-        n_layer += 1;
-    }
-
-    // If we set explicit vision feature layers, only go up to the deepest one
-    for (const auto & feature_layer : hparams.vision_feature_layer) {
-        if (feature_layer > deepest_feature_layer) {
-            deepest_feature_layer = feature_layer;
-        }
-    }
-    return deepest_feature_layer < 0 ? n_layer : deepest_feature_layer;
-}

 bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
    clip_image_f32 clip_img;
--- a/examples/llava/clip.h
+++ b/examples/llava/clip.h
@@ -55,7 +55,6 @@ CLIP_API int32_t clip_hidden_size(const struct clip_ctx * ctx);
 CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);

 CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
-CLIP_API size_t get_clip_image_grid_size(const struct clip_ctx * ctx);

 CLIP_API int clip_n_patches        (const struct clip_ctx * ctx);
 CLIP_API int clip_n_patches_by_img (const struct clip_ctx * ctx, struct clip_image_f32 * img);
@@ -74,12 +73,6 @@ CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
 CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch  * batch);
 CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);

-/**
- * Build image from pixels decoded by other libraries instead of stb_image.h for better performance.
- * The memory layout is RGBRGBRGB..., input buffer length must be 3*nx*ny bytes
- */
-CLIP_API void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img);
-
 CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);

 /** interpret bytes as an image file with length bytes_length, and use the result to populate img */
@@ -96,13 +89,11 @@ CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, cons
 CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);

 CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
-CLIP_API bool clip_is_glm(const struct clip_ctx * ctx);
 CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);

-CLIP_API int get_deepest_feature_layer(const struct clip_ctx * ctx);
-
 CLIP_API bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);

+CLIP_API bool clip_is_glm(const struct clip_ctx * ctx);

 #ifdef __cplusplus
 }
--- a/examples/llava/convert_image_encoder_to_gguf.py
+++ b/examples/llava/convert_image_encoder_to_gguf.py
@@ -6,7 +6,7 @@ import re
 import torch
 import numpy as np
 from gguf import *
-from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel, SiglipVisionModel
+from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel

 TEXT = "clip.text"
 VISION = "clip.vision"
@@ -37,18 +37,6 @@ def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_llava: b


 def get_tensor_name(name: str) -> str:
-    # Standardize the transformers llava next keys for
-    # image newline / mm projector with the classes in haotian-liu LLaVA
-    if name == "image_newline":
-        return "model.image_newline"
-    if name.startswith("multi_modal_projector"):
-        name = name.replace("multi_modal_projector", "mm")
-        if "linear_1" in name:
-            name = name.replace("linear_1", "0")
-        if "linear_2" in name:
-            name = name.replace("linear_2", "2")
-        return name
-
    if "projection" in name:
        return name
    if "mm_projector" in name:
@@ -95,14 +83,8 @@ ap.add_argument("--vision-only", action="store_true", required=False,
                help="Save a vision-only model. It can't be used to encode texts")
 ap.add_argument("--clip-model-is-vision", action="store_true", required=False,
                help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
-
-# Selectable visual encoders that are compatible with this script
-encoder_group = ap.add_mutually_exclusive_group()
-encoder_group.add_argument("--clip-model-is-openclip", action="store_true", required=False,
+ap.add_argument("--clip-model-is-openclip", action="store_true", required=False,
                help="The clip model is from openclip (for ViT-SO400M type))")
-encoder_group.add_argument("--clip-model-is-siglip", action="store_true", required=False,
-                help="the visual encoder is Siglip.")
-
 ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
 ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2"], default="mlp")
 ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
@@ -127,12 +109,7 @@ if args.use_f32:
 # output in the same directory as the model if output_dir is None
 dir_model = args.model_dir

-if (
-    args.clip_model_is_vision or
-    not os.path.exists(dir_model + "/vocab.json") or
-    args.clip_model_is_openclip or
-    args.clip_model_is_siglip
-):
+if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip:
    vocab = None
    tokens = None
 else:
@@ -160,10 +137,7 @@ ftype = 1
 if args.use_f32:
    ftype = 0

-if args.clip_model_is_siglip:
-    model = SiglipVisionModel.from_pretrained(dir_model)
-    processor = None
-elif args.clip_model_is_vision or args.clip_model_is_openclip:
+if args.clip_model_is_vision or args.clip_model_is_openclip:
    model = CLIPVisionModel.from_pretrained(dir_model)
    processor = None
 else:
@@ -213,71 +187,26 @@ else:
 if has_text_encoder:
    assert t_hparams is not None
    assert tokens is not None
-    if args.clip_model_is_siglip:
-        text_projection_dim = 0
-    else:
-        text_projection_dim = t_hparams.get("projection_dim", config["projection_dim"])
    # text_model hparams
    fout.add_uint32(k(KEY_CONTEXT_LENGTH, TEXT), t_hparams["max_position_embeddings"])
    fout.add_uint32(k(KEY_EMBEDDING_LENGTH, TEXT), t_hparams["hidden_size"])
    fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, TEXT), t_hparams["intermediate_size"])
-    fout.add_uint32("clip.text.projection_dim", text_projection_dim)
+    fout.add_uint32("clip.text.projection_dim", t_hparams.get("projection_dim", config["projection_dim"]))
    fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, TEXT), t_hparams["num_attention_heads"])
    fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, TEXT), t_hparams["layer_norm_eps"])
    fout.add_uint32(k(KEY_BLOCK_COUNT, TEXT), t_hparams["num_hidden_layers"])
    fout.add_token_list(tokens)

-
-
-def get_non_negative_vision_feature_layers(v_hparams):
-    """
-    Determine the vision feature layer(s) for the llava model, which are indices into the
-    hidden states of the visual encoder. Note that the hidden states array generally takes the
-    form:
-
-        [<emb input>, <output of enc block 0>, ... <output of enc block num_hidden_layers>]
-
-    so feature indices should be offset as n+1 to get the output of encoder block n.
-    We convert all vision feature layers to non-negative so that -1 can be used in
-    the model as an unset value. If no vision feature layer is found, we leave it unset.
-    """
-    num_hidden_layers = v_hparams["num_hidden_layers"]
-    to_non_negative = lambda layer_idx: layer_idx  if layer_idx >= 0 else num_hidden_layers + layer_idx + 1
-    feature_layers_key = None
-    # Key used for llava models in transformers
-    if "vision_feature_layer" in config:
-        feature_layers_key = "vision_feature_layer"
-    # Key used for llava models in the original format
-    elif "mm_vision_select_layer" in config:
-        feature_layers_key = "mm_vision_select_layer"
-    if feature_layers_key is not None:
-        feature_layers = config[feature_layers_key]
-        if isinstance(feature_layers, int):
-            feature_layers = [feature_layers]
-        return [to_non_negative(feature_layer) for feature_layer in feature_layers]
-
-# Determine if we have explicitly specified vision feature layers in our config
-feature_layers = get_non_negative_vision_feature_layers(v_hparams)
-
 if has_vision_encoder:
-    # Siglip does not have a visual projector; set projection dim to 0
-    if args.clip_model_is_siglip:
-        visual_projection_dim = 0
-    else:
-        visual_projection_dim = v_hparams.get("projection_dim", config["projection_dim"])
-
-    # set vision_model hparams
+    # vision_model hparams
    fout.add_uint32("clip.vision.image_size", v_hparams["image_size"])
    fout.add_uint32("clip.vision.patch_size", v_hparams["patch_size"])
    fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), v_hparams["hidden_size"])
    fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), v_hparams["intermediate_size"])
-    fout.add_uint32("clip.vision.projection_dim", visual_projection_dim)
+    fout.add_uint32("clip.vision.projection_dim", v_hparams.get("projection_dim", config["projection_dim"]))
    fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), v_hparams["num_attention_heads"])
    fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), v_hparams["layer_norm_eps"])
-    if feature_layers:
-        block_count = max(feature_layers)
-    else:
-        block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"]
+    block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"]
    fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count)
                            #     /**
                            #      "image_grid_pinpoints": [
@@ -329,8 +258,7 @@ if has_vision_encoder:
        fout.add_string("clip.vision.mm_patch_merge_type", v_hparams["mm_patch_merge_type"])
    if "mm_projector_type" in v_hparams:
        fout.add_string("clip.vision.mm_projector_type", v_hparams["mm_projector_type"])
-    if feature_layers:
-        fout.add_array("clip.vision.feature_layer", feature_layers)
+

    if processor is not None:
        image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean  # pyright: ignore[reportAttributeAccessIssue]
@@ -346,13 +274,7 @@ fout.add_bool("clip.use_gelu", use_gelu)


 if has_llava_projector:
-    # By default, we drop the last layer for llava projector
-    # models unless we have explicitly set vision feature layers
-    if feature_layers is None:
-        model.vision_model.encoder.layers.pop(-1)
-    else:
-        model.vision_model.encoder.layers = model.vision_model.encoder.layers[:max(feature_layers)]
-
+    model.vision_model.encoder.layers.pop(-1)
    projector = torch.load(args.llava_projector)
    for name, data in projector.items():
        name = get_tensor_name(name)
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -353,10 +353,9 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
        LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);

        const int32_t * image_grid = clip_image_grid(ctx_clip);
-        const size_t num_gridpoints = get_clip_image_grid_size(ctx_clip);

        std::vector<std::pair<int, int>> grid_pinpoints;
-        for (size_t i = 0; i < num_gridpoints; i += 2) {
+        for (int i = 0; i < 32 && image_grid[i] != 0; i += 2) {
            grid_pinpoints.push_back({image_grid[i], image_grid[i+1]});
        }

@@ -406,8 +405,7 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx *
 }

 bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
-    // Granite vision uses up to 10 patches + base patch
-    int num_max_patches = 11;
+    int num_max_patches = 6;
    if (clip_is_minicpmv(ctx_clip)) {
        num_max_patches = 10;
    }
--- a/examples/llava/llava_surgery_v2.py
+++ b/examples/llava/llava_surgery_v2.py
@@ -33,33 +33,6 @@ def save_model(model, file_path, file_type):
    else:
        torch.save(model, file_path)

-# Helpers to match weight names from specific components or
-# determine if a saved shard contains that component
-def is_vision_tower(weight_name):
-    return (
-        weight_name.startswith("model.vision_tower") or
-        weight_name.startswith("vit.") or
-        weight_name.startswith("vision_tower")
-    )
-
-def is_newline(weight_name):
-    return (
-        weight_name.startswith("model.image_newline") or
-        weight_name.startswith("image_newline")
-    )
-
-def is_mm_projector(weight_name):
-    return (
-        weight_name.startswith("model.mm_projector") or
-        weight_name.startswith("vision_proj.") or
-        weight_name.startswith("multi_modal_projector")
-    )
-
-def newline_criteria(checkpoint):
-    return any(is_newline(k) for k in checkpoint.keys())
-
-def proj_criteria(checkpoint):
-    return any(is_mm_projector(k) for k in checkpoint.keys())

 # Adapted function to clean vision tower from checkpoint
 def clean_vision_tower_from_checkpoint(checkpoint_path):
@@ -67,7 +40,7 @@ def clean_vision_tower_from_checkpoint(checkpoint_path):
    # file_type = 'pytorch'
    model_path = os.path.dirname(checkpoint_path)
    print(f"Searching for vision tower tensors in {checkpoint_path}")
-    clip_tensors = [k for k, v in checkpoint.items() if is_vision_tower(k)]
+    clip_tensors = [k for k, v in checkpoint.items() if (k.startswith("model.vision_tower") or k.startswith("vit."))]

    if len(clip_tensors) > 0:
        print(f"Found {len(clip_tensors)} tensors to extract from {checkpoint_path}")
@@ -111,6 +84,12 @@ def find_relevant_checkpoints(checkpoint_paths, newline_criteria, projector):

    return newline_checkpoint_path, projector_checkpoint_path

+def newline_criteria(checkpoint):  # True when any tensor name carries the image-newline prefix
+    return any(name.startswith("model.image_newline") for name in checkpoint.keys())
+
+def proj_criteria(checkpoint):  # True when any tensor name carries one of the projector prefixes
+    return any(name.startswith(("model.mm_projector", "vision_proj.")) for name in checkpoint.keys())
+

 # Command-line interface setup
 ap = argparse.ArgumentParser()
@@ -144,14 +123,14 @@ first_checkpoint = None
 if newline_checkpoint_path is not None:
    print(f"Taking newline from {newline_checkpoint_path}")
    first_checkpoint, file_type = load_model(newline_checkpoint_path)
-    first_mm_tensors = [k for k, v in first_checkpoint.items() if is_newline(k)]
+    first_mm_tensors = [k for k, v in first_checkpoint.items() if k.startswith("model.image_newline")]

 # Load the checkpoint
 mm_tensors = []
 last_checkpoint = None
 if projector_checkpoint_path is not None:
    last_checkpoint, file_type = load_model(projector_checkpoint_path)
-    mm_tensors = [k for k, v in last_checkpoint.items() if is_mm_projector(k)]
+    mm_tensors = [k for k, v in last_checkpoint.items() if k.startswith("model.mm_projector") or k.startswith("vision_proj.")]

 if len(mm_tensors) == 0:
    if last_checkpoint is not None:
@@ -176,5 +155,5 @@ if len(projector) > 0:
    save_model(projector, f"{args.model}/llava.projector", 'pytorch')

 print("Done!")
-print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.")
+print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.")
 print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.")
--- a/examples/lookahead/README.md
+++ b/examples/lookahead/README.md
@@ -4,4 +4,4 @@ Demonstration of lookahead decoding technique:

 https://lmsys.org/blog/2023-11-21-lookahead-decoding/

-More info: https://github.com/ggml-org/llama.cpp/pull/4207
+More info: https://github.com/ggerganov/llama.cpp/pull/4207
--- a/examples/lookup/README.md
+++ b/examples/lookup/README.md
@@ -8,5 +8,5 @@ The key parameters for lookup decoding are `ngram_min`, `ngram_max` and `n_draft

 More info:

-https://github.com/ggml-org/llama.cpp/pull/4484
-https://github.com/ggml-org/llama.cpp/issues/4226
+https://github.com/ggerganov/llama.cpp/pull/4484
+https://github.com/ggerganov/llama.cpp/issues/4226
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -1,6 +1,6 @@
 # llama.cpp/examples/main

-This example program allows you to use various LLaMA language models easily and efficiently. It is specifically designed to work with the [llama.cpp](https://github.com/ggml-org/llama.cpp) project, which provides a plain C/C++ implementation with optional 4-bit quantization support for faster, lower memory inference, and is optimized for desktop CPUs. This program can be used to perform various inference tasks with LLaMA models, including generating text based on user-provided prompts and chat-like interactions with reverse prompts.
+This example program allows you to use various LLaMA language models easily and efficiently. It is specifically designed to work with the [llama.cpp](https://github.com/ggerganov/llama.cpp) project, which provides a plain C/C++ implementation with optional 4-bit quantization support for faster, lower memory inference, and is optimized for desktop CPUs. This program can be used to perform various inference tasks with LLaMA models, including generating text based on user-provided prompts and chat-like interactions with reverse prompts.

 ## Table of Contents

@@ -37,7 +37,7 @@ Once downloaded, place your model in the models folder in llama.cpp.

 ##### Infinite text from a starting prompt (you can use `Ctrl-C` to stop it):
 ```bash
-./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
+./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
 ```

 ### Windows:
@@ -121,7 +121,7 @@ When --in-prefix or --in-suffix options are enabled the chat template ( --chat-t

 ### Chat templates

- `--chat-template JINJA_TEMPLATE`: This option sets a custom jinja chat template. It accepts a string, not a file name.  Default: template taken from model's metadata. Llama.cpp only supports [some pre-defined templates](https://github.com/ggml-org/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template). These include llama2, llama3, gemma, monarch, chatml, orion, vicuna, vicuna-orca, deepseek, command-r, zephyr. When --in-prefix or --in-suffix options are enabled the chat template ( --chat-template ) is disabled.
+ `--chat-template JINJA_TEMPLATE`: This option sets a custom jinja chat template. It accepts a string, not a file name.  Default: template taken from model's metadata. Llama.cpp only supports [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template). These include llama2, llama3, gemma, monarch, chatml, orion, vicuna, vicuna-orca, deepseek, command-r, zephyr. When --in-prefix or --in-suffix options are enabled the chat template ( --chat-template ) is disabled.

 Example usage: `--chat-template gemma`

@@ -265,14 +265,6 @@ Being experimental and unique, XTC is disabled by default. The recommended combi

 Example usage: `--xtc-probability 0.5 --xtc-threshold 0.1`

-### Top-nσ Sampling
-
-   `--top-nsigma N`: Limit the next token selection to a subset of tokens with pre-softmax logits that are within n * σ less than the max logit (default: -1, -1 = disabled).
-
-Top-nσ sampling is a text generation method that selects tokens based on a statistical threshold in pre-softmax logits. It works by only sampling from tokens with logits that are within n * σ of the maximum logit. This method helps maintain a stable sampling space regardless of temperature scaling, allowing it to perform well on reasoning tasks even in high temperatures. Without complex probability manipulation, it efficiently filters tokens directly on the pre-softmax logits. A higher value for top-nsigma (e.g., 5) will take more noisy tokens into consideration, while a lower value (e.g., 1) will focous on the more informative region of the sampling space.
-
-Example usage: `--top-nsigma 1`
-
 ### Logit Bias

 -   `-l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS`: Modify the likelihood of a token appearing in the generated text completion.
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -4,7 +4,7 @@
 #include "log.h"
 #include "sampling.h"
 #include "llama.h"
-#include "chat.h"
+#include "chat-template.hpp"

 #include <cstdio>
 #include <cstring>
@@ -31,6 +31,8 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

+static const char * DEFAULT_SYSTEM_MESSAGE = "You are a helpful assistant";
+
 static llama_context           ** g_ctx;
 static llama_model             ** g_model;
 static common_sampler          ** g_smpl;
@@ -156,7 +158,7 @@ int main(int argc, char ** argv) {
    }

    const llama_vocab * vocab = llama_model_get_vocab(model);
-    auto chat_templates = common_chat_templates_init(model, params.chat_template);
+    auto chat_templates = common_chat_templates_from_model(model, params.chat_template);

    LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);

@@ -199,7 +201,7 @@ int main(int argc, char ** argv) {
    }

    // auto enable conversation mode if chat template is available
-    const bool has_chat_template = common_chat_templates_was_explicit(chat_templates.get());
+    const bool has_chat_template = chat_templates.has_explicit_template && chat_templates.template_default;
    if (params.conversation_mode == COMMON_CONVERSATION_MODE_AUTO) {
        if (has_chat_template) {
            LOG_INF("%s: chat template is available, enabling conversation mode (disable it with -no-cnv)\n", __func__);
@@ -217,11 +219,7 @@ int main(int argc, char ** argv) {
    // print chat template example in conversation mode
    if (params.conversation_mode) {
        if (params.enable_chat_template) {
-            if (!params.prompt.empty()) {
-                LOG_WRN("*** User-specified prompt in conversation mode will be ignored, did you mean to set --system-prompt (-sys) instead?\n");
-            }
-
-            LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(chat_templates.get(), params.use_jinja).c_str());
+            LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(*chat_templates.template_default, params.use_jinja).c_str());
        } else {
            LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
        }
@@ -265,32 +263,20 @@ int main(int argc, char ** argv) {

    std::vector<llama_token> embd_inp;

-    bool waiting_for_first_input = params.conversation_mode && params.enable_chat_template && params.system_prompt.empty();
    auto chat_add_and_format = [&chat_msgs, &chat_templates](const std::string & role, const std::string & content) {
-        common_chat_msg new_msg;
-        new_msg.role = role;
-        new_msg.content = content;
-        auto formatted = common_chat_format_single(chat_templates.get(), chat_msgs, new_msg, role == "user", g_params->use_jinja);
-        chat_msgs.push_back(new_msg);
+        common_chat_msg new_msg{role, content, {}};
+        auto formatted = common_chat_format_single(*chat_templates.template_default, chat_msgs, new_msg, role == "user", g_params->use_jinja);
+        chat_msgs.push_back({role, content, {}});
        LOG_DBG("formatted: '%s'\n", formatted.c_str());
        return formatted;
    };

    {
-        std::string prompt;
-
-        if (params.conversation_mode && params.enable_chat_template) {
-            // format the system prompt in conversation mode (will use template default if empty)
-            prompt = params.system_prompt;
-
-            if (!prompt.empty()) {
-                prompt = chat_add_and_format("system", prompt);
-            }
-        } else {
+        auto prompt = (params.conversation_mode && params.enable_chat_template)
+            // format the system prompt in conversation mode (fallback to default if empty)
+            ? chat_add_and_format("system", params.prompt.empty() ? DEFAULT_SYSTEM_MESSAGE : params.prompt)
            // otherwise use the prompt as is
-            prompt = params.prompt;
-        }
-
+            : params.prompt;
        if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
            LOG_DBG("tokenize the prompt\n");
            embd_inp = common_tokenize(ctx, prompt, true, true);
@@ -304,7 +290,7 @@ int main(int argc, char ** argv) {
    }

    // Should not run without any tokens
-    if (!params.conversation_mode && embd_inp.empty()) {
+    if (embd_inp.empty()) {
        if (add_bos) {
            embd_inp.push_back(llama_vocab_bos(vocab));
            LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
@@ -488,8 +474,8 @@ int main(int argc, char ** argv) {
        LOG_INF(       " - Press Ctrl+C to interject at any time.\n");
 #endif
        LOG_INF(       "%s", control_message);
-        if (params.conversation_mode && params.enable_chat_template && params.system_prompt.empty()) {
-            LOG_INF(   " - Not using system message. To change it, set a different value via -sys PROMPT\n");
+        if (params.conversation_mode && params.enable_chat_template && params.prompt.empty()) {
+            LOG_INF(   " - Using default system message. To change it, set a different value via -p PROMPT or -f FILE argument.\n");
        }
        LOG_INF("\n");

@@ -769,14 +755,11 @@ int main(int argc, char ** argv) {

                // check for reverse prompt using special tokens
                llama_token last_token = common_sampler_last(smpl);
-                for (auto token : antiprompt_token) {
-                    if (token == last_token) {
-                        if (params.interactive) {
-                            is_interacting = true;
-                        }
-                        is_antiprompt = true;
-                        break;
+                if (std::find(antiprompt_token.begin(), antiprompt_token.end(), last_token) != antiprompt_token.end()) {
+                    if (params.interactive) {
+                        is_interacting = true;
                    }
+                    is_antiprompt = true;
                }

                if (is_antiprompt) {
@@ -785,7 +768,7 @@ int main(int argc, char ** argv) {
            }

            // deal with end of generation tokens in interactive mode
-            if (!waiting_for_first_input && llama_vocab_is_eog(vocab, common_sampler_last(smpl))) {
+            if (llama_vocab_is_eog(vocab, common_sampler_last(smpl))) {
                LOG_DBG("found an EOG token\n");

                if (params.interactive) {
@@ -805,12 +788,12 @@ int main(int argc, char ** argv) {
            }

            // if current token is not EOG, we add it to current assistant message
-            if (params.conversation_mode && !waiting_for_first_input) {
+            if (params.conversation_mode) {
                const auto id = common_sampler_last(smpl);
                assistant_ss << common_token_to_piece(ctx, id, false);
            }

-            if ((n_past > 0 || waiting_for_first_input) && is_interacting) {
+            if (n_past > 0 && is_interacting) {
                LOG_DBG("waiting for user input\n");

                if (params.conversation_mode) {
@@ -900,12 +883,11 @@ int main(int argc, char ** argv) {
                input_echo = false; // do not echo this again
            }

-            if (n_past > 0 || waiting_for_first_input) {
+            if (n_past > 0) {
                if (is_interacting) {
                    common_sampler_reset(smpl);
                }
                is_interacting = false;
-                waiting_for_first_input = false;
            }
        }

--- a/examples/passkey/README.md
+++ b/examples/passkey/README.md
@@ -5,8 +5,8 @@ models ability to recall information from long contexts.

 See the following PRs for more info:

- https://github.com/ggml-org/llama.cpp/pull/3856
- https://github.com/ggml-org/llama.cpp/pull/4810
+- https://github.com/ggerganov/llama.cpp/pull/3856
+- https://github.com/ggerganov/llama.cpp/pull/4810

 ### Usage

--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -3,7 +3,6 @@
 #include "log.h"
 #include "llama.h"

-#include <chrono>
 #include <algorithm>
 #include <array>
 #include <atomic>
--- a/examples/pydantic_models_to_grammar_examples.py
+++ b/examples/pydantic_models_to_grammar_examples.py
@@ -23,7 +23,7 @@ def create_completion(host, prompt, gbnf_grammar):
    """Calls the /completion API on llama-server.

    See
-    https://github.com/ggml-org/llama.cpp/tree/HEAD/examples/server#api-endpoints
+    https://github.com/ggerganov/llama.cpp/tree/HEAD/examples/server#api-endpoints
    """
    print(f"  Request:\n    Grammar:\n{textwrap.indent(gbnf_grammar, '      ')}\n    Prompt:\n{textwrap.indent(prompt.rstrip(), '      ')}")
    headers = {"Content-Type": "application/json"}
--- a/examples/quantize/README.md
+++ b/examples/quantize/README.md
@@ -69,22 +69,22 @@ Several quantization methods are supported. They differ in the resulting model d
 |   13B | ms/tok @ 8th |      - |     73 |     82 |     98 |    105 |    128 |
 |   13B | bits/weight  |   16.0 |    4.5 |    5.0 |    5.5 |    6.0 |    8.5 |

- [k-quants](https://github.com/ggml-org/llama.cpp/pull/1684)
+- [k-quants](https://github.com/ggerganov/llama.cpp/pull/1684)
 - recent k-quants improvements and new i-quants
-  - [#2707](https://github.com/ggml-org/llama.cpp/pull/2707)
-  - [#2807](https://github.com/ggml-org/llama.cpp/pull/2807)
-  - [#4773 - 2-bit i-quants (inference)](https://github.com/ggml-org/llama.cpp/pull/4773)
-  - [#4856 - 2-bit i-quants (inference)](https://github.com/ggml-org/llama.cpp/pull/4856)
-  - [#4861 - importance matrix](https://github.com/ggml-org/llama.cpp/pull/4861)
-  - [#4872 - MoE models](https://github.com/ggml-org/llama.cpp/pull/4872)
-  - [#4897 - 2-bit quantization](https://github.com/ggml-org/llama.cpp/pull/4897)
-  - [#4930 - imatrix for all k-quants](https://github.com/ggml-org/llama.cpp/pull/4930)
-  - [#4951 - imatrix on the GPU](https://github.com/ggml-org/llama.cpp/pull/4957)
-  - [#4969 - imatrix for legacy quants](https://github.com/ggml-org/llama.cpp/pull/4969)
-  - [#4996 - k-quants tuning](https://github.com/ggml-org/llama.cpp/pull/4996)
-  - [#5060 - Q3_K_XS](https://github.com/ggml-org/llama.cpp/pull/5060)
-  - [#5196 - 3-bit i-quants](https://github.com/ggml-org/llama.cpp/pull/5196)
-  - [quantization tuning](https://github.com/ggml-org/llama.cpp/pull/5320), [another one](https://github.com/ggml-org/llama.cpp/pull/5334), and [another one](https://github.com/ggml-org/llama.cpp/pull/5361)
+  - [#2707](https://github.com/ggerganov/llama.cpp/pull/2707)
+  - [#2807](https://github.com/ggerganov/llama.cpp/pull/2807)
+  - [#4773 - 2-bit i-quants (inference)](https://github.com/ggerganov/llama.cpp/pull/4773)
+  - [#4856 - 2-bit i-quants (inference)](https://github.com/ggerganov/llama.cpp/pull/4856)
+  - [#4861 - importance matrix](https://github.com/ggerganov/llama.cpp/pull/4861)
+  - [#4872 - MoE models](https://github.com/ggerganov/llama.cpp/pull/4872)
+  - [#4897 - 2-bit quantization](https://github.com/ggerganov/llama.cpp/pull/4897)
+  - [#4930 - imatrix for all k-quants](https://github.com/ggerganov/llama.cpp/pull/4930)
+  - [#4951 - imatrix on the GPU](https://github.com/ggerganov/llama.cpp/pull/4957)
+  - [#4969 - imatrix for legacy quants](https://github.com/ggerganov/llama.cpp/pull/4969)
+  - [#4996 - k-quants tuning](https://github.com/ggerganov/llama.cpp/pull/4996)
+  - [#5060 - Q3_K_XS](https://github.com/ggerganov/llama.cpp/pull/5060)
+  - [#5196 - 3-bit i-quants](https://github.com/ggerganov/llama.cpp/pull/5196)
+  - [quantization tuning](https://github.com/ggerganov/llama.cpp/pull/5320), [another one](https://github.com/ggerganov/llama.cpp/pull/5334), and [another one](https://github.com/ggerganov/llama.cpp/pull/5361)

 **Llama 2 7B**

--- a/examples/retrieval/README.md
+++ b/examples/retrieval/README.md
@@ -3,7 +3,7 @@
 Demonstration of simple retrieval technique based on cosine similarity

 More info:
-https://github.com/ggml-org/llama.cpp/pull/6193
+https://github.com/ggerganov/llama.cpp/pull/6193

 ### How to use

--- a/examples/run/linenoise.cpp/linenoise.cpp
+++ b/examples/run/linenoise.cpp/linenoise.cpp
--- a/examples/run/linenoise.cpp/linenoise.h
+++ b/examples/run/linenoise.cpp/linenoise.h
@@ -47,27 +47,27 @@ extern "C" {
 #include <stddef.h> /* For size_t. */
 #include <stdlib.h>

-extern const char * linenoiseEditMore;
+extern const char *linenoiseEditMore;

 /* The linenoiseState structure represents the state during line editing.
 * We pass this state to functions implementing specific editing
 * functionalities. */
 struct linenoiseState {
-    int          in_completion;  /* The user pressed TAB and we are now in completion
+    int in_completion;  /* The user pressed TAB and we are now in completion
                         * mode, so input is handled by completeLine(). */
-    size_t       completion_idx; /* Index of next completion to propose. */
-    int          ifd;            /* Terminal stdin file descriptor. */
-    int          ofd;            /* Terminal stdout file descriptor. */
-    char *       buf;            /* Edited line buffer. */
-    size_t       buflen;         /* Edited line buffer size. */
-    const char * prompt;         /* Prompt to display. */
-    size_t       plen;           /* Prompt length. */
-    size_t       pos;            /* Current cursor position. */
-    size_t       oldcolpos;      /* Previous refresh cursor column position. */
-    size_t       len;            /* Current edited line length. */
-    size_t       cols;           /* Number of columns in terminal. */
-    size_t       oldrows;        /* Rows used by last refreshed line (multiline mode) */
-    int          history_index;  /* The history index we are currently editing. */
+    size_t completion_idx; /* Index of next completion to propose. */
+    int ifd;            /* Terminal stdin file descriptor. */
+    int ofd;            /* Terminal stdout file descriptor. */
+    char *buf;          /* Edited line buffer. */
+    size_t buflen;      /* Edited line buffer size. */
+    const char *prompt; /* Prompt to display. */
+    size_t plen;        /* Prompt length. */
+    size_t pos;         /* Current cursor position. */
+    size_t oldpos;      /* Previous refresh cursor position. */
+    size_t len;         /* Current edited line length. */
+    size_t cols;        /* Number of columns in terminal. */
+    size_t oldrows;     /* Rows used by last refreshed line (multiline mode) */
+    int history_index;  /* The history index we are currently editing. */
 };

 struct linenoiseCompletions {
@@ -89,20 +89,19 @@ struct linenoiseCompletions {
 };

 /* Non blocking API. */
-int          linenoiseEditStart(struct linenoiseState * l, int stdin_fd, int stdout_fd, char * buf, size_t buflen,
-                                const char * prompt);
-const char * linenoiseEditFeed(struct linenoiseState * l);
-void         linenoiseEditStop(struct linenoiseState * l);
-void         linenoiseHide(struct linenoiseState * l);
-void         linenoiseShow(struct linenoiseState * l);
+int linenoiseEditStart(struct linenoiseState *l, int stdin_fd, int stdout_fd, char *buf, size_t buflen, const char *prompt);
+const char *linenoiseEditFeed(struct linenoiseState *l);
+void linenoiseEditStop(struct linenoiseState *l);
+void linenoiseHide(struct linenoiseState *l);
+void linenoiseShow(struct linenoiseState *l);

 /* Blocking API. */
-const char * linenoise(const char * prompt);
-void         linenoiseFree(void * ptr);
+const char *linenoise(const char *prompt);
+void linenoiseFree(void *ptr);

 /* Completion API. */
 typedef void(linenoiseCompletionCallback)(const char *, linenoiseCompletions *);
-typedef const char *(linenoiseHintsCallback) (const char *, int * color, int * bold);
+typedef const char*(linenoiseHintsCallback)(const char *, int *color, int *bold);
 typedef void(linenoiseFreeHintsCallback)(const char *);
 void linenoiseSetCompletionCallback(linenoiseCompletionCallback *);
 void linenoiseSetHintsCallback(linenoiseHintsCallback *);
@@ -110,10 +109,10 @@ void linenoiseSetFreeHintsCallback(linenoiseFreeHintsCallback *);
 void linenoiseAddCompletion(linenoiseCompletions *, const char *);

 /* History API. */
-int linenoiseHistoryAdd(const char * line);
+int linenoiseHistoryAdd(const char *line);
 int linenoiseHistorySetMaxLen(int len);
-int linenoiseHistorySave(const char * filename);
-int linenoiseHistoryLoad(const char * filename);
+int linenoiseHistorySave(const char *filename);
+int linenoiseHistoryLoad(const char *filename);

 /* Other utilities. */
 void linenoiseClearScreen(void);
@@ -122,14 +121,6 @@ void linenoisePrintKeyCodes(void);
 void linenoiseMaskModeEnable(void);
 void linenoiseMaskModeDisable(void);

-/* Encoding functions. */
-typedef size_t(linenoisePrevCharLen)(const char * buf, size_t buf_len, size_t pos, size_t * col_len);
-typedef size_t(linenoiseNextCharLen)(const char * buf, size_t buf_len, size_t pos, size_t * col_len);
-typedef size_t(linenoiseReadCode)(int fd, char * buf, size_t buf_len, int * c);
-
-void linenoiseSetEncodingFunctions(linenoisePrevCharLen * prevCharLenFunc, linenoiseNextCharLen * nextCharLenFunc,
-                                   linenoiseReadCode * readCodeFunc);
-
 #ifdef __cplusplus
 }
 #endif
--- a/examples/run/run.cpp
+++ b/examples/run/run.cpp
@@ -24,7 +24,7 @@
 #include <string>
 #include <vector>

-#include "chat.h"
+#include "chat-template.hpp"
 #include "common.h"
 #include "json.hpp"
 #include "linenoise.cpp/linenoise.h"
@@ -113,7 +113,6 @@ class Opt {
    llama_context_params ctx_params;
    llama_model_params   model_params;
    std::string model_;
-    std::string chat_template_file;
    std::string          user;
    bool                 use_jinja   = false;
    int                  context_size = -1, ngl = -1;
@@ -149,16 +148,6 @@ class Opt {
        return 0;
    }

-    int handle_option_with_value(int argc, const char ** argv, int & i, std::string & option_value) {
-        if (i + 1 >= argc) {
-            return 1;
-        }
-
-        option_value = argv[++i];
-
-        return 0;
-    }
-
    int parse(int argc, const char ** argv) {
        bool options_parsing   = true;
        for (int i = 1, positional_args_i = 0; i < argc; ++i) {
@@ -180,11 +169,6 @@ class Opt {
                verbose = true;
            } else if (options_parsing && strcmp(argv[i], "--jinja") == 0) {
                use_jinja = true;
-            } else if (options_parsing && strcmp(argv[i], "--chat-template-file") == 0){
-                if (handle_option_with_value(argc, argv, i, chat_template_file) == 1) {
-                    return 1;
-                }
-                use_jinja = true;
            } else if (options_parsing && parse_flag(argv, i, "-h", "--help")) {
                help = true;
                return 0;
@@ -223,11 +207,6 @@ class Opt {
            "Options:\n"
            "  -c, --context-size <value>\n"
            "      Context size (default: %d)\n"
-            "  --chat-template-file <path>\n"
-            "      Path to the file containing the chat template to use with the model.\n"
-            "      Only supports jinja templates and implicitly sets the --jinja flag.\n"
-            "  --jinja\n"
-            "      Use jinja templating for the chat template of the model\n"
            "  -n, -ngl, --ngl <value>\n"
            "      Number of GPU layers (default: %d)\n"
            "  --temp <value>\n"
@@ -282,12 +261,13 @@ static int get_terminal_width() {
 #endif
 }

+#ifdef LLAMA_USE_CURL
 class File {
  public:
    FILE * file = nullptr;

    FILE * open(const std::string & filename, const char * mode) {
-        file = ggml_fopen(filename.c_str(), mode);
+        file = fopen(filename.c_str(), mode);

        return file;
    }
@@ -323,20 +303,6 @@ class File {
        return 0;
    }

-    std::string to_string() {
-        fseek(file, 0, SEEK_END);
-        const size_t size = ftell(file);
-        fseek(file, 0, SEEK_SET);
-        std::string out;
-        out.resize(size);
-        const size_t read_size = fread(&out[0], 1, size, file);
-        if (read_size != size) {
-            printe("Error reading file: %s", strerror(errno));
-        }
-
-        return out;
-    }
-
    ~File() {
        if (fd >= 0) {
 #    ifdef _WIN32
@@ -361,7 +327,6 @@ class File {
 #    endif
 };

-#ifdef LLAMA_USE_CURL
 class HttpClient {
  public:
    int init(const std::string & url, const std::vector<std::string> & headers, const std::string & output_file,
@@ -592,7 +557,7 @@ class LlamaData {
    llama_model_ptr                 model;
    llama_sampler_ptr               sampler;
    llama_context_ptr               context;
-    std::vector<llama_chat_message> messages; // TODO: switch to common_chat_msg
+    std::vector<llama_chat_message> messages;
    std::list<std::string>          msg_strs;
    std::vector<char>               fmtted;

@@ -869,23 +834,44 @@ static void add_message(const char * role, const std::string & text, LlamaData &
 }

 // Function to apply the chat template and resize `formatted` if needed
-static int apply_chat_template(const struct common_chat_templates * tmpls, LlamaData & llama_data, const bool append, bool use_jinja) {
-    common_chat_templates_inputs inputs;
-    for (const auto & msg : llama_data.messages) {
-        common_chat_msg cmsg;
-        cmsg.role    = msg.role;
-        cmsg.content = msg.content;
-        inputs.messages.push_back(cmsg);
-    }
-    inputs.add_generation_prompt = append;
-    inputs.use_jinja = use_jinja;
+static int apply_chat_template(const common_chat_template & tmpl, LlamaData & llama_data, const bool append, bool use_jinja) {
+    if (use_jinja) {
+        json messages = json::array();
+        for (const auto & msg : llama_data.messages) {
+            messages.push_back({
+                {"role", msg.role},
+                {"content", msg.content},
+            });
+        }
+        try {
+            minja::chat_template_inputs tmpl_inputs;
+            tmpl_inputs.messages = messages;
+            tmpl_inputs.add_generation_prompt = append;

-    auto chat_params = common_chat_templates_apply(tmpls, inputs);
-    // TODO: use other params for tool calls.
-    auto result = chat_params.prompt;
-    llama_data.fmtted.resize(result.size() + 1);
-    memcpy(llama_data.fmtted.data(), result.c_str(), result.size() + 1);
-    return result.size();
+            minja::chat_template_options tmpl_opts;
+            tmpl_opts.use_bos_token = false;
+            tmpl_opts.use_eos_token = false;
+
+            auto result = tmpl.apply(tmpl_inputs, tmpl_opts);
+            llama_data.fmtted.resize(result.size() + 1);
+            memcpy(llama_data.fmtted.data(), result.c_str(), result.size() + 1);
+            return result.size();
+        } catch (const std::exception & e) {
+            printe("failed to render the chat template: %s\n", e.what());
+            return -1;
+        }
+    }
+    int result = llama_chat_apply_template(
+        tmpl.source().c_str(), llama_data.messages.data(), llama_data.messages.size(), append,
+        append ? llama_data.fmtted.data() : nullptr, append ? llama_data.fmtted.size() : 0);
+    if (append && result > static_cast<int>(llama_data.fmtted.size())) {
+        llama_data.fmtted.resize(result);
+        result = llama_chat_apply_template(tmpl.source().c_str(), llama_data.messages.data(),
+                                           llama_data.messages.size(), append, llama_data.fmtted.data(),
+                                           llama_data.fmtted.size());
+    }
+
+    return result;
 }

 // Function to tokenize the prompt
@@ -977,8 +963,7 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str
 }

 static int read_user_input(std::string & user_input) {
-    static const char * prompt_prefix_env = std::getenv("LLAMA_PROMPT_PREFIX");
-    static const char * prompt_prefix     = prompt_prefix_env ? prompt_prefix_env : "> ";
+    static const char * prompt_prefix = "> ";
 #ifdef WIN32
    printf("\r" LOG_CLR_TO_EOL LOG_COL_DEFAULT "%s", prompt_prefix);

@@ -1030,8 +1015,8 @@ static int generate_response(LlamaData & llama_data, const std::string & prompt,
 }

 // Helper function to apply the chat template and handle errors
-static int apply_chat_template_with_error_handling(const common_chat_templates * tmpls, LlamaData & llama_data, const bool append, int & output_length, bool use_jinja) {
-    const int new_len = apply_chat_template(tmpls, llama_data, append, use_jinja);
+static int apply_chat_template_with_error_handling(const common_chat_template & tmpl, LlamaData & llama_data, const bool append, int & output_length, bool use_jinja) {
+    const int new_len = apply_chat_template(tmpl, llama_data, append, use_jinja);
    if (new_len < 0) {
        printe("failed to apply the chat template\n");
        return -1;
@@ -1089,68 +1074,40 @@ static int get_user_input(std::string & user_input, const std::string & user) {
    return 0;
 }

-// Reads a chat template file to be used
-static std::string read_chat_template_file(const std::string & chat_template_file) {
-    File file;
-    if (!file.open(chat_template_file, "r")) {
-        printe("Error opening chat template file '%s': %s", chat_template_file.c_str(), strerror(errno));
-        return "";
-    }
-
-    return file.to_string();
-}
-
-static int process_user_message(const Opt & opt, const std::string & user_input, LlamaData & llama_data,
-                                const common_chat_templates_ptr & chat_templates, int & prev_len,
-                                const bool stdout_a_terminal) {
-    add_message("user", opt.user.empty() ? user_input : opt.user, llama_data);
-    int new_len;
-    if (apply_chat_template_with_error_handling(chat_templates.get(), llama_data, true, new_len, opt.use_jinja) < 0) {
-        return 1;
-    }
-
-    std::string prompt(llama_data.fmtted.begin() + prev_len, llama_data.fmtted.begin() + new_len);
-    std::string response;
-    if (generate_response(llama_data, prompt, response, stdout_a_terminal)) {
-        return 1;
-    }
-
-    if (!opt.user.empty()) {
-        return 2;
-    }
-
-    add_message("assistant", response, llama_data);
-    if (apply_chat_template_with_error_handling(chat_templates.get(), llama_data, false, prev_len, opt.use_jinja) < 0) {
-        return 1;
-    }
-
-    return 0;
-}
-
 // Main chat loop function
-static int chat_loop(LlamaData & llama_data, const Opt & opt) {
+static int chat_loop(LlamaData & llama_data, const std::string & user, bool use_jinja) {
    int prev_len = 0;
    llama_data.fmtted.resize(llama_n_ctx(llama_data.context.get()));
-    std::string chat_template;
-    if (!opt.chat_template_file.empty()) {
-        chat_template = read_chat_template_file(opt.chat_template_file);
-    }
-
-    common_chat_templates_ptr chat_templates    = common_chat_templates_init(llama_data.model.get(), chat_template);
+    auto chat_templates = common_chat_templates_from_model(llama_data.model.get(), "");
+    GGML_ASSERT(chat_templates.template_default);
    static const bool stdout_a_terminal = is_stdout_a_terminal();
    while (true) {
        // Get user input
        std::string user_input;
-        if (get_user_input(user_input, opt.user) == 1) {
+        if (get_user_input(user_input, user) == 1) {
            return 0;
        }

-        const int ret = process_user_message(opt, user_input, llama_data, chat_templates, prev_len, stdout_a_terminal);
-        if (ret == 1) {
+        add_message("user", user.empty() ? user_input : user, llama_data);
+        int new_len;
+        if (apply_chat_template_with_error_handling(*chat_templates.template_default, llama_data, true, new_len, use_jinja) < 0) {
            return 1;
-        } else if (ret == 2) {
+        }
+
+        std::string prompt(llama_data.fmtted.begin() + prev_len, llama_data.fmtted.begin() + new_len);
+        std::string response;
+        if (generate_response(llama_data, prompt, response, stdout_a_terminal)) {
+            return 1;
+        }
+
+        if (!user.empty()) {
            break;
        }
+
+        add_message("assistant", response, llama_data);
+        if (apply_chat_template_with_error_handling(*chat_templates.template_default, llama_data, false, prev_len, use_jinja) < 0) {
+            return 1;
+        }
    }

    return 0;
@@ -1208,7 +1165,7 @@ int main(int argc, const char ** argv) {
        return 1;
    }

-    if (chat_loop(llama_data, opt)) {
+    if (chat_loop(llama_data, opt.user, opt.use_jinja)) {
        return 1;
    }

--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@@ -5,7 +5,7 @@ option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})

 if (MINGW)
-    # fix: https://github.com/ggml-org/llama.cpp/actions/runs/9651004652/job/26617901362?pr=8006
+    # fix: https://github.com/ggerganov/llama.cpp/actions/runs/9651004652/job/26617901362?pr=8006
    add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
 endif()

--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -7,15 +7,14 @@ Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
 **Features:**
 * LLM inference of F16 and quantized models on GPU and CPU
 * [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes
- * Reranking endoint (WIP: https://github.com/ggml-org/llama.cpp/pull/9510)
+ * Reranking endpoint (WIP: https://github.com/ggerganov/llama.cpp/pull/9510)
 * Parallel decoding with multi-user support
 * Continuous batching
 * Multimodal (wip)
 * Monitoring endpoints
 * Schema-constrained JSON response format
- * [Function calling](../../docs/function-calling.md) / tool use for ~any model

-The project is under active development, and we are [looking for feedback and contributors](https://github.com/ggml-org/llama.cpp/issues/4216).
+The project is under active development, and we are [looking for feedback and contributors](https://github.com/ggerganov/llama.cpp/issues/4216).

 ## Usage

@@ -66,7 +65,7 @@ The project is under active development, and we are [looking for feedback and co
 | `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
 | `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
 | `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock)<br/>(env: LLAMA_ARG_NO_MMAP) |
-| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggml-org/llama.cpp/issues/1437<br/>(env: LLAMA_ARG_NUMA) |
+| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggerganov/llama.cpp/issues/1437<br/>(env: LLAMA_ARG_NUMA) |
 | `-dev, --device <dev1,dev2,..>` | comma-separated list of devices to use for offloading (none = don't offload)<br/>use --list-devices to see a list of available devices<br/>(env: LLAMA_ARG_DEVICE) |
 | `--list-devices` | print list of available devices and exit |
 | `-ngl, --gpu-layers, --n-gpu-layers N` | number of layers to store in VRAM<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
@@ -128,7 +127,6 @@ The project is under active development, and we are [looking for feedback and co
 | `--grammar-file FNAME` | file to read grammar from |
 | `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object<br/>For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead |
 | `--jinja` | Enable experimental Jinja templating engine (required for tool use) |
-| `--reasoning-format FORMAT` | Controls extraction of model thinking traces and the format / field in which they are returned (default: `deepseek`; allowed values: `deepseek`, `none`; requires `--jinja`). `none` will leave thinking traces inline in `message.content` in a model-specific format, while `deepseek` will return them separately under `message.reasoning_content` |

 **Example-specific params**

@@ -179,7 +177,7 @@ Example usage of docker compose with environment variables:
 ```yml
 services:
  llamacpp-server:
-    image: ghcr.io/ggml-org/llama.cpp:server
+    image: ghcr.io/ggerganov/llama.cpp:server
    ports:
      - 8080:8080
    volumes:
@@ -274,10 +272,10 @@ You can consume the endpoints with Postman or NodeJS with axios library. You can
 ### Docker

 ```bash
-docker run -p 8080:8080 -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:server -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080
+docker run -p 8080:8080 -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:server -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080

 # or, with CUDA:
-docker run -p 8080:8080 -v /path/to/models:/models --gpus all ghcr.io/ggml-org/llama.cpp:server-cuda -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080 --n-gpu-layers 99
+docker run -p 8080:8080 -v /path/to/models:/models --gpus all ghcr.io/ggerganov/llama.cpp:server-cuda -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080 --n-gpu-layers 99
 ```

 ## Testing with CURL
@@ -1067,7 +1065,7 @@ print(completion.choices[0].text)

 ### POST `/v1/chat/completions`: OpenAI-compatible Chat Completions API

-Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggml-org/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used.
+Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming modes are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with the OpenAI API spec are being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used.

 *Options:*

@@ -1121,9 +1119,181 @@ curl http://localhost:8080/v1/chat/completions \

 *Tool call support*

-[OpenAI-style function calling](https://platform.openai.com/docs/guides/function-calling) is supported with the `--jinja` flag (and may require a `--chat-template-file` override to get the right tool-use compatible Jinja template; worst case, `--chat-template chatml` may also work).
+[Function calling](https://platform.openai.com/docs/guides/function-calling) is supported for all models (see https://github.com/ggerganov/llama.cpp/pull/9639):

-**See our [Function calling](../../docs/function-calling.md) docs** for more details, supported native tool call styles (generic tool call style is used as fallback) / examples of use.
+- Requires `--jinja` flag
+- Native tool call formats supported:
+  - Llama 3.1 / 3.3 (including builtin tools support - tool names for `wolfram_alpha`, `web_search` / `brave_search`, `code_interpreter`), Llama 3.2
+  - Functionary v3.1 / v3.2
+  - Hermes 2/3, Qwen 2.5
+  - Mistral Nemo
+  - Firefunction v2
+  - Command R7B
+  - DeepSeek R1 (WIP / seems reluctant to call any tools?)
+
+  <details>
+  <summary>Show some common templates and which format handler they use</summary>
+
+  | Template | Format |
+  |----------|--------|
+  | CohereForAI-c4ai-command-r-plus-default.jinja | generic tool calls |
+  | CohereForAI-c4ai-command-r-plus-rag.jinja | generic tool calls |
+  | CohereForAI-c4ai-command-r-plus-tool_use.jinja | generic tool calls |
+  | MiniMaxAI-MiniMax-Text-01.jinja | generic tool calls |
+  | NexaAIDev-Octopus-v2.jinja | generic tool calls |
+  | NousResearch-Hermes-2-Pro-Llama-3-8B-default.jinja | generic tool calls |
+  | NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja | hermes 2 pro tool calls |
+  | NousResearch-Hermes-2-Pro-Mistral-7B-default.jinja | generic tool calls |
+  | NousResearch-Hermes-2-Pro-Mistral-7B-tool_use.jinja | hermes 2 pro tool calls |
+  | NousResearch-Hermes-3-Llama-3.1-70B-default.jinja | generic tool calls |
+  | NousResearch-Hermes-3-Llama-3.1-70B-tool_use.jinja | hermes 2 pro tool calls |
+  | OrionStarAI-Orion-14B-Chat.jinja | generic tool calls |
+  | Qwen-QwQ-32B-Preview.jinja | hermes 2 pro tool calls |
+  | Qwen-Qwen2-7B-Instruct.jinja | generic tool calls |
+  | Qwen-Qwen2-VL-7B-Instruct.jinja | generic tool calls |
+  | Qwen-Qwen2.5-7B-Instruct.jinja | hermes 2 pro tool calls |
+  | Qwen-Qwen2.5-Math-7B-Instruct.jinja | hermes 2 pro tool calls |
+  | TheBloke-FusionNet_34Bx2_MoE-AWQ.jinja | generic tool calls |
+  | abacusai-Fewshot-Metamath-OrcaVicuna-Mistral.jinja | generic tool calls |
+  | bofenghuang-vigogne-2-70b-chat.jinja | generic tool calls |
+  | databricks-dbrx-instruct.jinja | generic tool calls |
+  | deepseek-ai-DeepSeek-Coder-V2-Instruct.jinja | generic tool calls |
+  | deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja | deepseek r1 tool calls |
+  | deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja | deepseek r1 tool calls |
+  | deepseek-ai-DeepSeek-R1-Distill-Qwen-7B.jinja | deepseek r1 tool calls |
+  | deepseek-ai-DeepSeek-V2.5.jinja | deepseek r1 tool calls |
+  | deepseek-ai-deepseek-coder-33b-instruct.jinja | generic tool calls |
+  | google-gemma-2-2b-it.jinja | generic tool calls |
+  | google-gemma-7b-it.jinja | generic tool calls |
+  | indischepartij-MiniCPM-3B-OpenHermes-2.5-v2.jinja | generic tool calls |
+  | mattshumer-Reflection-Llama-3.1-70B.jinja | generic tool calls |
+  | meetkai-functionary-medium-v3.2.jinja | functionary v3.2 tool calls |
+  | meta-llama-Llama-3.1-8B-Instruct.jinja | llama 3.x tool calls (w/ builtin tools) |
+  | meta-llama-Llama-3.2-3B-Instruct.jinja | llama 3.x tool calls |
+  | meta-llama-Llama-3.3-70B-Instruct.jinja | llama 3.x tool calls (w/ builtin tools) |
+  | meta-llama-Meta-Llama-3.1-8B-Instruct.jinja | llama 3.x tool calls (w/ builtin tools) |
+  | microsoft-Phi-3-medium-4k-instruct.jinja | generic tool calls |
+  | microsoft-Phi-3-mini-4k-instruct.jinja | generic tool calls |
+  | microsoft-Phi-3-small-8k-instruct.jinja | generic tool calls |
+  | microsoft-Phi-3.5-mini-instruct.jinja | generic tool calls |
+  | microsoft-Phi-3.5-vision-instruct.jinja | generic tool calls |
+  | mistralai-Mistral-7B-Instruct-v0.2.jinja | generic tool calls |
+  | mistralai-Mistral-Large-Instruct-2407.jinja | mistral nemo tool calls |
+  | mistralai-Mistral-Large-Instruct-2411.jinja | generic tool calls |
+  | mistralai-Mistral-Nemo-Instruct-2407.jinja | mistral nemo tool calls |
+  | mistralai-Mixtral-8x7B-Instruct-v0.1.jinja | generic tool calls |
+  | mlabonne-AlphaMonarch-7B.jinja | generic tool calls |
+  | nvidia-Llama-3.1-Nemotron-70B-Instruct-HF.jinja | llama 3.x tool calls (w/ builtin tools) |
+  | openchat-openchat-3.5-0106.jinja | generic tool calls |
+  | teknium-OpenHermes-2.5-Mistral-7B.jinja | generic tool calls |
+
+  This table can be generated with:
+
+  ```bash
+  ./build/bin/test-chat ../minja/build/tests/*.jinja 2>/dev/null
+  ```
+  </details>
+
+- Generic tool call is supported when the template isn't recognized by native format handlers (you'll see `Chat format: Generic` in the logs).
+  - Use `--chat-template-file` to override the template when appropriate (see examples below)
+  - Generic support may consume more tokens and be less efficient than a model's native format.
+
+- Run with:
+
+  ```shell
+  # Native support:
+  llama-server --jinja -fa -hf bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M
+  llama-server --jinja -fa -hf bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q6_K_L
+  llama-server --jinja -fa -hf bartowski/functionary-small-v3.2-GGUF:Q4_K_M
+  llama-server --jinja -fa -hf bartowski/Llama-3.3-70B-Instruct-GGUF:Q4_K_M
+
+  # Native support requires the right template for these GGUFs:
+
+  llama-server --jinja -fa -hf bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M \
+    --chat-template-file <( python scripts/get_chat_template.py NousResearch/Hermes-2-Pro-Llama-3-8B tool_use )
+
+  llama-server --jinja -fa -hf bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M \
+    --chat-template-file <( python scripts/get_chat_template.py NousResearch/Hermes-3-Llama-3.1-8B tool_use )
+
+  llama-server --jinja -fa -hf bartowski/firefunction-v2-GGUF -hff firefunction-v2-IQ1_M.gguf \
+    --chat-template-file <( python scripts/get_chat_template.py fireworks-ai/llama-3-firefunction-v2 tool_use )
+
+  llama-server --jinja -fa -hf bartowski/c4ai-command-r7b-12-2024-GGUF:Q6_K_L \
+    --chat-template-file <( python scripts/get_chat_template.py CohereForAI/c4ai-command-r7b-12-2024 tool_use )
+
+  # Generic format support
+  llama-server --jinja -fa -hf bartowski/phi-4-GGUF:Q4_0
+  llama-server --jinja -fa -hf bartowski/gemma-2-2b-it-GGUF:Q8_0
+  llama-server --jinja -fa -hf bartowski/c4ai-command-r-v01-GGUF:Q2_K
+  ```
+
+- Test in CLI:
+
+  ```bash
+  curl http://localhost:8080/v1/chat/completions -d '{
+    "model": "gpt-3.5-turbo",
+    "tools": [
+      {
+        "type":"function",
+        "function":{
+          "name":"get_current_weather",
+          "description":"Get the current weather in a given location",
+          "parameters":{
+            "type":"object",
+            "properties":{
+              "location":{
+                "type":"string",
+                "description":"The city and state, e.g. San Francisco, CA"
+              }
+            },
+            "required":["location"]
+          }
+        }
+      }
+    ],
+    "messages": [
+      {
+        "role": "user",
+        "content": "What is the weather like in Istanbul?"
+      }
+    ]
+  }'
+  ```
+
+  <details>
+  <summary>Show output</summary>
+
+  ```json
+  {
+    "choices": [
+      {
+        "finish_reason": "tool",
+        "index": 0,
+        "message": {
+          "content": null,
+          "tool_calls": [
+            {
+              "name": "python",
+              "arguments": "{\"code\":\" \\nprint(\\\"Hello, World!\\\")\"}"
+            }
+          ],
+          "role": "assistant"
+        }
+      }
+    ],
+    "created": 1727287211,
+    "model": "gpt-3.5-turbo",
+    "object": "chat.completion",
+    "usage": {
+      "completion_tokens": 16,
+      "prompt_tokens": 44,
+      "total_tokens": 60
+    },
+    "id": "chatcmpl-Htbgh9feMmGM0LEH2hmQvwsCxq3c6Ni8"
+  }
+  ```
+
+  </details>

 ### POST `/v1/embeddings`: OpenAI-compatible embeddings API

@@ -1228,7 +1398,7 @@ Apart from error types supported by OAI, we also have custom types that are spec

 ### Legacy completion web UI

-A new chat-based UI has replaced the old completion-based since [this PR](https://github.com/ggml-org/llama.cpp/pull/10175). If you want to use the old completion, start the server with `--path ./examples/server/public_legacy`
+A new chat-based UI has replaced the old completion-based since [this PR](https://github.com/ggerganov/llama.cpp/pull/10175). If you want to use the old completion, start the server with `--path ./examples/server/public_legacy`

 For example:

--- a/examples/server/httplib.h
+++ b/examples/server/httplib.h
--- a/examples/server/public/index.html.gz
+++ b/examples/server/public/index.html.gz
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -42,7 +42,7 @@ enum stop_type {
    STOP_TYPE_LIMIT,
 };

-// state diagram: https://github.com/ggml-org/llama.cpp/pull/9283
+// state diagram: https://github.com/ggerganov/llama.cpp/pull/9283
 enum slot_state {
    SLOT_STATE_IDLE,
    SLOT_STATE_STARTED, // TODO: this state is only used for setting up the initial prompt processing; maybe merge it with launch_slot_with_task in the future
@@ -173,7 +173,6 @@ struct slot_params {
            {"grammar_trigger_words",     grammar_trigger_words},
            {"grammar_trigger_tokens",    sampling.grammar_trigger_tokens},
            {"preserved_tokens",          sampling.preserved_tokens},
-            {"chat_format",               common_chat_format_name(oaicompat_chat_format)},
            {"samplers",                  samplers},
            {"speculative.n_max",         speculative.n_max},
            {"speculative.n_min",         speculative.n_min},
@@ -274,7 +273,7 @@ struct server_task {
        params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min);

        params.speculative.n_min = std::min(params.speculative.n_max, params.speculative.n_min);
-        params.speculative.n_min = std::max(params.speculative.n_min, 0);
+        params.speculative.n_min = std::max(params.speculative.n_min, 2);
        params.speculative.n_max = std::max(params.speculative.n_max, 0);

        // Use OpenAI API logprobs only if n_probs wasn't provided
@@ -329,6 +328,9 @@ struct server_task {
        }

        // process "json_schema" and "grammar"
+        if (data.contains("json_schema") && !data.at("json_schema").is_null() && data.contains("grammar") && !data.at("grammar").is_null()) {
+            throw std::runtime_error("Either \"json_schema\" or \"grammar\" can be specified, but not both");
+        }
        if (data.contains("json_schema") && !data.contains("grammar")) {
            try {
                auto schema                  = json_value(data, "json_schema", json::object());
@@ -722,19 +724,9 @@ struct server_task_result_cmpl_final : server_task_result {
            msg.content = content;
        }

-        json message {
-            {"role", "assistant"},
-        };
-        if (!msg.reasoning_content.empty()) {
-            message["reasoning_content"] = msg.reasoning_content;
-        }
-        if (msg.content.empty() && !msg.tool_calls.empty()) {
-            message["content"] = json();
-        } else {
-            message["content"] = msg.content;
-        }
+        json tool_calls;
        if (!msg.tool_calls.empty()) {
-            auto tool_calls = json::array();
+            tool_calls = json::array();
            for (const auto & tc : msg.tool_calls) {
                tool_calls.push_back({
                    {"type", "function"},
@@ -745,7 +737,15 @@ struct server_task_result_cmpl_final : server_task_result {
                    {"id", tc.id},
                });
            }
-            message["tool_calls"] = tool_calls;
+        }
+
+        json message {
+            {"content", msg.content},
+            {"tool_calls", tool_calls},
+            {"role", "assistant"},
+        };
+        if (!msg.tool_plan.empty()) {
+            message["tool_plan"] = msg.tool_plan;
        }

        json choice {
@@ -1600,10 +1600,6 @@ struct server_queue {

            while (true) {
                std::unique_lock<std::mutex> lock(mutex_tasks);
-                if (!running) {
-                    QUE_DBG("%s", "terminate\n");
-                    return;
-                }
                if (queue_tasks.empty()) {
                    lock.unlock();
                    break;
@@ -1624,11 +1620,11 @@ struct server_queue {
            QUE_DBG("%s", "waiting for new tasks\n");
            {
                std::unique_lock<std::mutex> lock(mutex_tasks);
-                if (!running) {
-                    QUE_DBG("%s", "terminate\n");
-                    return;
-                }
                if (queue_tasks.empty()) {
+                    if (!running) {
+                        QUE_DBG("%s", "terminate\n");
+                        return;
+                    }
                    condition_tasks.wait(lock, [&]{
                        return (!queue_tasks.empty() || !running);
                    });
@@ -1804,7 +1800,7 @@ struct server_context {
    // Necessary similarity of prompt for slot selection
    float slot_prompt_similarity = 0.0f;

-    common_chat_templates_ptr chat_templates;
+    common_chat_templates chat_templates;

    ~server_context() {
        // Clear any sampling context
@@ -1888,17 +1884,45 @@ struct server_context {
            llama_init_dft.context.reset();
        }

-        chat_templates = common_chat_templates_init(model, params_base.chat_template);
-        try {
-            common_chat_format_example(chat_templates.get(), params.use_jinja);
-        } catch (const std::exception & e) {
+        if (params_base.chat_template.empty() && !validate_builtin_chat_template(params.use_jinja)) {
            SRV_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
-            chat_templates = common_chat_templates_init(model, "chatml");
+            chat_templates = common_chat_templates_from_model(model, "chatml");
+        } else {
+            chat_templates = common_chat_templates_from_model(model, params_base.chat_template);
        }
+        GGML_ASSERT(chat_templates.template_default.get() != nullptr);

        return true;
    }

+    bool validate_builtin_chat_template(bool use_jinja) const { // reports whether the model's chat template can be applied to a one-message test chat
+        llama_chat_message chat[] = {{"user", "test"}}; // probe conversation; only read by the non-Jinja branch below
+
+        if (use_jinja) {
+            auto templates = common_chat_templates_from_model(model, ""); // empty override string; presumably selects the template(s) shipped with the model (confirm against the helper)
+            common_chat_inputs inputs;
+            inputs.messages = json::array({{
+                {"role", "user"},
+                {"content", "test"},
+            }});
+            GGML_ASSERT(templates.template_default); // the default template is dereferenced below, so it must be present
+            try {
+                common_chat_params_init(*templates.template_default, inputs); // an exception thrown here is reported as "not supported"
+                if (templates.template_tool_use) { // the tool-use template is optional; probe it only when present
+                    common_chat_params_init(*templates.template_tool_use, inputs);
+                }
+                return true;
+            } catch (const std::exception & e) {
+                SRV_ERR("failed to apply template: %s\n", e.what());
+                return false;
+            }
+        } else {
+            const char * tmpl = llama_model_chat_template(model, /* name */ nullptr);
+            const int32_t chat_res = llama_chat_apply_template(tmpl, chat, 1, true, nullptr, 0); // null buffer with length 0: only the return code is inspected
+            return chat_res > 0; // a positive result is treated as "template supported"
+        }
+    }
+
    void init() {
        const int32_t n_ctx_slot = n_ctx / params_base.n_parallel;

@@ -2045,8 +2069,8 @@ struct server_context {

        if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
            // Might be better to reject the request with a 400 ?
-            SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d", slot.params.n_predict, slot.n_predict);
            slot.params.n_predict = slot.n_predict;
+            SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d", slot.n_predict, slot.n_predict);
        }

        if (slot.params.ignore_eos && has_eos_token) {
@@ -2251,7 +2275,7 @@ struct server_context {
            for (size_t i = 0; i < std::min(max_probs, n_probs); i++) {
                result.probs.push_back({
                    cur_p->data[i].id,
-                    common_token_to_piece(ctx, cur_p->data[i].id, special),
+                    common_detokenize(ctx, {cur_p->data[i].id}, special),
                    cur_p->data[i].p
                });
            }
@@ -2273,7 +2297,7 @@ struct server_context {
            for (size_t i = 0; i < std::min(n_vocab, n_probs); i++) {
                result.probs.push_back({
                    cur[i].id,
-                    common_token_to_piece(ctx, cur[i].id, special),
+                    common_detokenize(ctx, {cur[i].id}, special),
                    cur[i].p
                });
            }
@@ -3625,7 +3649,7 @@ int main(int argc, char ** argv) {
            }, {
                    {"name",  "n_busy_slots_per_decode"},
                    {"help",  "Average number of busy slots per llama_decode() call"},
-                    {"value",  (float) res_metrics->n_busy_slots_total / std::max((float) res_metrics->n_decode_total, 1.f)}
+                    {"value",  (float) res_metrics->n_busy_slots_total / (float) res_metrics->n_decode_total}
            }}},
            {"gauge", {{
                    {"name",  "prompt_tokens_seconds"},
@@ -3791,15 +3815,13 @@ int main(int argc, char ** argv) {
            { "default_generation_settings", ctx_server.default_generation_settings_for_props },
            { "total_slots",                 ctx_server.params_base.n_parallel },
            { "model_path",                  ctx_server.params_base.model },
-            { "chat_template",               common_chat_templates_source(ctx_server.chat_templates.get()) },
-            { "bos_token",                   common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)},
-            { "eos_token",                   common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)},
+            { "chat_template",               ctx_server.chat_templates.template_default->source() },
+            { "bos_token",                   ctx_server.chat_templates.template_default->bos_token() },
+            { "eos_token",                   ctx_server.chat_templates.template_default->eos_token() },
            { "build_info",                  build_info },
        };
-        if (ctx_server.params_base.use_jinja) {
-            if (auto tool_use_src = common_chat_templates_source(ctx_server.chat_templates.get(), "tool_use")) {
-                data["chat_template_tool_use"] = tool_use_src;
-            }
+        if (ctx_server.params_base.use_jinja && ctx_server.chat_templates.template_tool_use) {
+            data["chat_template_tool_use"] = ctx_server.chat_templates.template_tool_use->source();
        }

        res_ok(res, data);
@@ -4034,7 +4056,7 @@ int main(int argc, char ** argv) {
        }

        auto body = json::parse(req.body);
-        json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates.get());
+        json data = oaicompat_completion_params_parse(body, params.use_jinja, ctx_server.chat_templates);

        return handle_completions_impl(
            SERVER_TASK_TYPE_COMPLETION,
@@ -4047,7 +4069,7 @@ int main(int argc, char ** argv) {
    // same with handle_chat_completions, but without inference part
    const auto handle_apply_template = [&ctx_server, &params, &res_ok](const httplib::Request & req, httplib::Response & res) {
        auto body = json::parse(req.body);
-        json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates.get());
+        json data = oaicompat_completion_params_parse(body, params.use_jinja, ctx_server.chat_templates);
        res_ok(res, {{ "prompt", std::move(data.at("prompt")) }});
    };

@@ -4234,11 +4256,6 @@ int main(int argc, char ** argv) {
        //    return;
        //}

-        // if true, use TEI API format, otherwise use Jina API format
-        // Jina: https://jina.ai/reranker/
-        // TEI: https://huggingface.github.io/text-embeddings-inference/#/Text%20Embeddings%20Inference/rerank
-        bool is_tei_format = body.contains("texts");
-
        json query;
        if (body.count("query") == 1) {
            query = body.at("query");
@@ -4251,8 +4268,7 @@ int main(int argc, char ** argv) {
            return;
        }

-        std::vector<std::string> documents = json_value(body, "documents",
-                                             json_value(body, "texts", std::vector<std::string>()));
+        std::vector<std::string> documents = json_value(body, "documents", std::vector<std::string>());
        if (documents.empty()) {
            res_error(res, format_error_response("\"documents\" must be a non-empty string array", ERROR_TYPE_INVALID_REQUEST));
            return;
@@ -4297,12 +4313,7 @@ int main(int argc, char ** argv) {
        }

        // write JSON response
-        json root = format_response_rerank(
-            body,
-            responses,
-            is_tei_format,
-            documents);
-
+        json root = format_response_rerank(body, responses);
        res_ok(res, root);
    };

@@ -4419,7 +4430,6 @@ int main(int argc, char ** argv) {

    // clean up function, to be called before exit
    auto clean_up = [&svr]() {
-        SRV_INF("%s: cleaning up before exit...\n", __func__);
        svr->stop();
        llama_backend_free();
    };
@@ -4436,6 +4446,10 @@ int main(int argc, char ** argv) {
    }

    if (!was_bound) {
+        //LOG_ERROR("couldn't bind HTTP server socket", {
+        //    {"hostname", params.hostname},
+        //    {"port", params.port},
+        //});
        LOG_ERR("%s: couldn't bind HTTP server socket, hostname: %s, port: %d\n", __func__, params.hostname.c_str(), params.port);
        clean_up();
        return 1;
@@ -4452,7 +4466,7 @@ int main(int argc, char ** argv) {

    if (!ctx_server.load_model(params)) {
        clean_up();
-        // t.join(); // FIXME: see below
+        t.join();
        LOG_ERR("%s: exiting due to model loading error\n", __func__);
        return 1;
    }
@@ -4464,8 +4478,8 @@ int main(int argc, char ** argv) {

    // print sample chat example to make it clear which template is used
    LOG_INF("%s: chat template, chat_template: %s, example_format: '%s'\n", __func__,
-        common_chat_templates_source(ctx_server.chat_templates.get()),
-        common_chat_format_example(ctx_server.chat_templates.get(), ctx_server.params_base.use_jinja).c_str());
+        ctx_server.chat_templates.template_default->source().c_str(),
+        common_chat_format_example(*ctx_server.chat_templates.template_default, ctx_server.params_base.use_jinja).c_str());

    ctx_server.queue_tasks.on_new_task([&ctx_server](const server_task & task) {
        ctx_server.process_single_task(task);
@@ -4476,10 +4490,13 @@ int main(int argc, char ** argv) {
    });

    shutdown_handler = [&](int) {
-        // this will unblock start_loop()
        ctx_server.queue_tasks.terminate();
    };

+    LOG_INF("%s: server is listening on http://%s:%d - starting the main loop\n", __func__, params.hostname.c_str(), params.port);
+
+    ctx_server.queue_tasks.start_loop();
+
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
    struct sigaction sigint_action;
    sigint_action.sa_handler = signal_handler;
@@ -4494,13 +4511,8 @@ int main(int argc, char ** argv) {
    SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
 #endif

-    LOG_INF("%s: server is listening on http://%s:%d - starting the main loop\n", __func__, params.hostname.c_str(), params.port);
-
-    // this call blocks the main thread until queue_tasks.terminate() is called
-    ctx_server.queue_tasks.start_loop();
-
    clean_up();
-    // t.join(); // FIXME: http thread may stuck if there is an on-going request. we don't need to care about this for now as the HTTP connection will already be closed at this point, but it's better to fix this
+    t.join();

    return 0;
 }
--- a/examples/server/tests/README.md
+++ b/examples/server/tests/README.md
@@ -48,7 +48,7 @@ DEBUG=1 ./tests.sh -s -v -x
 To run all the tests in a file:

 ```shell
-./tests.sh unit/test_chat_completion.py -v -x
+./tests.sh unit/test_chat_completion.py -v -x
 ```

 To run a single test:
--- a/examples/server/tests/unit/test_chat_completion.py
+++ b/examples/server/tests/unit/test_chat_completion.py
@@ -21,8 +21,6 @@ def create_server():
        (None, "Book", "What is the best book", 8, "^ blue",                    23, 8, "length", True, "This is not a chat template, it is"),
        ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length", False, None),
        ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length", True, None),
-        (None, "Book", [{"type": "text", "text": "What is"}, {"type": "text", "text": "the best book"}], 8, "Whillicter", 79, 8, "length", False, None),
-        (None, "Book", [{"type": "text", "text": "What is"}, {"type": "text", "text": "the best book"}], 8, "Whillicter", 79, 8, "length", True, None),
    ]
 )
 def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, finish_reason, jinja, chat_template):
@@ -46,7 +44,7 @@ def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_conte
    assert res.body["usage"]["completion_tokens"] == n_predicted
    choice = res.body["choices"][0]
    assert "assistant" == choice["message"]["role"]
-    assert match_regex(re_content, choice["message"]["content"]), f'Expected {re_content}, got {choice["message"]["content"]}'
+    assert match_regex(re_content, choice["message"]["content"])
    assert choice["finish_reason"] == finish_reason


@@ -171,47 +169,6 @@ def test_completion_with_response_format(response_format: dict, n_predicted: int
        assert "error" in res.body


-@pytest.mark.parametrize("jinja,json_schema,n_predicted,re_content", [
-    (False, {"const": "42"}, 6, "\"42\""),
-    (True, {"const": "42"}, 6, "\"42\""),
-])
-def test_completion_with_json_schema(jinja: bool, json_schema: dict, n_predicted: int, re_content: str):
-    global server
-    server.jinja = jinja
-    server.start()
-    res = server.make_request("POST", "/chat/completions", data={
-        "max_tokens": n_predicted,
-        "messages": [
-            {"role": "system", "content": "You are a coding assistant."},
-            {"role": "user", "content": "Write an example"},
-        ],
-        "json_schema": json_schema,
-    })
-    assert res.status_code == 200, f'Expected 200, got {res.status_code}'
-    choice = res.body["choices"][0]
-    assert match_regex(re_content, choice["message"]["content"]), f'Expected {re_content}, got {choice["message"]["content"]}'
-
-
-@pytest.mark.parametrize("jinja,grammar,n_predicted,re_content", [
-    (False, 'root ::= "a"{5,5}', 6, "a{5,5}"),
-    (True, 'root ::= "a"{5,5}', 6, "a{5,5}"),
-])
-def test_completion_with_grammar(jinja: bool, grammar: str, n_predicted: int, re_content: str):
-    global server
-    server.jinja = jinja
-    server.start()
-    res = server.make_request("POST", "/chat/completions", data={
-        "max_tokens": n_predicted,
-        "messages": [
-            {"role": "user", "content": "Does not matter what I say, does it?"},
-        ],
-        "grammar": grammar,
-    })
-    assert res.status_code == 200, res.body
-    choice = res.body["choices"][0]
-    assert match_regex(re_content, choice["message"]["content"]), choice["message"]["content"]
-
-
@pytest.mark.parametrize("messages", [
    None,
    "string",
--- a/examples/server/tests/unit/test_rerank.py
+++ b/examples/server/tests/unit/test_rerank.py
@@ -10,20 +10,17 @@ def create_server():
    server = ServerPreset.jina_reranker_tiny()


-TEST_DOCUMENTS = [
-    "A machine is a physical system that uses power to apply forces and control movement to perform an action. The term is commonly applied to artificial devices, such as those employing engines or motors, but also to natural biological macromolecules, such as molecular machines.",
-    "Learning is the process of acquiring new understanding, knowledge, behaviors, skills, values, attitudes, and preferences. The ability to learn is possessed by humans, non-human animals, and some machines; there is also evidence for some kind of learning in certain plants.",
-    "Machine learning is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions.",
-    "Paris, capitale de la France, est une grande ville européenne et un centre mondial de l'art, de la mode, de la gastronomie et de la culture. Son paysage urbain du XIXe siècle est traversé par de larges boulevards et la Seine."
-]
-
-
 def test_rerank():
    global server
    server.start()
    res = server.make_request("POST", "/rerank", data={
        "query": "Machine learning is",
-        "documents": TEST_DOCUMENTS,
+        "documents": [
+            "A machine is a physical system that uses power to apply forces and control movement to perform an action. The term is commonly applied to artificial devices, such as those employing engines or motors, but also to natural biological macromolecules, such as molecular machines.",
+            "Learning is the process of acquiring new understanding, knowledge, behaviors, skills, values, attitudes, and preferences. The ability to learn is possessed by humans, non-human animals, and some machines; there is also evidence for some kind of learning in certain plants.",
+            "Machine learning is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions.",
+            "Paris, capitale de la France, est une grande ville européenne et un centre mondial de l'art, de la mode, de la gastronomie et de la culture. Son paysage urbain du XIXe siècle est traversé par de larges boulevards et la Seine."
+        ]
    })
    assert res.status_code == 200
    assert len(res.body["results"]) == 4
@@ -41,29 +38,6 @@ def test_rerank():
    assert least_relevant["index"] == 3


-def test_rerank_tei_format():
-    global server
-    server.start()
-    res = server.make_request("POST", "/rerank", data={
-        "query": "Machine learning is",
-        "texts": TEST_DOCUMENTS,
-    })
-    assert res.status_code == 200
-    assert len(res.body) == 4
-
-    most_relevant = res.body[0]
-    least_relevant = res.body[0]
-    for doc in res.body:
-        if doc["score"] > most_relevant["score"]:
-            most_relevant = doc
-        if doc["score"] < least_relevant["score"]:
-            least_relevant = doc
-
-    assert most_relevant["score"] > least_relevant["score"]
-    assert most_relevant["index"] == 2
-    assert least_relevant["index"] == 3
-
-
@pytest.mark.parametrize("documents", [
    [],
    None,
--- a/examples/server/tests/unit/test_tool_call.py
+++ b/examples/server/tests/unit/test_tool_call.py
@@ -92,7 +92,6 @@ def do_test_completion_with_required_tool_tiny(template_name: str, tool: dict, a
    tool_calls = choice["message"].get("tool_calls")
    assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}'
    tool_call = tool_calls[0]
-    assert choice["message"].get("content") is None, f'Expected no content in {choice["message"]}'
    expected_function_name = "python" if tool["type"] == "code_interpreter" else tool["function"]["name"]
    assert expected_function_name == tool_call["function"]["name"]
    actual_arguments = tool_call["function"]["arguments"]
@@ -156,11 +155,11 @@ def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict,

    (TEST_TOOL,    "success",  "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
    (PYTHON_TOOL,  "code",     "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
-    # (PYTHON_TOOL,  "code",     "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"),
+    (PYTHON_TOOL,  "code",     "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"),

    (TEST_TOOL,    "success",  "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M",   ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
    (PYTHON_TOOL,  "code",     "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M",   ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
-    # (PYTHON_TOOL,  "code",     "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M",   "chatml"),
+    (PYTHON_TOOL,  "code",     "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M",   "chatml"),

    (TEST_TOOL,    "success",  "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
    (PYTHON_TOOL,  "code",     "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
@@ -176,7 +175,7 @@ def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict,

    (TEST_TOOL,    "success",  "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M",      ("meta-llama/Llama-3.2-3B-Instruct", None)),
    (PYTHON_TOOL,  "code",     "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M",      ("meta-llama/Llama-3.2-3B-Instruct", None)),
-    # (PYTHON_TOOL,  "code",     "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M",      "chatml"),
+    (PYTHON_TOOL,  "code",     "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M",      "chatml"),
    # TODO: fix these
    # (TEST_TOOL,    "success",  "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
    # (PYTHON_TOOL,  "code",     "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
@@ -215,7 +214,6 @@ def test_completion_with_required_tool_real_model(tool: dict, argument_key: str
    tool_calls = choice["message"].get("tool_calls")
    assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}'
    tool_call = tool_calls[0]
-    assert choice["message"].get("content") is None, f'Expected no content in {choice["message"]}'
    expected_function_name = "python" if tool["type"] == "code_interpreter" else tool["function"]["name"]
    assert expected_function_name == tool_call["function"]["name"]
    actual_arguments = tool_call["function"]["arguments"]
@@ -275,6 +273,7 @@ def test_completion_without_tool_call_slow(template_name: str, n_predict: int, t

@pytest.mark.slow
@pytest.mark.parametrize("hf_repo,template_override", [
+    ("bartowski/c4ai-command-r7b-12-2024-GGUF:Q4_K_M",   ("CohereForAI/c4ai-command-r7b-12-2024", "tool_use")),
    ("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
    ("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"),

@@ -299,16 +298,13 @@ def test_completion_without_tool_call_slow(template_name: str, n_predict: int, t
    ("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M",      ("meta-llama/Llama-3.2-3B-Instruct", None)),
    ("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M",      "chatml"),

-    ("bartowski/c4ai-command-r7b-12-2024-GGUF:Q6_K_L",   ("CohereForAI/c4ai-command-r7b-12-2024", "tool_use")),
-
-    ("bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
-
    # Note: gemma-2-2b-it knows itself as "model", not "assistant", so we don't test the ill-suited chatml on it.
    ("bartowski/gemma-2-2b-it-GGUF:Q4_K_M",              None),

    # ("bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
+    # ("bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
 ])
-def test_weather(hf_repo: str, template_override: str | Tuple[str, str | None] | None):
+def test_weather(hf_repo: str, template_override: Tuple[str, str | None] | None):
    global server
    n_predict = 512
    server.n_slots = 1
@@ -327,7 +323,6 @@ def test_weather(hf_repo: str, template_override: str | Tuple[str, str | None] |
    res = server.make_request("POST", "/chat/completions", data={
        "max_tokens": n_predict,
        "messages": [
-            {"role": "system", "content": "You are a chatbot that uses tools/functions. Dont overthink things."},
            {"role": "user", "content": "What is the weather in Istanbul?"},
        ],
        "tools": [WEATHER_TOOL],
@@ -337,7 +332,6 @@ def test_weather(hf_repo: str, template_override: str | Tuple[str, str | None] |
    tool_calls = choice["message"].get("tool_calls")
    assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}'
    tool_call = tool_calls[0]
-    assert choice["message"].get("content") is None, f'Expected no content in {choice["message"]}'
    assert tool_call["function"]["name"] == WEATHER_TOOL["function"]["name"]
    actual_arguments = json.loads(tool_call["function"]["arguments"])
    assert 'location' in actual_arguments, f"location not found in {json.dumps(actual_arguments)}"
@@ -346,166 +340,22 @@ def test_weather(hf_repo: str, template_override: str | Tuple[str, str | None] |
    assert re.match('^Istanbul(, (TR|Turkey|Türkiye))?$', location), f'Expected Istanbul for location, got {location}'


-@pytest.mark.slow
-@pytest.mark.parametrize("result_override,n_predict,hf_repo,template_override", [
-    (None,                                           128,  "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",       "chatml"),
-    (None,                                           128,  "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M",         None),
-    (None,                                           128,  "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M",         "chatml"),
-    (None,                                           128,  "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M",     ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
-    (None,                                           128,  "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M",       ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
-    (None,                                           128,  "bartowski/functionary-small-v3.2-GGUF:Q8_0",        ("meetkai/functionary-medium-v3.2", None)),
-    (None,                                           128,  "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M",  None),
-    (None,                                           128,  "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M",  None),
-    (None,                                           128,  "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M",  "chatml"),
-    (None,                                           128,  "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",       None),
-
-    # TODO: fix these (wrong results, either didn't respect decimal instruction or got wrong value)
-    ("[\\s\\S]*?\\*\\*\\s*0.5($|\\*\\*)",            8192, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
-    # ("[\\s\\S]*?\\*\\*\\s*0.5($|\\*\\*)",            8192, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)),
-])
-def test_calc_result(result_override: str | None, n_predict: int, hf_repo: str, template_override: str | Tuple[str, str | None] | None):
-    global server
-    # n_predict = 512
-    server.n_slots = 1
-    server.jinja = True
-    server.n_ctx = 8192 * 2
-    server.n_predict = n_predict
-    server.model_hf_repo = hf_repo
-    server.model_hf_file = None
-    if isinstance(template_override, tuple):
-        (template_hf_repo, template_variant) = template_override
-        server.chat_template_file = f"../../../models/templates/{template_hf_repo.replace('/', '-') + ('-' + template_variant if template_variant else '')}.jinja"
-        assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template."
-    elif isinstance(template_override, str):
-        server.chat_template = template_override
-    server.start(timeout_seconds=TIMEOUT_SERVER_START)
-    res = server.make_request("POST", "/chat/completions", data={
-        "max_tokens": n_predict,
-        "messages": [
-            {"role": "system", "content": "You are a chatbot that uses tools/functions. Dont overthink things, and provide very concise answers. Do not explain your reasoning to the user. Provide any numerical values back to the user with at most two decimals."},
-            {"role": "user", "content": "What's the y coordinate of a point on the unit sphere at angle 30 degrees?"},
-            {
-                "role": "assistant",
-                "content": None,
-                "tool_calls": [
-                    {
-                        "id": "call_6789",
-                        "type": "function",
-                        "function": {
-                            "name": "calculate",
-                            "arguments": "{\"expression\":\"sin(30 * pi / 180)\"}"
-                        }
-                    }
-                ]
-            },
-            {
-                "role": "tool",
-                "name": "calculate",
-                "content": "0.55644242476",
-                "tool_call_id": "call_6789"
-            }
-        ],
-        "tools": [
-            {
-                "type":"function",
-                "function":{
-                    "name":"calculate",
-                    "description":"A calculator function that computes values of arithmetic expressions in the Python syntax",
-                    "parameters":{
-                        "type":"object",
-                        "properties":{
-                            "expression":{
-                            "type":"string",
-                            "description":"An arithmetic expression to compute the value of (Python syntad, assuming all floats)"
-                            }
-                        },
-                        "required":["expression"]
-                    }
-                }
-            }
-        ]
-    }, timeout=TIMEOUT_HTTP_REQUEST)
-    assert res.status_code == 200, f"Expected status code 200, got {res.status_code}"
-    choice = res.body["choices"][0]
-    tool_calls = choice["message"].get("tool_calls")
-    assert tool_calls is None, f'Expected no tool call in {choice["message"]}'
-    content = choice["message"].get("content")
-    assert content is not None, f'Expected content in {choice["message"]}'
-    if result_override is not None:
-        assert re.match(result_override, content), f'Expected {result_override}, got {content}'
-    else:
-        assert re.match('^[\\s\\S]*?The (y[ -])?coordinate [\\s\\S]*?is (approximately )?0\\.56\\b|^0\\.56$', content), \
-            f'Expected something like "The y coordinate is 0.56.", got {content}'
-
-
-@pytest.mark.slow
-@pytest.mark.parametrize("n_predict,reasoning_format,expect_content,expect_reasoning_content,hf_repo,template_override", [
-    (128, 'deepseek',  "^The sum of 102 and 7 is 109.*",                        None,                                          "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",       None),
-    (128,  None,        "^The sum of 102 and 7 is 109.*",                       None,                                          "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",       None),
-
-    (1024, 'deepseek',  "To find the sum of.*",                                 "I need to calculate the sum of 102 and 7.*",  "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
-    (1024, 'none',      "^I need[\\s\\S]*?</think>\n?To find.*",                None,                                          "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
-
-    (1024, 'deepseek',  "To find the sum of.*",                                 "First, I [\\s\\S]*",                          "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)),
-])
-def test_thoughts(n_predict: int, reasoning_format: Literal['deepseek', 'none'] | None, expect_content: str | None, expect_reasoning_content: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None):
-    global server
-    server.n_slots = 1
-    server.reasoning_format = reasoning_format
-    server.jinja = True
-    server.n_ctx = 8192 * 2
-    server.n_predict = n_predict
-    server.model_hf_repo = hf_repo
-    server.model_hf_file = None
-    if isinstance(template_override, tuple):
-        (template_hf_repo, template_variant) = template_override
-        server.chat_template_file = f"../../../models/templates/{template_hf_repo.replace('/', '-') + ('-' + template_variant if template_variant else '')}.jinja"
-        assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template."
-    elif isinstance(template_override, str):
-        server.chat_template = template_override
-    server.start(timeout_seconds=TIMEOUT_SERVER_START)
-    res = server.make_request("POST", "/chat/completions", data={
-        "max_tokens": n_predict,
-        "messages": [
-            {"role": "user", "content": "What's the sum of 102 and 7?"},
-        ]
-    }, timeout=TIMEOUT_HTTP_REQUEST)
-    assert res.status_code == 200, f"Expected status code 200, got {res.status_code}"
-    choice = res.body["choices"][0]
-    assert choice["message"].get("tool_calls") is None, f'Expected no tool call in {choice["message"]}'
-
-    content = choice["message"].get("content")
-    if expect_content is None:
-        assert content is None, f'Expected no content in {choice["message"]}'
-    else:
-        assert re.match(expect_content, content), f'Expected {expect_content}, got {content}'
-
-    reasoning_content = choice["message"].get("reasoning_content")
-    if expect_reasoning_content is None:
-        assert reasoning_content is None, f'Expected no reasoning content in {choice["message"]}'
-    else:
-        assert re.match(expect_reasoning_content, reasoning_content), f'Expected {expect_reasoning_content}, got {reasoning_content}'
-
-
@pytest.mark.slow
@pytest.mark.parametrize("expected_arguments_override,hf_repo,template_override", [
-    (None,                 "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
-    # (None,                 "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", "chatml"),
-
    (None,                 "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",      None),
    (None,                 "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",      "chatml"),

    (None,                 "bartowski/functionary-small-v3.2-GGUF:Q8_0",       ("meetkai-functionary-medium-v3.2", None)),
    (None,                 "bartowski/functionary-small-v3.2-GGUF:Q8_0",       "chatml"),

-    ('{"code":"print("}',  "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
-    (None,                 "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"),
+    (None,                 "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
+    ('{"code":"print("}',  "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"),

-    (None,                 "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M",      ("meta-llama-Llama-3.2-3B-Instruct", None)),
+    ('{"code":"print("}',  "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M",      ("meta-llama-Llama-3.2-3B-Instruct", None)),
    (None,                 "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M",      "chatml"),

    ('{"code":"print("}',  "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M",      ("meta-llama-Llama-3.2-3B-Instruct", None)),
-    (None,                 "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M",      "chatml"),
+    ('{"code":"print("}',  "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M",      "chatml"),

    (None,                 "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M",        None),
    (None,                 "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M",        "chatml"),
@@ -521,13 +371,15 @@ def test_thoughts(n_predict: int, reasoning_format: Literal['deepseek', 'none']

    # Note: gemma-2-2b-it knows itself as "model", not "assistant", so we don't test the ill-suited chatml on it.
    (None,                 "bartowski/gemma-2-2b-it-GGUF:Q4_K_M",              None),
+
+    # (None,                 "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
 ])
-def test_hello_world(expected_arguments_override: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None):
+def test_hello_world_tool_call(expected_arguments_override: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None):
    global server
    server.n_slots = 1
    server.jinja = True
    server.n_ctx = 8192
-    server.n_predict = 512 # High because of DeepSeek R1
+    server.n_predict = 128
    server.model_hf_repo = hf_repo
    server.model_hf_file = None
    if isinstance(template_override, tuple):
@@ -554,7 +406,6 @@ def test_hello_world(expected_arguments_override: str | None, hf_repo: str, temp
    tool_calls = choice["message"].get("tool_calls")
    assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}'
    tool_call = tool_calls[0]
-    assert choice["message"].get("content") is None, f'Expected no content in {choice["message"]}'
    assert tool_call["function"]["name"] == PYTHON_TOOL["function"]["name"]
    actual_arguments = tool_call["function"]["arguments"]
    if expected_arguments_override is not None:
--- a/examples/server/tests/utils.py
+++ b/examples/server/tests/utils.py
@@ -26,10 +26,7 @@ from re import RegexFlag
 import wget


-DEFAULT_HTTP_TIMEOUT = 12
-
-if "LLAMA_SANITIZE" in os.environ or "GITHUB_ACTION" in os.environ:
-    DEFAULT_HTTP_TIMEOUT = 30
+DEFAULT_HTTP_TIMEOUT = 12 if "LLAMA_SANITIZE" not in os.environ else 30


 class ServerResponse:
@@ -81,7 +78,6 @@ class ServerProcess:
    draft_max: int | None = None
    no_webui: bool | None = None
    jinja: bool | None = None
-    reasoning_format: Literal['deepseek', 'none'] | None = None
    chat_template: str | None = None
    chat_template_file: str | None = None

@@ -176,8 +172,6 @@ class ServerProcess:
            server_args.append("--no-webui")
        if self.jinja:
            server_args.append("--jinja")
-        if self.reasoning_format is not None:
-            server_args.extend(("--reasoning-format", self.reasoning_format))
        if self.chat_template:
            server_args.extend(["--chat-template", self.chat_template])
        if self.chat_template_file:
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -7,14 +7,14 @@

 // increase max payload length to allow use of larger context size
 #define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
-// disable Nagle's algorithm
-#define CPPHTTPLIB_TCP_NODELAY true
 #include "httplib.h"

 // Change JSON_ASSERT from assert() to GGML_ASSERT:
 #define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"
-#include "chat.h"
+#include "minja.hpp"
+#include "chat.hpp"
+#include "chat-template.hpp"

 #include <random>
 #include <sstream>
@@ -347,6 +347,41 @@ static llama_tokens format_infill(
    return embd_inp;
 }

+// Format given chat via the non-Jinja template path; throws std::runtime_error when a message has missing or unsupported 'content'. (Earlier note: "If tmpl is empty, we take the template from model metadata" - not visible in this function, confirm in common_chat_apply_template.)
+inline std::string format_chat(const common_chat_template & tmpl, const std::vector<json> & messages) {
+    std::vector<common_chat_msg> chat; // one {role, content, tool_calls} entry per input message
+
+    for (size_t i = 0; i < messages.size(); ++i) {
+        const auto & curr_msg = messages[i];
+
+        std::string role = json_value(curr_msg, "role", std::string("")); // "" is passed as the default - presumably used when "role" is absent
+
+        std::string content;
+        if (curr_msg.contains("content")) {
+            if (curr_msg["content"].is_string()) { // plain string content is used verbatim
+                content = curr_msg["content"].get<std::string>();
+            } else if (curr_msg["content"].is_array()) { // array content: concatenate the "text" of each part
+                for (const auto & part : curr_msg["content"]) {
+                    if (part.contains("text")) { // parts without a "text" key are skipped
+                        content += "\n" + part["text"].get<std::string>(); // every part is prefixed with a newline, including the first
+                    }
+                }
+            } else { // any other JSON type for "content" is rejected
+                throw std::runtime_error("Invalid 'content' type (ref: https://github.com/ggerganov/llama.cpp/issues/8367)");
+            }
+        } else { // a message without a "content" key is rejected
+            throw std::runtime_error("Missing 'content' (ref: https://github.com/ggerganov/llama.cpp/issues/8367)");
+        }
+
+        chat.push_back({role, content, /* tool_calls= */ {}}); // tool calls are always left empty on this path
+    }
+
+    const auto formatted_chat = common_chat_apply_template(tmpl, chat, true, /* use_jinja= */ false); // NOTE(review): third argument (true) is presumably add_generation_prompt - confirm
+    LOG_DBG("formatted_chat: '%s'\n", formatted_chat.c_str());
+
+    return formatted_chat;
+}
+
 //
 // base64 utils (TODO: move to common in the future)
 //
@@ -521,13 +556,8 @@ static json oaicompat_completion_params_parse(const json & body) {
        throw std::runtime_error("Only one completion choice is allowed");
    }

-    // Handle "echo" field
-    if (json_value(body, "echo", false)) {
-        throw std::runtime_error("Only no echo is supported");
-    }
-
    // Params supported by OAI but unsupported by llama.cpp
-    static const std::vector<std::string> unsupported_params { "best_of", "suffix" };
+    static const std::vector<std::string> unsupported_params { "best_of", "echo", "suffix" };
    for (const auto & param : unsupported_params) {
        if (body.contains(param)) {
            throw std::runtime_error("Unsupported param: " + param);
@@ -548,10 +578,12 @@ static json oaicompat_completion_params_parse(const json & body) {
 static json oaicompat_completion_params_parse(
    const json & body, /* openai api json semantics */
    bool use_jinja,
-    common_reasoning_format reasoning_format,
-    const struct common_chat_templates * tmpls)
+    const common_chat_templates & chat_templates)
 {
    json llama_params;
+    const auto & tmpl = body.contains("tools") && chat_templates.template_tool_use
+        ? *chat_templates.template_tool_use
+        : *chat_templates.template_default;

    auto tools = json_value(body, "tools", json());
    auto stream = json_value(body, "stream", false);
@@ -577,58 +609,61 @@ static json oaicompat_completion_params_parse(
        llama_params["stop"] = json_value(body, "stop", json::array());
    }

-    auto json_schema = json_value(body, "json_schema", json());
-    auto grammar = json_value(body, "grammar", std::string());
-    if (!json_schema.is_null() && !grammar.empty()) {
-        throw std::runtime_error("Cannot use both json_schema and grammar");
-    }
-
    // Handle "response_format" field
    if (body.contains("response_format")) {
        json response_format      = json_value(body, "response_format", json::object());
        std::string response_type = json_value(response_format, "type", std::string());
        if (response_type == "json_object") {
-            json_schema = json_value(response_format, "schema", json::object());
+            llama_params["json_schema"] = json_value(response_format, "schema", json::object());
        } else if (response_type == "json_schema") {
            json json_schema = json_value(response_format, "json_schema", json::object());
-            json_schema = json_value(json_schema, "schema", json::object());
+            llama_params["json_schema"] = json_value(json_schema, "schema", json::object());
        } else if (!response_type.empty() && response_type != "text") {
            throw std::runtime_error("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type);
        }
    }

-    common_chat_templates_inputs inputs;
-    inputs.messages              = common_chat_msgs_parse_oaicompat(body.at("messages"));
-    inputs.tools                 = common_chat_tools_parse_oaicompat(tools);
-    inputs.tool_choice           = common_chat_tool_choice_parse_oaicompat(json_value(body, "tool_choice", std::string("auto")));
-    inputs.json_schema           = json_schema.is_null() ? "" : json_schema.dump();
-    inputs.grammar               = grammar;
-    inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true);
-    inputs.use_jinja             = use_jinja;
-    inputs.parallel_tool_calls   = json_value(body, "parallel_tool_calls", false);
-    inputs.extract_reasoning     = reasoning_format != COMMON_REASONING_FORMAT_NONE;
-    if (!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && body.contains("grammar")) {
-        throw std::runtime_error("Cannot use custom grammar constraints with tools.");
-    }
-
    // Apply chat template to the list of messages
-    auto chat_params = common_chat_templates_apply(tmpls, inputs);
+    if (use_jinja) {
+        auto tool_choice = json_value(body, "tool_choice", std::string("auto"));
+        if (tool_choice != "none" && tool_choice != "auto" && tool_choice != "required") {
+            throw std::runtime_error("Invalid tool_choice: " + tool_choice);
+        }
+        if (tool_choice != "none" && llama_params.contains("grammar")) {
+            throw std::runtime_error("Cannot use custom grammar constraints with tools.");
+        }
+        common_chat_inputs inputs;
+        inputs.messages = body.at("messages");
+        inputs.tools = tools;
+        inputs.tool_choice = tool_choice;
+        inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false);
+        if (inputs.parallel_tool_calls && !tmpl.original_caps().supports_parallel_tool_calls) {
+            LOG_DBG("Disabling parallel_tool_calls because the template does not support it\n");
+            inputs.parallel_tool_calls = false;
+        }
+        inputs.stream = stream;
+        // TODO: support mixing schema w/ tools beyond generic format.
+        inputs.json_schema = json_value(llama_params, "json_schema", json());
+        auto chat_params = common_chat_params_init(tmpl, inputs);

-    llama_params["chat_format"]      = static_cast<int>(chat_params.format);
-    llama_params["prompt"]           = chat_params.prompt;
-    llama_params["grammar"]          = chat_params.grammar;
-    llama_params["grammar_lazy"]     = chat_params.grammar_lazy;
-    auto grammar_triggers = json::array();
-    for (const auto & trigger : chat_params.grammar_triggers) {
-        grammar_triggers.push_back({
-            {"word", trigger.word},
-            {"at_start", trigger.at_start},
-        });
-    }
-    llama_params["grammar_triggers"] = grammar_triggers;
-    llama_params["preserved_tokens"] = chat_params.preserved_tokens;
-    for (const auto & stop : chat_params.additional_stops) {
-        llama_params["stop"].push_back(stop);
+        llama_params["chat_format"] = static_cast<int>(chat_params.format);
+        llama_params["prompt"] = chat_params.prompt;
+        llama_params["grammar"] = chat_params.grammar;
+        llama_params["grammar_lazy"] = chat_params.grammar_lazy;
+        auto grammar_triggers = json::array();
+        for (const auto & trigger : chat_params.grammar_triggers) {
+            grammar_triggers.push_back({
+                {"word", trigger.word},
+                {"at_start", trigger.at_start},
+            });
+        }
+        llama_params["grammar_triggers"] = grammar_triggers;
+        llama_params["preserved_tokens"] = chat_params.preserved_tokens;
+        for (const auto & stop : chat_params.additional_stops) {
+            llama_params["stop"].push_back(stop);
+        }
+    } else {
+        llama_params["prompt"] = format_chat(tmpl, body.at("messages"));
    }

    // Handle "n" field
@@ -700,51 +735,29 @@ static json format_embeddings_response_oaicompat(const json & request, const jso
    return res;
 }

-static json format_response_rerank(
-        const json & request,
-        const json & ranks,
-        bool is_tei_format,
-        std::vector<std::string> & texts) {
-    json res;
-    if (is_tei_format) {
-        // TEI response format
-        res = json::array();
-        bool return_text = json_value(request, "return_text", false);
-        for (const auto & rank : ranks) {
-            int index = json_value(rank, "index", 0);
-            json elem = json{
-                {"index", index},
-                {"score", json_value(rank, "score", 0.0)},
-            };
-            if (return_text) {
-                elem["text"] = std::move(texts[index]);
-            }
-            res.push_back(elem);
-        }
-    } else {
-        // Jina response format
-        json results = json::array();
-        int32_t n_tokens = 0;
-        for (const auto & rank : ranks) {
-            results.push_back(json{
-                {"index",           json_value(rank, "index", 0)},
-                {"relevance_score", json_value(rank, "score", 0.0)},
-            });
+static json format_response_rerank(const json & request, const json & ranks) {
+    json data = json::array(); // one {"index", "relevance_score"} entry per rank
+    int32_t n_tokens = 0;      // sum of every rank's "tokens_evaluated"
+    int i = 0;                 // positional fallback for ranks lacking "index"
+    for (const auto & rank : ranks) {
+        data.push_back(json{
+            {"index",           json_value(rank, "index", i++)},
+            {"relevance_score", json_value(rank, "score", 0.0)},
+        });

-            n_tokens += json_value(rank, "tokens_evaluated", 0);
-        }
-
-        res = json{
-            {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
-            {"object", "list"},
-            {"usage", json{
-                {"prompt_tokens", n_tokens},
-                {"total_tokens", n_tokens}
-            }},
-            {"results", results}
-        };
+        n_tokens += json_value(rank, "tokens_evaluated", 0);
    }

+    json res = json {
+        {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
+        {"object", "list"},
+        {"usage", json {
+            {"prompt_tokens", n_tokens},
+            {"total_tokens", n_tokens}
+        }},
+        {"results", data}
+    };
+
    return res;
 }

--- a/examples/server/webui/package-lock.json
+++ b/examples/server/webui/package-lock.json
@@ -13,7 +13,6 @@
        "@vscode/markdown-it-katex": "^1.1.1",
        "autoprefixer": "^10.4.20",
        "daisyui": "^4.12.14",
-        "dexie": "^4.0.11",
        "highlight.js": "^11.10.0",
        "katex": "^0.16.15",
        "postcss": "^8.4.49",
@@ -2339,12 +2338,6 @@
        "url": "https://github.com/sponsors/wooorm"
      }
    },
-    "node_modules/dexie": {
-      "version": "4.0.11",
-      "resolved": "https://registry.npmjs.org/dexie/-/dexie-4.0.11.tgz",
-      "integrity": "sha512-SOKO002EqlvBYYKQSew3iymBoN2EQ4BDw/3yprjh7kAfFzjBYkaMNa/pZvcA7HSWlcKSQb9XhPe3wKyQ0x4A8A==",
-      "license": "Apache-2.0"
-    },
    "node_modules/didyoumean": {
      "version": "1.2.2",
      "resolved": "https://registry.npmjs.org/didyoumean/-/didyoumean-1.2.2.tgz",
--- a/examples/server/webui/package.json
+++ b/examples/server/webui/package.json
@@ -16,7 +16,6 @@
    "@vscode/markdown-it-katex": "^1.1.1",
    "autoprefixer": "^10.4.20",
    "daisyui": "^4.12.14",
-    "dexie": "^4.0.11",
    "highlight.js": "^11.10.0",
    "katex": "^0.16.15",
    "postcss": "^8.4.49",
--- a/examples/server/webui/src/components/ChatMessage.tsx
+++ b/examples/server/webui/src/components/ChatMessage.tsx
@@ -3,7 +3,6 @@ import { useAppContext } from '../utils/app.context';
 import { Message, PendingMessage } from '../utils/types';
 import { classNames } from '../utils/misc';
 import MarkdownDisplay, { CopyButton } from './MarkdownDisplay';
-import { ChevronLeftIcon, ChevronRightIcon } from '@heroicons/react/24/outline';

 interface SplitMessage {
  content: PendingMessage['content'];
@@ -13,24 +12,17 @@ interface SplitMessage {

 export default function ChatMessage({
  msg,
-  siblingLeafNodeIds,
-  siblingCurrIdx,
  id,
-  onRegenerateMessage,
-  onEditMessage,
-  onChangeSibling,
+  scrollToBottom,
  isPending,
 }: {
  msg: Message | PendingMessage;
-  siblingLeafNodeIds: Message['id'][];
-  siblingCurrIdx: number;
  id?: string;
-  onRegenerateMessage(msg: Message): void;
-  onEditMessage(msg: Message, content: string): void;
-  onChangeSibling(sibling: Message['id']): void;
+  scrollToBottom: (requiresNearBottom: boolean) => void;
  isPending?: boolean;
 }) {
-  const { viewingChat, config } = useAppContext();
+  const { viewingConversation, replaceMessageAndGenerate, config } =
+    useAppContext();
  const [editingContent, setEditingContent] = useState<string | null>(null);
  const timings = useMemo(
    () =>
@@ -45,8 +37,6 @@ export default function ChatMessage({
        : null,
    [msg.timings]
  );
-  const nextSibling = siblingLeafNodeIds[siblingCurrIdx + 1];
-  const prevSibling = siblingLeafNodeIds[siblingCurrIdx - 1];

  // for reasoning model, we split the message into content and thought
  // TODO: implement this as remark/rehype plugin in the future
@@ -74,7 +64,13 @@ export default function ChatMessage({
    return { content: actualContent, thought, isThinking };
  }, [msg]);

-  if (!viewingChat) return null;
+  if (!viewingConversation) return null;
+  // Regenerate this message, scrolling on each chunk if the view is near bottom.
+  const regenerate = async () => {
+    const autoScroll = () => scrollToBottom(true);
+    const convId = viewingConversation.id;
+    replaceMessageAndGenerate(convId, msg.id, undefined, autoScroll);
+  };

  return (
    <div className="group" id={id}>
@@ -109,12 +105,13 @@ export default function ChatMessage({
              </button>
              <button
                className="btn mt-2"
-                onClick={() => {
-                  if (msg.content !== null) {
-                    setEditingContent(null);
-                    onEditMessage(msg as Message, editingContent);
-                  }
-                }}
+                onClick={() =>
+                  replaceMessageAndGenerate(
+                    viewingConversation.id,
+                    msg.id,
+                    editingContent
+                  )
+                }
              >
                Submit
              </button>
@@ -159,35 +156,6 @@ export default function ChatMessage({
                        </div>
                      </details>
                    )}
-
-                    {msg.extra && msg.extra.length > 0 && (
-                      <details
-                        className={classNames({
-                          'collapse collapse-arrow mb-4 bg-base-200': true,
-                          'bg-opacity-10': msg.role !== 'assistant',
-                        })}
-                      >
-                        <summary className="collapse-title">
-                          Extra content
-                        </summary>
-                        <div className="collapse-content">
-                          {msg.extra.map(
-                            (extra, i) =>
-                              extra.type === 'textFile' ? (
-                                <div key={extra.name}>
-                                  <b>{extra.name}</b>
-                                  <pre>{extra.content}</pre>
-                                </div>
-                              ) : extra.type === 'context' ? (
-                                <div key={i}>
-                                  <pre>{extra.content}</pre>
-                                </div>
-                              ) : null // TODO: support other extra types
-                          )}
-                        </div>
-                      </details>
-                    )}
-
                    <MarkdownDisplay
                      content={content}
                      isGenerating={isPending}
@@ -228,35 +196,10 @@ export default function ChatMessage({
      {msg.content !== null && (
        <div
          className={classNames({
-            'flex items-center gap-2 mx-4 mt-2 mb-2': true,
-            'flex-row-reverse': msg.role === 'user',
+            'mx-4 mt-2 mb-2': true,
+            'text-right': msg.role === 'user',
          })}
        >
-          {siblingLeafNodeIds && siblingLeafNodeIds.length > 1 && (
-            <div className="flex gap-1 items-center opacity-60 text-sm">
-              <button
-                className={classNames({
-                  'btn btn-sm btn-ghost p-1': true,
-                  'opacity-20': !prevSibling,
-                })}
-                onClick={() => prevSibling && onChangeSibling(prevSibling)}
-              >
-                <ChevronLeftIcon className="h-4 w-4" />
-              </button>
-              <span>
-                {siblingCurrIdx + 1} / {siblingLeafNodeIds.length}
-              </span>
-              <button
-                className={classNames({
-                  'btn btn-sm btn-ghost p-1': true,
-                  'opacity-20': !nextSibling,
-                })}
-                onClick={() => nextSibling && onChangeSibling(nextSibling)}
-              >
-                <ChevronRightIcon className="h-4 w-4" />
-              </button>
-            </div>
-          )}
          {/* user message */}
          {msg.role === 'user' && (
            <button
@@ -273,22 +216,18 @@ export default function ChatMessage({
              {!isPending && (
                <button
                  className="badge btn-mini show-on-hover mr-2"
-                  onClick={() => {
-                    if (msg.content !== null) {
-                      onRegenerateMessage(msg as Message);
-                    }
-                  }}
+                  onClick={regenerate}
                  disabled={msg.content === null}
                >
                  🔄 Regenerate
                </button>
              )}
+              <CopyButton
+                className="badge btn-mini show-on-hover mr-2"
+                content={msg.content}
+              />
            </>
          )}
-          <CopyButton
-            className="badge btn-mini show-on-hover mr-2"
-            content={msg.content}
-          />
        </div>
      )}
    </div>
--- a/examples/server/webui/src/components/ChatScreen.tsx
+++ b/examples/server/webui/src/components/ChatScreen.tsx
@@ -1,78 +1,28 @@
-import { useEffect, useMemo, useRef, useState } from 'react';
-import { CallbackGeneratedChunk, useAppContext } from '../utils/app.context';
-import ChatMessage from './ChatMessage';
-import { CanvasType, Message, PendingMessage } from '../utils/types';
-import { classNames, cleanCurrentUrl, throttle } from '../utils/misc';
-import CanvasPyInterpreter from './CanvasPyInterpreter';
+import { useEffect, useState } from 'react';
+import { useAppContext } from '../utils/app.context';
 import StorageUtils from '../utils/storage';
-import { useVSCodeContext } from '../utils/llama-vscode';
+import { useNavigate } from 'react-router';
+import ChatMessage from './ChatMessage';
+import { CanvasType, PendingMessage } from '../utils/types';
+import { classNames } from '../utils/misc';
+import CanvasPyInterpreter from './CanvasPyInterpreter';

-/**
- * A message display is a message node with additional information for rendering.
- * For example, siblings of the message node are stored as their last node (aka leaf node).
- */
-export interface MessageDisplay {
-  msg: Message | PendingMessage;
-  siblingLeafNodeIds: Message['id'][];
-  siblingCurrIdx: number;
-  isPending?: boolean;
-}
+export default function ChatScreen() {
+  const {
+    viewingConversation,
+    sendMessage,
+    isGenerating,
+    stopGenerating,
+    pendingMessages,
+    canvasData,
+  } = useAppContext();
+  const [inputMsg, setInputMsg] = useState('');
+  const navigate = useNavigate();

-/**
- * If the current URL contains "?m=...", prefill the message input with the value.
- * If the current URL contains "?q=...", prefill and SEND the message.
- */
-const prefilledMsg = {
-  content() {
-    const url = new URL(window.location.href);
-    return url.searchParams.get('m') ?? url.searchParams.get('q') ?? '';
-  },
-  shouldSend() {
-    const url = new URL(window.location.href);
-    return url.searchParams.has('q');
-  },
-  clear() {
-    cleanCurrentUrl(['m', 'q']);
-  },
-};
+  const currConvId = viewingConversation?.id ?? '';
+  const pendingMsg: PendingMessage | undefined = pendingMessages[currConvId];

-function getListMessageDisplay(
-  msgs: Readonly<Message[]>,
-  leafNodeId: Message['id']
-): MessageDisplay[] {
-  const currNodes = StorageUtils.filterByLeafNodeId(msgs, leafNodeId, true);
-  const res: MessageDisplay[] = [];
-  const nodeMap = new Map<Message['id'], Message>();
-  for (const msg of msgs) {
-    nodeMap.set(msg.id, msg);
-  }
-  // find leaf node from a message node
-  const findLeafNode = (msgId: Message['id']): Message['id'] => {
-    let currNode: Message | undefined = nodeMap.get(msgId);
-    while (currNode) {
-      if (currNode.children.length === 0) break;
-      currNode = nodeMap.get(currNode.children.at(-1) ?? -1);
-    }
-    return currNode?.id ?? -1;
-  };
-  // traverse the current nodes
-  for (const msg of currNodes) {
-    const parentNode = nodeMap.get(msg.parent ?? -1);
-    if (!parentNode) continue;
-    const siblings = parentNode.children;
-    if (msg.type !== 'root') {
-      res.push({
-        msg,
-        siblingLeafNodeIds: siblings.map(findLeafNode),
-        siblingCurrIdx: siblings.indexOf(msg.id),
-      });
-    }
-  }
-  return res;
-}
-
-const scrollToBottom = throttle(
-  (requiresNearBottom: boolean, delay: number = 80) => {
+  const scrollToBottom = (requiresNearBottom: boolean) => {
    const mainScrollElem = document.getElementById('main-scroll');
    if (!mainScrollElem) return;
    const spaceToBottom =
@@ -82,143 +32,36 @@ const scrollToBottom = throttle(
    if (!requiresNearBottom || spaceToBottom < 50) {
      setTimeout(
        () => mainScrollElem.scrollTo({ top: mainScrollElem.scrollHeight }),
-        delay
+        1
      );
    }
-  },
-  80
-);
-
-export default function ChatScreen() {
-  const {
-    viewingChat,
-    sendMessage,
-    isGenerating,
-    stopGenerating,
-    pendingMessages,
-    canvasData,
-    replaceMessageAndGenerate,
-  } = useAppContext();
-  const [inputMsg, setInputMsg] = useState(prefilledMsg.content());
-  const inputRef = useRef<HTMLTextAreaElement>(null);
-
-  const { extraContext, clearExtraContext } = useVSCodeContext(
-    inputRef,
-    setInputMsg
-  );
-  // TODO: improve this when we have "upload file" feature
-  const currExtra: Message['extra'] = extraContext ? [extraContext] : undefined;
-
-  // keep track of leaf node for rendering
-  const [currNodeId, setCurrNodeId] = useState<number>(-1);
-  const messages: MessageDisplay[] = useMemo(() => {
-    if (!viewingChat) return [];
-    else return getListMessageDisplay(viewingChat.messages, currNodeId);
-  }, [currNodeId, viewingChat]);
-
-  const currConvId = viewingChat?.conv.id ?? null;
-  const pendingMsg: PendingMessage | undefined =
-    pendingMessages[currConvId ?? ''];
-
-  useEffect(() => {
-    // reset to latest node when conversation changes
-    setCurrNodeId(-1);
-    // scroll to bottom when conversation changes
-    scrollToBottom(false, 1);
-  }, [currConvId]);
-
-  const onChunk: CallbackGeneratedChunk = (currLeafNodeId?: Message['id']) => {
-    if (currLeafNodeId) {
-      setCurrNodeId(currLeafNodeId);
-    }
-    scrollToBottom(true);
  };

+  // scroll to bottom when conversation changes
+  useEffect(() => {
+    scrollToBottom(false);
+  }, [viewingConversation?.id]);
+
  const sendNewMessage = async () => {
-    if (inputMsg.trim().length === 0 || isGenerating(currConvId ?? '')) return;
+    if (inputMsg.trim().length === 0 || isGenerating(currConvId)) return;
+    const convId = viewingConversation?.id ?? StorageUtils.getNewConvId();
    const lastInpMsg = inputMsg;
    setInputMsg('');
+    if (!viewingConversation) {
+      // if user is creating a new conversation, redirect to the new conversation
+      navigate(`/chat/${convId}`);
+    }
    scrollToBottom(false);
-    setCurrNodeId(-1);
-    // get the last message node
-    const lastMsgNodeId = messages.at(-1)?.msg.id ?? null;
-    if (
-      !(await sendMessage(
-        currConvId,
-        lastMsgNodeId,
-        inputMsg,
-        currExtra,
-        onChunk
-      ))
-    ) {
+    // auto scroll as message is being generated
+    const onChunk = () => scrollToBottom(true);
+    if (!(await sendMessage(convId, inputMsg, onChunk))) {
      // restore the input message if failed
      setInputMsg(lastInpMsg);
    }
-    // OK
-    clearExtraContext();
-  };
-
-  const handleEditMessage = async (msg: Message, content: string) => {
-    if (!viewingChat) return;
-    setCurrNodeId(msg.id);
-    scrollToBottom(false);
-    await replaceMessageAndGenerate(
-      viewingChat.conv.id,
-      msg.parent,
-      content,
-      msg.extra,
-      onChunk
-    );
-    setCurrNodeId(-1);
-    scrollToBottom(false);
-  };
-
-  const handleRegenerateMessage = async (msg: Message) => {
-    if (!viewingChat) return;
-    setCurrNodeId(msg.parent);
-    scrollToBottom(false);
-    await replaceMessageAndGenerate(
-      viewingChat.conv.id,
-      msg.parent,
-      null,
-      msg.extra,
-      onChunk
-    );
-    setCurrNodeId(-1);
-    scrollToBottom(false);
  };

  const hasCanvas = !!canvasData;

-  useEffect(() => {
-    if (prefilledMsg.shouldSend()) {
-      // send the prefilled message if needed
-      sendNewMessage();
-    } else {
-      // otherwise, focus on the input and move the cursor to the end
-      if (inputRef.current) {
-        inputRef.current.focus();
-        inputRef.current.selectionStart = inputRef.current.value.length;
-      }
-    }
-    prefilledMsg.clear();
-    // no need to keep track of sendNewMessage
-    // eslint-disable-next-line react-hooks/exhaustive-deps
-  }, [inputRef]);
-
-  // due to some timing issues of StorageUtils.appendMsg(), we need to make sure the pendingMsg is not duplicated upon rendering (i.e. appears once in the saved conversation and once in the pendingMsg)
-  const pendingMsgDisplay: MessageDisplay[] =
-    pendingMsg && messages.at(-1)?.msg.id !== pendingMsg.id
-      ? [
-          {
-            msg: pendingMsg,
-            siblingLeafNodeIds: [],
-            siblingCurrIdx: 0,
-            isPending: true,
-          },
-        ]
-      : [];
-
  return (
    <div
      className={classNames({
@@ -238,19 +81,24 @@ export default function ChatScreen() {
        <div id="messages-list" className="grow">
          <div className="mt-auto flex justify-center">
            {/* placeholder to shift the message to the bottom */}
-            {viewingChat ? '' : 'Send a message to start'}
+            {viewingConversation ? '' : 'Send a message to start'}
          </div>
-          {[...messages, ...pendingMsgDisplay].map((msg) => (
+          {viewingConversation?.messages.map((msg) => (
            <ChatMessage
-              key={msg.msg.id}
-              msg={msg.msg}
-              siblingLeafNodeIds={msg.siblingLeafNodeIds}
-              siblingCurrIdx={msg.siblingCurrIdx}
-              onRegenerateMessage={handleRegenerateMessage}
-              onEditMessage={handleEditMessage}
-              onChangeSibling={setCurrNodeId}
+              key={msg.id}
+              msg={msg}
+              scrollToBottom={scrollToBottom}
            />
          ))}
+
+          {pendingMsg && (
+            <ChatMessage
+              msg={pendingMsg}
+              scrollToBottom={scrollToBottom}
+              isPending
+              id="pending-msg"
+            />
+          )}
        </div>

        {/* chat input */}
@@ -258,11 +106,9 @@ export default function ChatScreen() {
          <textarea
            className="textarea textarea-bordered w-full"
            placeholder="Type a message (Shift+Enter to add a new line)"
-            ref={inputRef}
            value={inputMsg}
            onChange={(e) => setInputMsg(e.target.value)}
            onKeyDown={(e) => {
-              if (e.nativeEvent.isComposing || e.keyCode === 229) return;
              if (e.key === 'Enter' && e.shiftKey) return;
              if (e.key === 'Enter' && !e.shiftKey) {
                e.preventDefault();
@@ -272,10 +118,10 @@ export default function ChatScreen() {
            id="msg-input"
            dir="auto"
          ></textarea>
-          {isGenerating(currConvId ?? '') ? (
+          {isGenerating(currConvId) ? (
            <button
              className="btn btn-neutral ml-2"
-              onClick={() => stopGenerating(currConvId ?? '')}
+              onClick={() => stopGenerating(currConvId)}
            >
              Stop
            </button>
--- a/examples/server/webui/src/components/Header.tsx
+++ b/examples/server/webui/src/components/Header.tsx
@@ -25,12 +25,12 @@ export default function Header() {
    );
  }, [selectedTheme]);

-  const { isGenerating, viewingChat } = useAppContext();
-  const isCurrConvGenerating = isGenerating(viewingChat?.conv.id ?? '');
+  const { isGenerating, viewingConversation } = useAppContext();
+  const isCurrConvGenerating = isGenerating(viewingConversation?.id ?? '');

  const removeConversation = () => {
-    if (isCurrConvGenerating || !viewingChat) return;
-    const convId = viewingChat?.conv.id;
+    if (isCurrConvGenerating || !viewingConversation) return;
+    const convId = viewingConversation.id;
    if (window.confirm('Are you sure to delete this conversation?')) {
      StorageUtils.remove(convId);
      navigate('/');
@@ -38,9 +38,9 @@ export default function Header() {
  };

  const downloadConversation = () => {
-    if (isCurrConvGenerating || !viewingChat) return;
-    const convId = viewingChat?.conv.id;
-    const conversationJson = JSON.stringify(viewingChat, null, 2);
+    if (isCurrConvGenerating || !viewingConversation) return;
+    const convId = viewingConversation.id;
+    const conversationJson = JSON.stringify(viewingConversation, null, 2);
    const blob = new Blob([conversationJson], { type: 'application/json' });
    const url = URL.createObjectURL(blob);
    const a = document.createElement('a');
@@ -75,41 +75,38 @@ export default function Header() {

      {/* action buttons (top right) */}
      <div className="flex items-center">
-        {viewingChat && (
-          <div className="dropdown dropdown-end">
-            {/* "..." button */}
-            <button
-              tabIndex={0}
-              role="button"
-              className="btn m-1"
-              disabled={isCurrConvGenerating}
+        <div className={viewingConversation ? 'dropdown dropdown-end' : 'hidden'}>
+          {/* "..." button */}
+          <button
+            tabIndex={0}
+            role="button"
+            className="btn m-1"
+            disabled={isCurrConvGenerating}
+          >
+            <svg
+              xmlns="http://www.w3.org/2000/svg"
+              width="16"
+              height="16"
+              fill="currentColor"
+              className="bi bi-three-dots-vertical"
+              viewBox="0 0 16 16"
            >
-              <svg
-                xmlns="http://www.w3.org/2000/svg"
-                width="16"
-                height="16"
-                fill="currentColor"
-                className="bi bi-three-dots-vertical"
-                viewBox="0 0 16 16"
-              >
-                <path d="M9.5 13a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0m0-5a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0m0-5a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0" />
-              </svg>
-            </button>
-            {/* dropdown menu */}
-            <ul
-              tabIndex={0}
-              className="dropdown-content menu bg-base-100 rounded-box z-[1] w-52 p-2 shadow"
-            >
-              <li onClick={downloadConversation}>
-                <a>Download</a>
-              </li>
-              <li className="text-error" onClick={removeConversation}>
-                <a>Delete</a>
-              </li>
-            </ul>
-          </div>
-        )}
-
+              <path d="M9.5 13a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0m0-5a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0m0-5a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0" />
+            </svg>
+          </button>
+          {/* dropdown menu */}
+          <ul
+            tabIndex={0}
+            className="dropdown-content menu bg-base-100 rounded-box z-[1] w-52 p-2 shadow"
+          >
+            <li onClick={downloadConversation}>
+              <a>Download</a>
+            </li>
+            <li className="text-error" onClick={removeConversation}>
+              <a>Delete</a>
+            </li>
+          </ul>
+        </div>
        <div className="tooltip tooltip-bottom" data-tip="Settings">
          <button className="btn" onClick={() => setShowSettings(true)}>
            {/* settings button */}
--- a/examples/server/webui/src/components/SettingDialog.tsx
+++ b/examples/server/webui/src/components/SettingDialog.tsx
@@ -148,13 +148,13 @@ const SETTING_SECTIONS: SettingSection[] = [
    fields: [
      {
        type: SettingInputType.CHECKBOX,
-        label: 'Expand thought process by default when generating messages',
+        label: 'Expand thought process by default when generating messages',
        key: 'showThoughtInProgress',
      },
      {
        type: SettingInputType.CHECKBOX,
        label:
-          'Exclude thought process when sending requests to API (Recommended for DeepSeek-R1)',
+          'Exclude thought process when sending request to API (Recommended for DeepSeek-R1)',
        key: 'excludeThoughtOnReq',
      },
    ],
@@ -247,7 +247,7 @@ const SETTING_SECTIONS: SettingSection[] = [
              This feature uses{' '}
              <OpenInNewTab href="https://pyodide.org">pyodide</OpenInNewTab>,
              downloaded from CDN. To use this feature, ask the LLM to generate
-              Python code inside a Markdown code block. You will see a "Run"
+              python code inside a markdown code block. You will see a "Run"
              button on the code block, near the "Copy" button.
            </small>
          </>
@@ -274,7 +274,7 @@ export default function SettingDialog({
  );

  const resetConfig = () => {
-    if (window.confirm('Are you sure you want to reset all settings?')) {
+    if (window.confirm('Are you sure to reset all settings?')) {
      setLocalConfig(CONFIG_DEFAULT);
    }
  };
@@ -296,9 +296,9 @@ export default function SettingDialog({
          return;
        }
      } else if (mustBeNumeric) {
-        const trimmedValue = value.toString().trim();
-        const numVal = Number(trimmedValue);
-        if (isNaN(numVal) || !isNumeric(numVal) || trimmedValue.length === 0) {
+        const trimmedValue = value.toString().trim();
+        const numVal = Number(trimmedValue);
+        if (isNaN(numVal) || !isNumeric(numVal) || trimmedValue.length === 0) {
          alert(`Value for ${key} must be numeric`);
          return;
        }
--- a/examples/server/webui/src/components/Sidebar.tsx
+++ b/examples/server/webui/src/components/Sidebar.tsx
@@ -1,4 +1,4 @@
-import { useEffect, useState } from 'react';
+import { useEffect, useMemo, useState } from 'react';
 import { classNames } from '../utils/misc';
 import { Conversation } from '../utils/types';
 import StorageUtils from '../utils/storage';
@@ -7,17 +7,16 @@ import { useNavigate, useParams } from 'react-router';
 export default function Sidebar() {
  const params = useParams();
  const navigate = useNavigate();
+  const currConv = useMemo(
+    () => StorageUtils.getOneConversation(params.convId ?? ''),
+    [params.convId]
+  );

  const [conversations, setConversations] = useState<Conversation[]>([]);
-  const [currConv, setCurrConv] = useState<Conversation | null>(null);

  useEffect(() => {
-    StorageUtils.getOneConversation(params.convId ?? '').then(setCurrConv);
-  }, [params.convId]);
-
-  useEffect(() => {
-    const handleConversationChange = async () => {
-      setConversations(await StorageUtils.getAllConversations());
+    const handleConversationChange = () => {
+      setConversations(StorageUtils.getAllConversations());
    };
    StorageUtils.onConversationChanged(handleConversationChange);
    handleConversationChange();
@@ -83,11 +82,11 @@ export default function Sidebar() {
              onClick={() => navigate(`/chat/${conv.id}`)}
              dir="auto"
            >
-              <span className="truncate">{conv.name}</span>
+              <span className="truncate">{conv.messages[0].content}</span>
            </div>
          ))}
          <div className="text-center text-xs opacity-40 mt-auto mx-4">
-            Conversations are saved to browser's IndexedDB
+            Conversations are saved to browser's localStorage
          </div>
        </div>
      </div>
--- a/examples/server/webui/src/utils/app.context.tsx
+++ b/examples/server/webui/src/utils/app.context.tsx
@@ -5,7 +5,6 @@ import {
  Conversation,
  Message,
  PendingMessage,
-  ViewingChat,
 } from './types';
 import StorageUtils from './storage';
 import {
@@ -14,27 +13,24 @@ import {
  getSSEStreamAsync,
 } from './misc';
 import { BASE_URL, CONFIG_DEFAULT, isDev } from '../Config';
-import { matchPath, useLocation, useNavigate } from 'react-router';
+import { matchPath, useLocation } from 'react-router';

 interface AppContextValue {
  // conversations and messages
-  viewingChat: ViewingChat | null;
+  viewingConversation: Conversation | null;
  pendingMessages: Record<Conversation['id'], PendingMessage>;
  isGenerating: (convId: string) => boolean;
  sendMessage: (
-    convId: string | null,
-    leafNodeId: Message['id'] | null,
+    convId: string,
    content: string,
-    extra: Message['extra'],
-    onChunk: CallbackGeneratedChunk
+    onChunk?: CallbackGeneratedChunk
  ) => Promise<boolean>;
  stopGenerating: (convId: string) => void;
  replaceMessageAndGenerate: (
    convId: string,
-    parentNodeId: Message['id'], // the parent node of the message to be replaced
-    content: string | null,
-    extra: Message['extra'],
-    onChunk: CallbackGeneratedChunk
+    origMsgId: Message['id'],
+    content?: string,
+    onChunk?: CallbackGeneratedChunk
  ) => Promise<void>;

  // canvas
@@ -48,33 +44,23 @@ interface AppContextValue {
  setShowSettings: (show: boolean) => void;
 }

-// this callback is used for scrolling to the bottom of the chat and switching to the last node
-export type CallbackGeneratedChunk = (currLeafNodeId?: Message['id']) => void;
+// for now, this callback is only used for scrolling to the bottom of the chat
+type CallbackGeneratedChunk = () => void;

 // eslint-disable-next-line @typescript-eslint/no-explicit-any
 const AppContext = createContext<AppContextValue>({} as any);

-const getViewingChat = async (convId: string): Promise<ViewingChat | null> => {
-  const conv = await StorageUtils.getOneConversation(convId);
-  if (!conv) return null;
-  return {
-    conv: conv,
-    // all messages from all branches, not filtered by last node
-    messages: await StorageUtils.getMessages(convId),
-  };
-};
-
 export const AppContextProvider = ({
  children,
 }: {
  children: React.ReactElement;
 }) => {
  const { pathname } = useLocation();
-  const navigate = useNavigate();
  const params = matchPath('/chat/:convId', pathname);
  const convId = params?.params?.convId;

-  const [viewingChat, setViewingChat] = useState<ViewingChat | null>(null);
+  const [viewingConversation, setViewingConversation] =
+    useState<Conversation | null>(null);
  const [pendingMessages, setPendingMessages] = useState<
    Record<Conversation['id'], PendingMessage>
  >({});
@@ -89,12 +75,12 @@ export const AppContextProvider = ({
  useEffect(() => {
    // also reset the canvas data
    setCanvasData(null);
-    const handleConversationChange = async (changedConvId: string) => {
+    const handleConversationChange = (changedConvId: string) => {
      if (changedConvId !== convId) return;
-      setViewingChat(await getViewingChat(changedConvId));
+      setViewingConversation(StorageUtils.getOneConversation(convId));
    };
    StorageUtils.onConversationChanged(handleConversationChange);
-    getViewingChat(convId ?? '').then(setViewingChat);
+    setViewingConversation(StorageUtils.getOneConversation(convId ?? ''));
    return () => {
      StorageUtils.offConversationChanged(handleConversationChange);
    };
@@ -132,39 +118,23 @@ export const AppContextProvider = ({

  const generateMessage = async (
    convId: string,
-    leafNodeId: Message['id'],
-    onChunk: CallbackGeneratedChunk
+    onChunk?: CallbackGeneratedChunk
  ) => {
    if (isGenerating(convId)) return;

    const config = StorageUtils.getConfig();
-    const currConversation = await StorageUtils.getOneConversation(convId);
+    const currConversation = StorageUtils.getOneConversation(convId);
    if (!currConversation) {
      throw new Error('Current conversation is not found');
    }

-    const currMessages = StorageUtils.filterByLeafNodeId(
-      await StorageUtils.getMessages(convId),
-      leafNodeId,
-      false
-    );
    const abortController = new AbortController();
    setAbort(convId, abortController);

-    if (!currMessages) {
-      throw new Error('Current messages are not found');
-    }
-
-    const pendingId = Date.now() + 1;
    let pendingMsg: PendingMessage = {
-      id: pendingId,
-      convId,
-      type: 'text',
-      timestamp: pendingId,
+      id: Date.now() + 1,
      role: 'assistant',
      content: null,
-      parent: leafNodeId,
-      children: [],
    };
    setPending(convId, pendingMsg);

@@ -174,7 +144,7 @@ export const AppContextProvider = ({
        ...(config.systemMessage.length === 0
          ? []
          : [{ role: 'system', content: config.systemMessage } as APIMessage]),
-        ...normalizeMsgsForAPI(currMessages),
+        ...normalizeMsgsForAPI(currConversation?.messages ?? []),
      ];
      if (config.excludeThoughtOnReq) {
        messages = filterThoughtFromMsgs(messages);
@@ -235,7 +205,8 @@ export const AppContextProvider = ({
        const lastContent = pendingMsg.content || '';
        if (addedContent) {
          pendingMsg = {
-            ...pendingMsg,
+            id: pendingMsg.id,
+            role: 'assistant',
            content: lastContent + addedContent,
          };
        }
@@ -250,7 +221,7 @@ export const AppContextProvider = ({
          };
        }
        setPending(convId, pendingMsg);
-        onChunk(); // don't need to switch node for pending message
+        onChunk?.();
      }
    } catch (err) {
      setPending(convId, null);
@@ -265,55 +236,37 @@ export const AppContextProvider = ({
      }
    }

-    if (pendingMsg.content !== null) {
-      await StorageUtils.appendMsg(pendingMsg as Message, leafNodeId);
+    if (pendingMsg.content) {
+      StorageUtils.appendMsg(currConversation.id, {
+        id: pendingMsg.id,
+        content: pendingMsg.content,
+        role: pendingMsg.role,
+        timings: pendingMsg.timings,
+      });
    }
    setPending(convId, null);
-    onChunk(pendingId); // trigger scroll to bottom and switch to the last node
+    onChunk?.(); // trigger scroll to bottom
  };

  const sendMessage = async (
-    convId: string | null,
-    leafNodeId: Message['id'] | null,
+    convId: string,
    content: string,
-    extra: Message['extra'],
-    onChunk: CallbackGeneratedChunk
+    onChunk?: CallbackGeneratedChunk
  ): Promise<boolean> => {
-    if (isGenerating(convId ?? '') || content.trim().length === 0) return false;
+    if (isGenerating(convId) || content.trim().length === 0) return false;

-    if (convId === null || convId.length === 0 || leafNodeId === null) {
-      const conv = await StorageUtils.createConversation(
-        content.substring(0, 256)
-      );
-      convId = conv.id;
-      leafNodeId = conv.currNode;
-      // if user is creating a new conversation, redirect to the new conversation
-      navigate(`/chat/${convId}`);
-    }
-
-    const now = Date.now();
-    const currMsgId = now;
-    StorageUtils.appendMsg(
-      {
-        id: currMsgId,
-        timestamp: now,
-        type: 'text',
-        convId,
-        role: 'user',
-        content,
-        extra,
-        parent: leafNodeId,
-        children: [],
-      },
-      leafNodeId
-    );
-    onChunk(currMsgId);
+    StorageUtils.appendMsg(convId, {
+      id: Date.now(),
+      role: 'user',
+      content,
+    });

    try {
-      await generateMessage(convId, currMsgId, onChunk);
+      await generateMessage(convId, onChunk);
      return true;
    } catch (_) {
-      // TODO: rollback
+      // rollback
+      StorageUtils.popMsg(convId);
    }
    return false;
  };
@@ -326,35 +279,22 @@ export const AppContextProvider = ({
  // if content is undefined, we remove last assistant message
  const replaceMessageAndGenerate = async (
    convId: string,
-    parentNodeId: Message['id'], // the parent node of the message to be replaced
-    content: string | null,
-    extra: Message['extra'],
-    onChunk: CallbackGeneratedChunk
+    origMsgId: Message['id'],
+    content?: string,
+    onChunk?: CallbackGeneratedChunk
  ) => {
    if (isGenerating(convId)) return;

-    if (content !== null) {
-      const now = Date.now();
-      const currMsgId = now;
-      StorageUtils.appendMsg(
-        {
-          id: currMsgId,
-          timestamp: now,
-          type: 'text',
-          convId,
-          role: 'user',
-          content,
-          extra,
-          parent: parentNodeId,
-          children: [],
-        },
-        parentNodeId
-      );
-      parentNodeId = currMsgId;
+    StorageUtils.filterAndKeepMsgs(convId, (msg) => msg.id < origMsgId);
+    if (content) {
+      StorageUtils.appendMsg(convId, {
+        id: Date.now(),
+        role: 'user',
+        content,
+      });
    }
-    onChunk(parentNodeId);

-    await generateMessage(convId, parentNodeId, onChunk);
+    await generateMessage(convId, onChunk);
  };

  const saveConfig = (config: typeof CONFIG_DEFAULT) => {
@@ -366,7 +306,7 @@ export const AppContextProvider = ({
    <AppContext.Provider
      value={{
        isGenerating,
-        viewingChat,
+        viewingConversation,
        pendingMessages,
        sendMessage,
        stopGenerating,
--- a/examples/server/webui/src/utils/llama-vscode.ts
+++ b/examples/server/webui/src/utils/llama-vscode.ts
@@ -1,62 +0,0 @@
-import { useEffect, useState } from 'react';
-import { MessageExtraContext } from './types';
-
-// Extra context when using llama.cpp WebUI from llama-vscode, inside an iframe
-// Ref: https://github.com/ggml-org/llama.cpp/pull/11940
-
-interface SetTextEvData {
-  text: string;
-  context: string;
-}
-
-/**
- * To test it:
- * window.postMessage({ command: 'setText', text: 'Spot the syntax error', context: 'def test()\n  return 123' }, '*');
- */
-
-export const useVSCodeContext = (
-  inputRef: React.RefObject<HTMLTextAreaElement>,
-  setInputMsg: (text: string) => void
-) => {
-  const [extraContext, setExtraContext] = useState<MessageExtraContext | null>(
-    null
-  );
-
-  // Accept setText message from a parent window and set inputMsg and extraContext
-  useEffect(() => {
-    const handleMessage = (event: MessageEvent) => {
-      if (event.data?.command === 'setText') {
-        const data: SetTextEvData = event.data;
-        setInputMsg(data?.text);
-        if (data?.context && data.context.length > 0) {
-          setExtraContext({
-            type: 'context',
-            content: data.context,
-          });
-        }
-        inputRef.current?.focus();
-      }
-    };
-
-    window.addEventListener('message', handleMessage);
-    return () => window.removeEventListener('message', handleMessage);
-  }, [inputRef, setInputMsg]);
-
-  // Add a keydown listener that sends the "escapePressed" message to the parent window
-  useEffect(() => {
-    const handleKeyDown = (event: KeyboardEvent) => {
-      if (event.key === 'Escape') {
-        window.parent.postMessage({ command: 'escapePressed' }, '*');
-      }
-    };
-
-    window.addEventListener('keydown', handleKeyDown);
-    return () => window.removeEventListener('keydown', handleKeyDown);
-  }, []);
-
-  return {
-    extraContext,
-    // call once the user message is sent, to clear the extra context
-    clearExtraContext: () => setExtraContext(null),
-  };
-};
--- a/examples/server/webui/src/utils/misc.ts
+++ b/examples/server/webui/src/utils/misc.ts
@@ -4,6 +4,7 @@ import { APIMessage, Message } from './types';

 // ponyfill for missing ReadableStream asyncIterator on Safari
 import { asyncIterator } from '@sec-ant/readable-stream/ponyfill/asyncIterator';
+import { isDev } from '../Config';

 // eslint-disable-next-line @typescript-eslint/no-explicit-any
 export const isString = (x: any) => !!x.toLowerCase;
@@ -22,7 +23,7 @@ export async function* getSSEStreamAsync(fetchResponse: Response) {
    .pipeThrough(new TextLineStream());
  // @ts-expect-error asyncIterator complains about type, but it should work
  for await (const line of asyncIterator(lines)) {
-    //if (isDev) console.log({ line });
+    if (isDev) console.log({ line });
    if (line.startsWith('data:') && !line.endsWith('[DONE]')) {
      const data = JSON.parse(line.slice(5));
      yield data;
@@ -53,23 +54,12 @@ export const copyStr = (textToCopy: string) => {

 /**
 * filter out redundant fields upon sending to API
- * also format extra into text
 */
-export function normalizeMsgsForAPI(messages: Readonly<Message[]>) {
+export function normalizeMsgsForAPI(messages: Message[]) {
  return messages.map((msg) => {
-    let newContent = '';
-
-    for (const extra of msg.extra ?? []) {
-      if (extra.type === 'context') {
-        newContent += `${extra.content}\n\n`;
-      }
-    }
-
-    newContent += msg.content;
-
    return {
      role: msg.role,
-      content: newContent,
+      content: msg.content,
    };
  }) as APIMessage[];
 }
@@ -98,31 +88,3 @@ export function classNames(classes: Record<string, boolean>): string {

 export const delay = (ms: number) =>
  new Promise((resolve) => setTimeout(resolve, ms));
-
-export const throttle = <T extends unknown[]>(
-  callback: (...args: T) => void,
-  delay: number
-) => {
-  let isWaiting = false;
-
-  return (...args: T) => {
-    if (isWaiting) {
-      return;
-    }
-
-    callback(...args);
-    isWaiting = true;
-
-    setTimeout(() => {
-      isWaiting = false;
-    }, delay);
-  };
-};
-
-export const cleanCurrentUrl = (removeQueryParams: string[]) => {
-  const url = new URL(window.location.href);
-  removeQueryParams.forEach((param) => {
-    url.searchParams.delete(param);
-  });
-  window.history.replaceState({}, '', url.toString());
-};
--- a/examples/server/webui/src/utils/storage.ts
+++ b/examples/server/webui/src/utils/storage.ts
@@ -2,8 +2,7 @@
 // format: { [convId]: { id: string, lastModified: number, messages: [...] } }

 import { CONFIG_DEFAULT } from '../Config';
-import { Conversation, Message, TimingReport } from './types';
-import Dexie, { Table } from 'dexie';
+import { Conversation, Message } from './types';

 const event = new EventTarget();

@@ -18,154 +17,85 @@ const dispatchConversationChange = (convId: string) => {
  );
 };

-const db = new Dexie('LlamacppWebui') as Dexie & {
-  conversations: Table<Conversation>;
-  messages: Table<Message>;
-};
-
-// https://dexie.org/docs/Version/Version.stores()
-db.version(1).stores({
-  // Unlike SQL, you don’t need to specify all properties but only the one you wish to index.
-  conversations: '&id, lastModified',
-  messages: '&id, convId, [convId+id], timestamp',
-});
-
 // convId is a string prefixed with 'conv-'
 const StorageUtils = {
  /**
   * manage conversations
   */
-  async getAllConversations(): Promise<Conversation[]> {
-    await migrationLStoIDB().catch(console.error); // noop if already migrated
-    return (await db.conversations.toArray()).sort(
-      (a, b) => b.lastModified - a.lastModified
-    );
+  getAllConversations(): Conversation[] {
+    const res = [];
+    for (const key in localStorage) {
+      if (key.startsWith('conv-')) {
+        res.push(JSON.parse(localStorage.getItem(key) ?? '{}'));
+      }
+    }
+    res.sort((a, b) => b.lastModified - a.lastModified);
+    return res;
  },
  /**
   * can return null if convId does not exist
   */
-  async getOneConversation(convId: string): Promise<Conversation | null> {
-    return (await db.conversations.where('id').equals(convId).first()) ?? null;
+  getOneConversation(convId: string): Conversation | null {
+    return JSON.parse(localStorage.getItem(convId) || 'null');
  },
  /**
-   * get all message nodes in a conversation
+   * if convId does not exist, create one
   */
-  async getMessages(convId: string): Promise<Message[]> {
-    return await db.messages.where({ convId }).toArray();
-  },
-  /**
-   * use in conjunction with getMessages to filter messages by leafNodeId
-   * includeRoot: whether to include the root node in the result
-   * if node with leafNodeId does not exist, return the path with the latest timestamp
-   */
-  filterByLeafNodeId(
-    msgs: Readonly<Message[]>,
-    leafNodeId: Message['id'],
-    includeRoot: boolean
-  ): Readonly<Message[]> {
-    const res: Message[] = [];
-    const nodeMap = new Map<Message['id'], Message>();
-    for (const msg of msgs) {
-      nodeMap.set(msg.id, msg);
-    }
-    let startNode: Message | undefined = nodeMap.get(leafNodeId);
-    if (!startNode) {
-      // if not found, we return the path with the latest timestamp
-      let latestTime = -1;
-      for (const msg of msgs) {
-        if (msg.timestamp > latestTime) {
-          startNode = msg;
-          latestTime = msg.timestamp;
-        }
-      }
-    }
-    // traverse the path from leafNodeId to root
-    // startNode can never be undefined here
-    let currNode: Message | undefined = startNode;
-    while (currNode) {
-      if (currNode.type !== 'root' || (currNode.type === 'root' && includeRoot))
-        res.push(currNode);
-      currNode = nodeMap.get(currNode.parent ?? -1);
-    }
-    res.sort((a, b) => a.timestamp - b.timestamp);
-    return res;
-  },
-  /**
-   * create a new conversation with a default root node
-   */
-  async createConversation(name: string): Promise<Conversation> {
-    const now = Date.now();
-    const msgId = now;
-    const conv: Conversation = {
-      id: `conv-${now}`,
-      lastModified: now,
-      currNode: msgId,
-      name,
-    };
-    await db.conversations.add(conv);
-    // create a root node
-    await db.messages.add({
-      id: msgId,
-      convId: conv.id,
-      type: 'root',
-      timestamp: now,
-      role: 'system',
-      content: '',
-      parent: -1,
-      children: [],
-    });
-    return conv;
-  },
-  /**
-   * if convId does not exist, throw an error
-   */
-  async appendMsg(
-    msg: Exclude<Message, 'parent' | 'children'>,
-    parentNodeId: Message['id']
-  ): Promise<void> {
+  appendMsg(convId: string, msg: Message): void {
    if (msg.content === null) return;
-    const { convId } = msg;
-    await db.transaction('rw', db.conversations, db.messages, async () => {
-      const conv = await StorageUtils.getOneConversation(convId);
-      const parentMsg = await db.messages
-        .where({ convId, id: parentNodeId })
-        .first();
-      // update the currNode of conversation
-      if (!conv) {
-        throw new Error(`Conversation ${convId} does not exist`);
-      }
-      if (!parentMsg) {
-        throw new Error(
-          `Parent message ID ${parentNodeId} does not exist in conversation ${convId}`
-        );
-      }
-      await db.conversations.update(convId, {
-        lastModified: Date.now(),
-        currNode: msg.id,
-      });
-      // update parent
-      await db.messages.update(parentNodeId, {
-        children: [...parentMsg.children, msg.id],
-      });
-      // create message
-      await db.messages.add({
-        ...msg,
-        parent: parentNodeId,
-        children: [],
-      });
-    });
+    const conv = StorageUtils.getOneConversation(convId) || {
+      id: convId,
+      lastModified: Date.now(),
+      messages: [],
+    };
+    conv.messages.push(msg);
+    conv.lastModified = Date.now();
+    localStorage.setItem(convId, JSON.stringify(conv));
    dispatchConversationChange(convId);
  },
+  /**
+   * Get new conversation id
+   */
+  getNewConvId(): string {
+    return `conv-${Date.now()}`;
+  },
  /**
   * remove conversation by id
   */
-  async remove(convId: string): Promise<void> {
-    await db.transaction('rw', db.conversations, db.messages, async () => {
-      await db.conversations.delete(convId);
-      await db.messages.where({ convId }).delete();
-    });
+  remove(convId: string): void {
+    localStorage.removeItem(convId);
    dispatchConversationChange(convId);
  },
+  /**
+   * keep only the messages of a conversation for which the predicate returns true
+   */
+  filterAndKeepMsgs(
+    convId: string,
+    predicate: (msg: Message) => boolean
+  ): void {
+    const conv = StorageUtils.getOneConversation(convId);
+    if (!conv) return;
+    conv.messages = conv.messages.filter(predicate);
+    conv.lastModified = Date.now();
+    localStorage.setItem(convId, JSON.stringify(conv));
+    dispatchConversationChange(convId);
+  },
+  /**
+   * remove last message from conversation
+   */
+  popMsg(convId: string): Message | undefined {
+    const conv = StorageUtils.getOneConversation(convId);
+    if (!conv) return;
+    const msg = conv.messages.pop();
+    conv.lastModified = Date.now();
+    if (conv.messages.length === 0) {
+      StorageUtils.remove(convId);
+    } else {
+      localStorage.setItem(convId, JSON.stringify(conv));
+    }
+    dispatchConversationChange(convId);
+    return msg;
+  },

  // event listeners
  onConversationChanged(callback: CallbackConversationChanged) {
@@ -206,79 +136,3 @@ const StorageUtils = {
 };

 export default StorageUtils;
-
-// Migration from localStorage to IndexedDB
-
-// these are old types, LS prefix stands for LocalStorage
-interface LSConversation {
-  id: string; // format: `conv-{timestamp}`
-  lastModified: number; // timestamp from Date.now()
-  messages: LSMessage[];
-}
-interface LSMessage {
-  id: number;
-  role: 'user' | 'assistant' | 'system';
-  content: string;
-  timings?: TimingReport;
-}
-async function migrationLStoIDB() {
-  if (localStorage.getItem('migratedToIDB')) return;
-  const res: LSConversation[] = [];
-  for (const key in localStorage) {
-    if (key.startsWith('conv-')) {
-      res.push(JSON.parse(localStorage.getItem(key) ?? '{}'));
-    }
-  }
-  if (res.length === 0) return;
-  await db.transaction('rw', db.conversations, db.messages, async () => {
-    let migratedCount = 0;
-    for (const conv of res) {
-      const { id: convId, lastModified, messages } = conv;
-      const firstMsg = messages[0];
-      const lastMsg = messages.at(-1);
-      if (messages.length < 2 || !firstMsg || !lastMsg) {
-        console.log(
-          `Skipping conversation ${convId} with ${messages.length} messages`
-        );
-        continue;
-      }
-      const name = firstMsg.content ?? '(no messages)';
-      await db.conversations.add({
-        id: convId,
-        lastModified,
-        currNode: lastMsg.id,
-        name,
-      });
-      const rootId = messages[0].id - 2;
-      await db.messages.add({
-        id: rootId,
-        convId: convId,
-        type: 'root',
-        timestamp: rootId,
-        role: 'system',
-        content: '',
-        parent: -1,
-        children: [firstMsg.id],
-      });
-      for (let i = 0; i < messages.length; i++) {
-        const msg = messages[i];
-        await db.messages.add({
-          ...msg,
-          type: 'text',
-          convId: convId,
-          timestamp: msg.id,
-          parent: i === 0 ? rootId : messages[i - 1].id,
-          children: i === messages.length - 1 ? [] : [messages[i + 1].id],
-        });
-      }
-      migratedCount++;
-      console.log(
-        `Migrated conversation ${convId} with ${messages.length} messages`
-      );
-    }
-    console.log(
-      `Migrated ${migratedCount} conversations from localStorage to IndexedDB`
-    );
-    localStorage.setItem('migratedToIDB', '1');
-  });
-}
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
0cc4m	25840747e6	Vulkan: Add device architecture enum and logic to recognize AMD generations	2025-03-08 08:04:45 +00:00
Daniele	7037e94852	vulkan: subgroup size test	2025-02-26 15:44:42 +01:00