llama : make llm_tokenizer more private

ggml-ci
refactor tokenizer
2026-02-12 14:03:20 +02:00 · 2024-09-20 11:41:51 +03:00 · 2024-09-20 15:02:44 +08:00
1073 changed files with 124635 additions and 232380 deletions
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -13,15 +13,12 @@ Checks: >
    -readability-magic-numbers,
    -readability-uppercase-literal-suffix,
    -readability-simplify-boolean-expr,
-    -readability-math-missing-parentheses,
    clang-analyzer-*,
    -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
    performance-*,
    portability-*,
-    -portability-simd-intrinsics,
    misc-*,
    -misc-const-correctness,
    -misc-non-private-member-variables-in-classes,
    -misc-no-recursion,
-    -misc-use-anonymous-namespace,
 FormatStyle: none
--- a/.devops/cpu.Dockerfile
+++ b/.devops/cpu.Dockerfile
@@ -1,92 +0,0 @@
-ARG UBUNTU_VERSION=22.04
-
-FROM ubuntu:$UBUNTU_VERSION AS build
-
-ARG TARGETARCH
-
-ARG GGML_CPU_ARM_ARCH=armv8-a
-
-RUN apt-get update && \
-    apt-get install -y build-essential git cmake libcurl4-openssl-dev
-
-WORKDIR /app
-
-COPY . .
-
-RUN if [ "$TARGETARCH" = "amd64" ]; then \
-        cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
-    elif [ "$TARGETARCH" = "arm64" ]; then \
-        cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_CPU_ARM_ARCH=${GGML_CPU_ARM_ARCH}; \
-    else \
-        echo "Unsupported architecture"; \
-        exit 1; \
-    fi && \
-    cmake --build build -j $(nproc)
-
-RUN mkdir -p /app/lib && \
-    find build -name "*.so" -exec cp {} /app/lib \;
-
-RUN mkdir -p /app/full \
-    && cp build/bin/* /app/full \
-    && cp *.py /app/full \
-    && cp -r gguf-py /app/full \
-    && cp -r requirements /app/full \
-    && cp requirements.txt /app/full \
-    && cp .devops/tools.sh /app/full/tools.sh
-
-## Base image
-FROM ubuntu:$UBUNTU_VERSION AS base
-
-RUN apt-get update \
-    && apt-get install -y libgomp1 curl\
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
-
-COPY --from=build /app/lib/ /app
-
-### Full
-FROM base AS full
-
-COPY --from=build /app/full /app
-
-WORKDIR /app
-
-RUN apt-get update \
-    && apt-get install -y \
-    git \
-    python3 \
-    python3-pip \
-    && pip install --upgrade pip setuptools wheel \
-    && pip install -r requirements.txt \
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
-
-ENTRYPOINT ["/app/tools.sh"]
-
-### Light, CLI only
-FROM base AS light
-
-COPY --from=build /app/full/llama-cli /app
-
-WORKDIR /app
-
-ENTRYPOINT [ "/app/llama-cli" ]
-
-### Server, Server only
-FROM base AS server
-
-ENV LLAMA_ARG_HOST=0.0.0.0
-
-COPY --from=build /app/full/llama-server /app
-
-WORKDIR /app
-
-HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
-
-ENTRYPOINT [ "/app/llama-server" ]
--- a/.devops/cuda.Dockerfile
+++ b/.devops/cuda.Dockerfile
@@ -1,94 +0,0 @@
-ARG UBUNTU_VERSION=22.04
-# This needs to generally match the container host's environment.
-ARG CUDA_VERSION=12.4.0
-# Target the CUDA build image
-ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
-
-ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
-
-FROM ${BASE_CUDA_DEV_CONTAINER} AS build
-
-# CUDA architecture to build for (defaults to all supported archs)
-ARG CUDA_DOCKER_ARCH=default
-
-RUN apt-get update && \
-    apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
-
-WORKDIR /app
-
-COPY . .
-
-RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
-    export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
-    fi && \
-    cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
-    cmake --build build --config Release -j$(nproc)
-
-RUN mkdir -p /app/lib && \
-    find build -name "*.so" -exec cp {} /app/lib \;
-
-RUN mkdir -p /app/full \
-    && cp build/bin/* /app/full \
-    && cp *.py /app/full \
-    && cp -r gguf-py /app/full \
-    && cp -r requirements /app/full \
-    && cp requirements.txt /app/full \
-    && cp .devops/tools.sh /app/full/tools.sh
-
-## Base image
-FROM ${BASE_CUDA_RUN_CONTAINER} AS base
-
-RUN apt-get update \
-    && apt-get install -y libgomp1 curl\
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
-
-COPY --from=build /app/lib/ /app
-
-### Full
-FROM base AS full
-
-COPY --from=build /app/full /app
-
-WORKDIR /app
-
-RUN apt-get update \
-    && apt-get install -y \
-    git \
-    python3 \
-    python3-pip \
-    && pip install --upgrade pip setuptools wheel \
-    && pip install -r requirements.txt \
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
-
-
-ENTRYPOINT ["/app/tools.sh"]
-
-### Light, CLI only
-FROM base AS light
-
-COPY --from=build /app/full/llama-cli /app
-
-WORKDIR /app
-
-ENTRYPOINT [ "/app/llama-cli" ]
-
-### Server, Server only
-FROM base AS server
-
-ENV LLAMA_ARG_HOST=0.0.0.0
-
-COPY --from=build /app/full/llama-server /app
-
-WORKDIR /app
-
-HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
-
-ENTRYPOINT [ "/app/llama-server" ]
--- a/.devops/full-cuda.Dockerfile
+++ b/.devops/full-cuda.Dockerfile
@@ -0,0 +1,33 @@
+ARG UBUNTU_VERSION=22.04
+# This needs to generally match the container host's environment.
+ARG CUDA_VERSION=12.6.0
+# Target the CUDA build image
+ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_CUDA_DEV_CONTAINER} AS build
+
+# CUDA architecture to build for (defaults to all supported archs)
+ARG CUDA_DOCKER_ARCH=default
+
+RUN apt-get update && \
+    apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
+
+COPY requirements.txt   requirements.txt
+COPY requirements       requirements
+
+RUN pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt
+
+WORKDIR /app
+
+COPY . .
+
+# Use the default CUDA archs if not specified
+RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
+        export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
+    fi && \
+    cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release -j$(nproc) && \
+    cp build/bin/* .
+
+ENTRYPOINT ["/app/.devops/tools.sh"]
--- a/.devops/full-rocm.Dockerfile
+++ b/.devops/full-rocm.Dockerfile
@@ -0,0 +1,50 @@
+ARG UBUNTU_VERSION=22.04
+
+# This needs to generally match the container host's environment.
+ARG ROCM_VERSION=5.6
+
+# Target the ROCm build image
+ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
+
+FROM ${BASE_ROCM_DEV_CONTAINER} AS build
+
+# Unless otherwise specified, we make a fat build.
+# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
+# This is mostly tied to rocBLAS supported archs.
+ARG ROCM_DOCKER_ARCH=\
+    gfx803 \
+    gfx900 \
+    gfx906 \
+    gfx908 \
+    gfx90a \
+    gfx1010 \
+    gfx1030 \
+    gfx1100 \
+    gfx1101 \
+    gfx1102
+
+COPY requirements.txt   requirements.txt
+COPY requirements       requirements
+
+RUN pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt
+
+WORKDIR /app
+
+COPY . .
+
+# Set the AMD GPU target architectures
+ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
+# Enable ROCm
+ENV GGML_HIPBLAS=1
+ENV CC=/opt/rocm/llvm/bin/clang
+ENV CXX=/opt/rocm/llvm/bin/clang++
+
+# Enable cURL
+ENV LLAMA_CURL=1
+RUN apt-get update && \
+    apt-get install -y libcurl4-openssl-dev
+
+RUN make -j$(nproc)
+
+ENTRYPOINT ["/app/.devops/tools.sh"]
--- a/.devops/full.Dockerfile
+++ b/.devops/full.Dockerfile
@@ -0,0 +1,25 @@
+ARG UBUNTU_VERSION=22.04
+
+FROM ubuntu:$UBUNTU_VERSION AS build
+
+RUN apt-get update && \
+    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
+
+COPY requirements.txt   requirements.txt
+COPY requirements       requirements
+
+RUN pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt
+
+WORKDIR /app
+
+COPY . .
+
+ENV LLAMA_CURL=1
+
+
+RUN make -j$(nproc)
+
+ENV LC_ALL=C.utf8
+
+ENTRYPOINT ["/app/.devops/tools.sh"]
--- a/.devops/intel.Dockerfile
+++ b/.devops/intel.Dockerfile
@@ -1,91 +0,0 @@
-ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04
-
-## Build Image
-
-FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
-
-ARG GGML_SYCL_F16=OFF
-RUN apt-get update && \
-    apt-get install -y git libcurl4-openssl-dev
-
-WORKDIR /app
-
-COPY . .
-
-RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
-        echo "GGML_SYCL_F16 is set" \
-        && export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
-    fi && \
-    echo "Building with dynamic libs" && \
-    cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${OPT_SYCL_F16} && \
-    cmake --build build --config Release -j$(nproc)
-
-RUN mkdir -p /app/lib && \
-    find build -name "*.so" -exec cp {} /app/lib \;
-
-RUN mkdir -p /app/full \
-    && cp build/bin/* /app/full \
-    && cp *.py /app/full \
-    && cp -r gguf-py /app/full \
-    && cp -r requirements /app/full \
-    && cp requirements.txt /app/full \
-    && cp .devops/tools.sh /app/full/tools.sh
-
-FROM intel/oneapi-basekit:$ONEAPI_VERSION AS base
-
-RUN apt-get update \
-    && apt-get install -y libgomp1 curl\
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
-
-### Full
-FROM base AS full
-
-COPY --from=build /app/lib/ /app
-COPY --from=build /app/full /app
-
-WORKDIR /app
-
-RUN apt-get update \
-    && apt-get install -y \
-    git \
-    python3 \
-    python3-pip \
-    && pip install --upgrade pip setuptools wheel \
-    && pip install -r requirements.txt \
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
-
-
-ENTRYPOINT ["/app/tools.sh"]
-
-### Light, CLI only
-FROM base AS light
-
-COPY --from=build /app/lib/ /app
-COPY --from=build /app/full/llama-cli /app
-
-WORKDIR /app
-
-ENTRYPOINT [ "/app/llama-cli" ]
-
-### Server, Server only
-FROM base AS server
-
-ENV LLAMA_ARG_HOST=0.0.0.0
-
-COPY --from=build /app/lib/ /app
-COPY --from=build /app/full/llama-server /app
-
-WORKDIR /app
-
-HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
-
-ENTRYPOINT [ "/app/llama-server" ]
-
--- a/.devops/llama-cli-cann.Dockerfile
+++ b/.devops/llama-cli-cann.Dockerfile
@@ -1,12 +1,12 @@
-ARG ASCEND_VERSION=8.1.RC1.alpha001-910b-openeuler22.03-py3.10
+ARG ASCEND_VERSION=8.0.rc2.alpha003-910b-openeuler22.03-py3.8

-FROM ascendai/cann:$ASCEND_VERSION AS build
+FROM cosdt/cann:$ASCEND_VERSION AS build

 WORKDIR /app

 COPY . .

-RUN yum install -y gcc g++ cmake make libcurl-devel
+RUN yum install -y gcc g++ cmake make
 ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
 ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
 ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
@@ -22,11 +22,11 @@ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH

 RUN echo "Building with static libs" && \
    source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
-    cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_TESTS=OFF  && \
+    cmake -B build -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF  && \
    cmake --build build --config Release --target llama-cli

 # TODO: use image with NNRT
-FROM ascendai/cann:$ASCEND_VERSION AS runtime
+FROM cosdt/cann:$ASCEND_VERSION AS runtime
 COPY --from=build /app/build/bin/llama-cli /llama-cli

 ENV LC_ALL=C.utf8
--- a/.devops/llama-cli-cuda.Dockerfile
+++ b/.devops/llama-cli-cuda.Dockerfile
@@ -0,0 +1,37 @@
+ARG UBUNTU_VERSION=22.04
+# This needs to generally match the container host's environment.
+ARG CUDA_VERSION=12.6.0
+# Target the CUDA build image
+ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+# Target the CUDA runtime image
+ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_CUDA_DEV_CONTAINER} AS build
+
+# CUDA architecture to build for (defaults to all supported archs)
+ARG CUDA_DOCKER_ARCH=default
+
+RUN apt-get update && \
+    apt-get install -y build-essential git cmake
+
+WORKDIR /app
+
+COPY . .
+
+# Use the default CUDA archs if not specified
+RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
+        export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
+    fi && \
+    cmake -B build -DGGML_CUDA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release --target llama-cli -j$(nproc)
+
+FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
+
+RUN apt-get update && \
+    apt-get install -y libgomp1
+
+COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
+COPY --from=build /app/build/src/libllama.so /libllama.so
+COPY --from=build /app/build/bin/llama-cli /llama-cli
+
+ENTRYPOINT [ "/llama-cli" ]
--- a/.devops/llama-cli-intel.Dockerfile
+++ b/.devops/llama-cli-intel.Dockerfile
@@ -0,0 +1,28 @@
+ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
+
+FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
+
+ARG GGML_SYCL_F16=OFF
+RUN apt-get update && \
+    apt-get install -y git
+
+WORKDIR /app
+
+COPY . .
+
+RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
+        echo "GGML_SYCL_F16 is set" && \
+        export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
+    fi && \
+    echo "Building with static libs" && \
+    cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \
+    ${OPT_SYCL_F16} -DBUILD_SHARED_LIBS=OFF && \
+    cmake --build build --config Release --target llama-cli
+
+FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
+
+COPY --from=build /app/build/bin/llama-cli /llama-cli
+
+ENV LC_ALL=C.utf8
+
+ENTRYPOINT [ "/llama-cli" ]
--- a/.devops/llama-cli-rocm.Dockerfile
+++ b/.devops/llama-cli-rocm.Dockerfile
@@ -0,0 +1,45 @@
+ARG UBUNTU_VERSION=22.04
+
+# This needs to generally match the container host's environment.
+ARG ROCM_VERSION=5.6
+
+# Target the ROCm build image
+ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
+
+FROM ${BASE_ROCM_DEV_CONTAINER} AS build
+
+# Unless otherwise specified, we make a fat build.
+# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
+# This is mostly tied to rocBLAS supported archs.
+ARG ROCM_DOCKER_ARCH=\
+    gfx803 \
+    gfx900 \
+    gfx906 \
+    gfx908 \
+    gfx90a \
+    gfx1010 \
+    gfx1030 \
+    gfx1100 \
+    gfx1101 \
+    gfx1102
+
+COPY requirements.txt   requirements.txt
+COPY requirements       requirements
+
+RUN pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt
+
+WORKDIR /app
+
+COPY . .
+
+# Set the AMD GPU target architectures
+ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
+# Enable ROCm
+ENV GGML_HIPBLAS=1
+ENV CC=/opt/rocm/llvm/bin/clang
+ENV CXX=/opt/rocm/llvm/bin/clang++
+
+RUN make -j$(nproc) llama-cli
+
+ENTRYPOINT [ "/app/llama-cli" ]
--- a/.devops/llama-cli-vulkan.Dockerfile
+++ b/.devops/llama-cli-vulkan.Dockerfile
@@ -0,0 +1,27 @@
+ARG UBUNTU_VERSION=jammy
+
+FROM ubuntu:$UBUNTU_VERSION AS build
+
+# Install build tools
+RUN apt update && apt install -y git build-essential cmake wget libgomp1
+
+# Install Vulkan SDK
+RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
+    wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
+    apt update -y && \
+    apt-get install -y vulkan-sdk
+
+# Build it
+WORKDIR /app
+COPY . .
+RUN cmake -B build -DGGML_VULKAN=1 && \
+    cmake --build build --config Release --target llama-cli
+
+# Clean up
+WORKDIR /
+RUN cp /app/build/bin/llama-cli /llama-cli && \
+    rm -rf /app
+
+ENV LC_ALL=C.utf8
+
+ENTRYPOINT [ "/llama-cli" ]
--- a/.devops/llama-cli.Dockerfile
+++ b/.devops/llama-cli.Dockerfile
@@ -0,0 +1,23 @@
+ARG UBUNTU_VERSION=22.04
+
+FROM ubuntu:$UBUNTU_VERSION AS build
+
+RUN apt-get update && \
+    apt-get install -y build-essential git
+
+WORKDIR /app
+
+COPY . .
+
+RUN make -j$(nproc) llama-cli
+
+FROM ubuntu:$UBUNTU_VERSION AS runtime
+
+RUN apt-get update && \
+    apt-get install -y libgomp1
+
+COPY --from=build /app/llama-cli /llama-cli
+
+ENV LC_ALL=C.utf8
+
+ENTRYPOINT [ "/llama-cli" ]
--- a/.devops/llama-cpp-cuda.srpm.spec
+++ b/.devops/llama-cpp-cuda.srpm.spec
@@ -17,10 +17,10 @@ Version:        %( date "+%%Y%%m%%d" )
 Release:        1%{?dist}
 Summary:        CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
 License:        MIT
-Source0:        https://github.com/ggml-org/llama.cpp/archive/refs/heads/master.tar.gz
+Source0:        https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
 BuildRequires:  coreutils make gcc-c++ git cuda-toolkit
 Requires:       cuda-toolkit
-URL:            https://github.com/ggml-org/llama.cpp
+URL:            https://github.com/ggerganov/llama.cpp

 %define debug_package %{nil}
 %define source_date_epoch_from_changelog 0
--- a/.devops/llama-cpp.srpm.spec
+++ b/.devops/llama-cpp.srpm.spec
@@ -18,10 +18,10 @@ Version:        %( date "+%%Y%%m%%d" )
 Release:        1%{?dist}
 Summary:        CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
 License:        MIT
-Source0:        https://github.com/ggml-org/llama.cpp/archive/refs/heads/master.tar.gz
+Source0:        https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
 BuildRequires:  coreutils make gcc-c++ git libstdc++-devel
 Requires:       libstdc++
-URL:            https://github.com/ggml-org/llama.cpp
+URL:            https://github.com/ggerganov/llama.cpp

 %define debug_package %{nil}
 %define source_date_epoch_from_changelog 0
--- a/.devops/llama-server-cuda.Dockerfile
+++ b/.devops/llama-server-cuda.Dockerfile
@@ -0,0 +1,42 @@
+ARG UBUNTU_VERSION=22.04
+# This needs to generally match the container host's environment.
+ARG CUDA_VERSION=12.6.0
+# Target the CUDA build image
+ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+# Target the CUDA runtime image
+ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_CUDA_DEV_CONTAINER} AS build
+
+# CUDA architecture to build for (defaults to all supported archs)
+ARG CUDA_DOCKER_ARCH=default
+
+RUN apt-get update && \
+    apt-get install -y build-essential git cmake libcurl4-openssl-dev
+
+WORKDIR /app
+
+COPY . .
+
+# Use the default CUDA archs if not specified
+RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
+        export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
+    fi && \
+    cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release --target llama-server -j$(nproc)
+
+FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
+
+RUN apt-get update && \
+    apt-get install -y libcurl4-openssl-dev libgomp1 curl
+
+COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
+COPY --from=build /app/build/src/libllama.so /libllama.so
+COPY --from=build /app/build/bin/llama-server /llama-server
+
+# Must be set to 0.0.0.0 so it can listen to requests from host machine
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/llama-server" ]
--- a/.devops/llama-server-intel.Dockerfile
+++ b/.devops/llama-server-intel.Dockerfile
@@ -0,0 +1,34 @@
+ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
+
+FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
+
+ARG GGML_SYCL_F16=OFF
+RUN apt-get update && \
+    apt-get install -y git libcurl4-openssl-dev
+
+WORKDIR /app
+
+COPY . .
+
+RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
+        echo "GGML_SYCL_F16 is set" && \
+        export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
+    fi && \
+    echo "Building with dynamic libs" && \
+    cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
+    cmake --build build --config Release --target llama-server
+
+FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
+
+RUN apt-get update && \
+    apt-get install -y libcurl4-openssl-dev curl
+
+COPY --from=build /app/build/bin/llama-server /llama-server
+
+ENV LC_ALL=C.utf8
+# Must be set to 0.0.0.0 so it can listen to requests from host machine
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/llama-server" ]
--- a/.devops/llama-server-rocm.Dockerfile
+++ b/.devops/llama-server-rocm.Dockerfile
@@ -0,0 +1,54 @@
+ARG UBUNTU_VERSION=22.04
+
+# This needs to generally match the container host's environment.
+ARG ROCM_VERSION=5.6
+
+# Target the ROCm build image
+ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
+
+FROM ${BASE_ROCM_DEV_CONTAINER} AS build
+
+# Unless otherwise specified, we make a fat build.
+# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
+# This is mostly tied to rocBLAS supported archs.
+ARG ROCM_DOCKER_ARCH=\
+    gfx803 \
+    gfx900 \
+    gfx906 \
+    gfx908 \
+    gfx90a \
+    gfx1010 \
+    gfx1030 \
+    gfx1100 \
+    gfx1101 \
+    gfx1102
+
+COPY requirements.txt   requirements.txt
+COPY requirements       requirements
+
+RUN pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt
+
+WORKDIR /app
+
+COPY . .
+
+# Set the AMD GPU target architectures
+ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
+# Enable ROCm
+ENV GGML_HIPBLAS=1
+ENV CC=/opt/rocm/llvm/bin/clang
+ENV CXX=/opt/rocm/llvm/bin/clang++
+# Must be set to 0.0.0.0 so it can listen to requests from host machine
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+# Enable cURL
+ENV LLAMA_CURL=1
+RUN apt-get update && \
+    apt-get install -y libcurl4-openssl-dev curl
+
+RUN make -j$(nproc) llama-server
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/app/llama-server" ]
--- a/.devops/llama-server-vulkan.Dockerfile
+++ b/.devops/llama-server-vulkan.Dockerfile
@@ -0,0 +1,31 @@
+ARG UBUNTU_VERSION=jammy
+
+FROM ubuntu:$UBUNTU_VERSION AS build
+
+# Install build tools
+RUN apt update && apt install -y git build-essential cmake wget
+
+# Install Vulkan SDK and cURL
+RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
+    wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
+    apt update -y && \
+    apt-get install -y vulkan-sdk libcurl4-openssl-dev curl
+
+# Build it
+WORKDIR /app
+COPY . .
+RUN cmake -B build -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
+    cmake --build build --config Release --target llama-server
+
+# Clean up
+WORKDIR /
+RUN cp /app/build/bin/llama-server /llama-server && \
+    rm -rf /app
+
+ENV LC_ALL=C.utf8
+# Must be set to 0.0.0.0 so it can listen to requests from host machine
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/llama-server" ]
--- a/.devops/llama-server.Dockerfile
+++ b/.devops/llama-server.Dockerfile
@@ -0,0 +1,29 @@
+ARG UBUNTU_VERSION=22.04
+
+FROM ubuntu:$UBUNTU_VERSION AS build
+
+RUN apt-get update && \
+    apt-get install -y build-essential git libcurl4-openssl-dev
+
+WORKDIR /app
+
+COPY . .
+
+ENV LLAMA_CURL=1
+
+RUN make -j$(nproc) llama-server
+
+FROM ubuntu:$UBUNTU_VERSION AS runtime
+
+RUN apt-get update && \
+    apt-get install -y libcurl4-openssl-dev libgomp1 curl
+
+COPY --from=build /app/llama-server /llama-server
+
+ENV LC_ALL=C.utf8
+# Must be set to 0.0.0.0 so it can listen to requests from host machine
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/llama-server" ]
--- a/.devops/musa.Dockerfile
+++ b/.devops/musa.Dockerfile
@@ -1,108 +0,0 @@
-ARG UBUNTU_VERSION=22.04
-# This needs to generally match the container host's environment.
-ARG MUSA_VERSION=rc3.1.1
-# Target the MUSA build image
-ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
-
-ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
-
-FROM ${BASE_MUSA_DEV_CONTAINER} AS build
-
-# MUSA architecture to build for (defaults to all supported archs)
-ARG MUSA_DOCKER_ARCH=default
-
-RUN apt-get update && \
-    apt-get install -y \
-    build-essential \
-    cmake \
-    python3 \
-    python3-pip \
-    git \
-    libcurl4-openssl-dev \
-    libgomp1
-
-COPY requirements.txt   requirements.txt
-COPY requirements       requirements
-
-RUN pip install --upgrade pip setuptools wheel \
-    && pip install -r requirements.txt
-
-WORKDIR /app
-
-COPY . .
-
-# Use the default MUSA archs if not specified
-RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
-        export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
-    fi && \
-    cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
-    cmake --build build --config Release -j$(nproc)
-
-RUN mkdir -p /app/lib && \
-    find build -name "*.so" -exec cp {} /app/lib \;
-
-RUN mkdir -p /app/full \
-    && cp build/bin/* /app/full \
-    && cp *.py /app/full \
-    && cp -r gguf-py /app/full \
-    && cp -r requirements /app/full \
-    && cp requirements.txt /app/full \
-    && cp .devops/tools.sh /app/full/tools.sh
-
-## Base image
-FROM ${BASE_MUSA_RUN_CONTAINER} AS base
-
-RUN apt-get update \
-    && apt-get install -y libgomp1 curl\
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
-
-COPY --from=build /app/lib/ /app
-
-### Full
-FROM base AS full
-
-COPY --from=build /app/full /app
-
-WORKDIR /app
-
-RUN apt-get update \
-    && apt-get install -y \
-    git \
-    python3 \
-    python3-pip \
-    && pip install --upgrade pip setuptools wheel \
-    && pip install -r requirements.txt \
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
-
-
-ENTRYPOINT ["/app/tools.sh"]
-
-### Light, CLI only
-FROM base AS light
-
-COPY --from=build /app/full/llama-cli /app
-
-WORKDIR /app
-
-ENTRYPOINT [ "/app/llama-cli" ]
-
-### Server, Server only
-FROM base AS server
-
-ENV LLAMA_ARG_HOST=0.0.0.0
-
-COPY --from=build /app/full/llama-server /app
-
-WORKDIR /app
-
-HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
-
-ENTRYPOINT [ "/app/llama-server" ]
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@@ -31,7 +31,6 @@
  # Increases the runtime closure size by ~700M
  useMpi ? false,
  useRocm ? config.rocmSupport,
-  rocmGpuTargets ? builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets,
  enableCurl ? true,
  useVulkan ? false,
  llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
@@ -127,18 +126,18 @@ effectiveStdenv.mkDerivation (finalAttrs: {
  };

  postPatch = ''
-    substituteInPlace ./ggml/src/ggml-metal/ggml-metal.m \
+    substituteInPlace ./ggml/src/ggml-metal.m \
      --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
-    substituteInPlace ./ggml/src/ggml-metal/ggml-metal.m \
+    substituteInPlace ./ggml/src/ggml-metal.m \
      --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
  '';

-  # With PR#6015 https://github.com/ggml-org/llama.cpp/pull/6015,
+  # With PR#6015 https://github.com/ggerganov/llama.cpp/pull/6015,
  # `default.metallib` may be compiled with Metal compiler from XCode
  # and we need to escape sandbox on MacOS to access Metal compiler.
  # `xcrun` is used find the path of the Metal compiler, which is varible
  # and not on $PATH
-  # see https://github.com/ggml-org/llama.cpp/pull/6118 for discussion
+  # see https://github.com/ggerganov/llama.cpp/pull/6118 for discussion
  __noChroot = effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders;

  nativeBuildInputs =
@@ -174,7 +173,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
      (cmakeBool "GGML_NATIVE" false)
      (cmakeBool "GGML_BLAS" useBlas)
      (cmakeBool "GGML_CUDA" useCuda)
-      (cmakeBool "GGML_HIP" useRocm)
+      (cmakeBool "GGML_HIPBLAS" useRocm)
      (cmakeBool "GGML_METAL" useMetalKit)
      (cmakeBool "GGML_VULKAN" useVulkan)
      (cmakeBool "GGML_STATIC" enableStatic)
@@ -189,7 +188,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
    ]
    ++ optionals useRocm [
      (cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
-      (cmakeFeature "CMAKE_HIP_ARCHITECTURES" rocmGpuTargets)
+      (cmakeFeature "CMAKE_HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets))
    ]
    ++ optionals useMetalKit [
      (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
@@ -220,7 +219,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
    broken = (useMetalKit && !effectiveStdenv.isDarwin);

    description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
-    homepage = "https://github.com/ggml-org/llama.cpp/";
+    homepage = "https://github.com/ggerganov/llama.cpp/";
    license = lib.licenses.mit;

    # Accommodates `nix run` and `lib.getExe`
--- a/.devops/nix/python-scripts.nix
+++ b/.devops/nix/python-scripts.nix
@@ -34,7 +34,7 @@ let

    # server tests
    openai
-    pytest
+    behave
    prometheus-client
  ];
 in
--- a/.devops/rocm.Dockerfile
+++ b/.devops/rocm.Dockerfile
@@ -1,113 +0,0 @@
-ARG UBUNTU_VERSION=24.04
-
-# This needs to generally match the container host's environment.
-ARG ROCM_VERSION=6.3
-ARG AMDGPU_VERSION=6.3
-
-# Target the CUDA build image
-ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
-
-### Build image
-FROM ${BASE_ROCM_DEV_CONTAINER} AS build
-
-# Unless otherwise specified, we make a fat build.
-# List from https://github.com/ggml-org/llama.cpp/pull/1087#issuecomment-1682807878
-# This is mostly tied to rocBLAS supported archs.
-# gfx803, gfx900, gfx1032, gfx1101, gfx1102,not officialy supported
-# gfx906 is deprecated
-#check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.2.4/reference/system-requirements.html
-
-ARG ROCM_DOCKER_ARCH='gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102'
-#ARG ROCM_DOCKER_ARCH=gfx1100
-
-# Set nvcc architectured
-ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
-# Enable ROCm
-# ENV CC=/opt/rocm/llvm/bin/clang
-# ENV CXX=/opt/rocm/llvm/bin/clang++
-
-RUN apt-get update \
-    && apt-get install -y \
-    build-essential \
-    cmake \
-    git \
-    libcurl4-openssl-dev \
-    curl \
-    libgomp1
-
-WORKDIR /app
-
-COPY . .
-
-RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
-    cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_BUILD_TESTS=OFF \
-    && cmake --build build --config Release -j$(nproc)
-
-RUN mkdir -p /app/lib \
-    && find build -name "*.so" -exec cp {} /app/lib \;
-
-RUN mkdir -p /app/full \
-    && cp build/bin/* /app/full \
-    && cp *.py /app/full \
-    && cp -r gguf-py /app/full \
-    && cp -r requirements /app/full \
-    && cp requirements.txt /app/full \
-    && cp .devops/tools.sh /app/full/tools.sh
-
-## Base image
-FROM ${BASE_ROCM_DEV_CONTAINER} AS base
-
-RUN apt-get update \
-    && apt-get install -y libgomp1 curl\
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
-
-COPY --from=build /app/lib/ /app
-
-### Full
-FROM base AS full
-
-COPY --from=build /app/full /app
-
-WORKDIR /app
-
-RUN apt-get update \
-    && apt-get install -y \
-    git \
-    python3-pip \
-    python3 \
-    python3-wheel\
-    && pip install --break-system-packages --upgrade setuptools \
-    && pip install --break-system-packages -r requirements.txt \
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
-
-ENTRYPOINT ["/app/tools.sh"]
-
-### Light, CLI only
-FROM base AS light
-
-COPY --from=build /app/full/llama-cli /app
-
-WORKDIR /app
-
-ENTRYPOINT [ "/app/llama-cli" ]
-
-### Server, Server only
-FROM base AS server
-
-ENV LLAMA_ARG_HOST=0.0.0.0
-
-COPY --from=build /app/full/llama-server /app
-
-WORKDIR /app
-
-HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
-
-ENTRYPOINT [ "/app/llama-server" ]
--- a/.devops/tools.sh
+++ b/.devops/tools.sh
@@ -8,36 +8,28 @@ arg1="$1"
 shift

 if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
-    exec python3 ./convert_hf_to_gguf.py "$@"
+    python3 ./convert_hf_to_gguf.py "$@"
 elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
-    exec ./llama-quantize "$@"
+    ./llama-quantize "$@"
 elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
-    exec ./llama-cli "$@"
-elif [[ "$arg1" == '--bench' || "$arg1" == '-b' ]]; then
-    exec ./llama-bench "$@"
-elif [[ "$arg1" == '--perplexity' || "$arg1" == '-p' ]]; then
-    exec ./llama-perplexity "$@"
+    ./llama-cli "$@"
 elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
    echo "Converting PTH to GGML..."
-    for i in $(ls $1/$2/ggml-model-f16.bin*); do
+    for i in `ls $1/$2/ggml-model-f16.bin*`; do
        if [ -f "${i/f16/q4_0}" ]; then
            echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
        else
            echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
-            exec ./llama-quantize "$i" "${i/f16/q4_0}" q4_0
+            ./llama-quantize "$i" "${i/f16/q4_0}" q4_0
        fi
    done
 elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
-    exec ./llama-server "$@"
+    ./llama-server "$@"
 else
    echo "Unknown command: $arg1"
    echo "Available commands: "
    echo "  --run (-r): Run a model previously converted into ggml"
    echo "              ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
-    echo "  --bench (-b): Benchmark the performance of the inference for various parameters."
-    echo "              ex: -m model.gguf"
-    echo "  --perplexity (-p): Measure the perplexity of a model over a given text."
-    echo "              ex: -m model.gguf -f file.txt"
    echo "  --convert (-c): Convert a llama model into ggml"
    echo "              ex: --outtype f16 \"/models/7B/\" "
    echo "  --quantize (-q): Optimize with quantization process ggml"
--- a/.devops/vulkan.Dockerfile
+++ b/.devops/vulkan.Dockerfile
@@ -1,89 +0,0 @@
-ARG UBUNTU_VERSION=24.04
-
-FROM ubuntu:$UBUNTU_VERSION AS build
-
-# Install build tools
-RUN apt update && apt install -y git build-essential cmake wget
-
-# Install Vulkan SDK and cURL
-RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
-    wget -qO /etc/apt/sources.list.d/lunarg-vulkan-noble.list https://packages.lunarg.com/vulkan/lunarg-vulkan-noble.list && \
-    apt update -y && \
-    apt-get install -y vulkan-sdk libcurl4-openssl-dev curl
-
-# Build it
-WORKDIR /app
-
-COPY . .
-
-RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1  -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON && \
-    cmake --build build --config Release -j$(nproc)
-
-RUN mkdir -p /app/lib && \
-    find build -name "*.so" -exec cp {} /app/lib \;
-
-RUN mkdir -p /app/full \
-    && cp build/bin/* /app/full \
-    && cp *.py /app/full \
-    && cp -r gguf-py /app/full \
-    && cp -r requirements /app/full \
-    && cp requirements.txt /app/full \
-    && cp .devops/tools.sh /app/full/tools.sh
-
-## Base image
-FROM ubuntu:$UBUNTU_VERSION AS base
-
-RUN apt-get update \
-    && apt-get install -y libgomp1 curl libvulkan-dev \
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
-
-COPY --from=build /app/lib/ /app
-
-### Full
-FROM base AS full
-
-COPY --from=build /app/full /app
-
-WORKDIR /app
-
-RUN apt-get update \
-    && apt-get install -y \
-    git \
-    python3 \
-    python3-pip \
-    python3-wheel \
-    && pip install --break-system-packages --upgrade setuptools \
-    && pip install --break-system-packages -r requirements.txt \
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
-
-ENTRYPOINT ["/app/tools.sh"]
-
-### Light, CLI only
-FROM base AS light
-
-COPY --from=build /app/full/llama-cli /app
-
-WORKDIR /app
-
-ENTRYPOINT [ "/app/llama-cli" ]
-
-### Server, Server only
-FROM base AS server
-
-ENV LLAMA_ARG_HOST=0.0.0.0
-
-COPY --from=build /app/full/llama-server /app
-
-WORKDIR /app
-
-HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
-
-ENTRYPOINT [ "/app/llama-server" ]
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,7 +1,7 @@
 *.o
 *.a
 .cache/
-# Do not ignore .git directory, otherwise the reported build number will always be 0
+.git/
 .github/
 .gitignore
 .vs/
--- a/.editorconfig
+++ b/.editorconfig
@@ -21,30 +21,12 @@ indent_style = tab
 [prompts/*.txt]
 insert_final_newline = unset

-[tools/server/public/*]
+[examples/server/public/*]
 indent_size = 2

-[tools/server/public/deps_*]
-trim_trailing_whitespace = unset
-indent_style = unset
-indent_size = unset
-
-[tools/server/deps_*]
-trim_trailing_whitespace = unset
-indent_style = unset
-indent_size = unset
-
 [examples/llama.swiftui/llama.swiftui.xcodeproj/*]
 indent_style = tab

-[tools/cvector-generator/*.txt]
-trim_trailing_whitespace = unset
-insert_final_newline = unset
-
-[models/templates/*.jinja]
-indent_style = unset
-indent_size = unset
-end_of_line = unset
-charset = unset
+[examples/cvector-generator/*.txt]
 trim_trailing_whitespace = unset
 insert_final_newline = unset
--- a/.flake8
+++ b/.flake8
@@ -2,9 +2,8 @@
 max-line-length = 125
 ignore = E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503
 exclude =
-    # Do not traverse examples and tools
+    # Do not traverse examples
    examples,
-    tools,
    # Do not include package initializers
    __init__.py,
    # No need to traverse our git directory
--- a/.github/ISSUE_TEMPLATE/01-bug-low.yml
+++ b/.github/ISSUE_TEMPLATE/01-bug-low.yml
@@ -0,0 +1,50 @@
+name: Low Severity Bugs
+description: Used to report low severity bugs in llama.cpp (e.g. cosmetic issues, non-critical UI glitches)
+title: "Bug: "
+labels: ["bug-unconfirmed", "low severity"]
+body:
+  - type: markdown
+    attributes:
+      value: |
+        Thanks for taking the time to fill out this bug report!
+        Please include information about your system, the steps to reproduce the bug,
+        and the version of llama.cpp that you are using.
+        If possible, please provide a minimal code example that reproduces the bug.
+  - type: textarea
+    id: what-happened
+    attributes:
+      label: What happened?
+      description: Also tell us, what did you expect to happen?
+      placeholder: Tell us what you see!
+    validations:
+      required: true
+  - type: textarea
+    id: version
+    attributes:
+      label: Name and Version
+      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
+      placeholder: |
+        $./llama-cli --version
+        version: 2999 (42b4109e)
+        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
+    validations:
+      required: true
+  - type: dropdown
+    id: operating-system
+    attributes:
+      label: What operating system are you seeing the problem on?
+      multiple: true
+      options:
+        - Linux
+        - Mac
+        - Windows
+        - BSD
+        - Other? (Please let us know in description)
+    validations:
+      required: false
+  - type: textarea
+    id: logs
+    attributes:
+      label: Relevant log output
+      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
+      render: shell
--- a/.github/ISSUE_TEMPLATE/010-bug-compilation.yml
+++ b/.github/ISSUE_TEMPLATE/010-bug-compilation.yml
@@ -1,87 +0,0 @@
-name: Bug (compilation)
-description: Something goes wrong when trying to compile llama.cpp.
-title: "Compile bug: "
-labels: ["bug-unconfirmed", "compilation"]
-body:
-  - type: markdown
-    attributes:
-      value: >
-        Thanks for taking the time to fill out this bug report!
-        This issue template is intended for bug reports where the compilation of llama.cpp fails.
-        Before opening an issue, please confirm that the compilation still fails with `-DGGML_CCACHE=OFF`.
-        If the compilation succeeds with ccache disabled you should be able to permanently fix the issue
-        by clearing `~/.cache/ccache` (on Linux).
-  - type: textarea
-    id: commit
-    attributes:
-      label: Git commit
-      description: Which commit are you trying to compile?
-      placeholder: |
-        $git rev-parse HEAD
-        84a07a17b1b08cf2b9747c633a2372782848a27f
-    validations:
-      required: true
-  - type: dropdown
-    id: operating-system
-    attributes:
-      label: Operating systems
-      description: Which operating systems do you know to be affected?
-      multiple: true
-      options:
-        - Linux
-        - Mac
-        - Windows
-        - BSD
-        - Other? (Please let us know in description)
-    validations:
-      required: true
-  - type: dropdown
-    id: backends
-    attributes:
-        label: GGML backends
-        description: Which GGML backends do you know to be affected?
-        options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
-        multiple: true
-    validations:
-      required: true
-  - type: textarea
-    id: info
-    attributes:
-      label: Problem description & steps to reproduce
-      description: >
-        Please give us a summary of the problem and tell us how to reproduce it.
-        If you can narrow down the bug to specific compile flags, that information would be very much appreciated by us.
-      placeholder: >
-        I'm trying to compile llama.cpp with CUDA support on a fresh install of Ubuntu and get error XY.
-        Here are the exact commands that I used: ...
-    validations:
-      required: true
-  - type: textarea
-    id: first_bad_commit
-    attributes:
-      label: First Bad Commit
-      description: >
-        If the bug was not present on an earlier version: when did it start appearing?
-        If possible, please do a git bisect and identify the exact commit that introduced the bug.
-    validations:
-      required: false
-  - type: textarea
-    id: command
-    attributes:
-      label: Compile command
-      description: >
-        Please provide the exact command you used to compile llama.cpp. For example: `cmake -B ...`.
-        This will be automatically formatted into code, so no need for backticks.
-      render: shell
-    validations:
-      required: true
-  - type: textarea
-    id: logs
-    attributes:
-      label: Relevant log output
-      description: >
-          Please copy and paste any relevant log output, including any generated text.
-          This will be automatically formatted into code, so no need for backticks.
-      render: shell
-    validations:
-      required: true
--- a/.github/ISSUE_TEMPLATE/011-bug-results.yml
+++ b/.github/ISSUE_TEMPLATE/011-bug-results.yml
@@ -1,101 +0,0 @@
-name: Bug (model use)
-description: Something goes wrong when using a model (in general, not specific to a single llama.cpp module).
-title: "Eval bug: "
-labels: ["bug-unconfirmed", "model evaluation"]
-body:
-  - type: markdown
-    attributes:
-      value: >
-        Thanks for taking the time to fill out this bug report!
-        This issue template is intended for bug reports where the model evaluation results
-        (i.e. the generated text) are incorrect or llama.cpp crashes during model evaluation.
-        If you encountered the issue while using an external UI (e.g. ollama),
-        please reproduce your issue using one of the examples/binaries in this repository.
-        The `llama-cli` binary can be used for simple and reproducible model inference.
-  - type: textarea
-    id: version
-    attributes:
-      label: Name and Version
-      description: Which version of our software are you running? (use `--version` to get a version string)
-      placeholder: |
-        $./llama-cli --version
-        version: 2999 (42b4109e)
-        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
-    validations:
-      required: true
-  - type: dropdown
-    id: operating-system
-    attributes:
-      label: Operating systems
-      description: Which operating systems do you know to be affected?
-      multiple: true
-      options:
-        - Linux
-        - Mac
-        - Windows
-        - BSD
-        - Other? (Please let us know in description)
-    validations:
-      required: true
-  - type: dropdown
-    id: backends
-    attributes:
-        label: GGML backends
-        description: Which GGML backends do you know to be affected?
-        options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
-        multiple: true
-    validations:
-      required: true
-  - type: textarea
-    id: hardware
-    attributes:
-      label: Hardware
-      description: Which CPUs/GPUs are you using?
-      placeholder: >
-        e.g. Ryzen 5950X + 2x RTX 4090
-    validations:
-      required: true
-  - type: textarea
-    id: model
-    attributes:
-      label: Models
-      description: >
-        Which model(s) at which quantization were you using when encountering the bug?
-        If you downloaded a GGUF file off of Huggingface, please provide a link.
-      placeholder: >
-        e.g. Meta LLaMA 3.1 Instruct 8b q4_K_M
-    validations:
-      required: false
-  - type: textarea
-    id: info
-    attributes:
-      label: Problem description & steps to reproduce
-      description: >
-        Please give us a summary of the problem and tell us how to reproduce it.
-        If you can narrow down the bug to specific hardware, compile flags, or command line arguments,
-        that information would be very much appreciated by us.
-      placeholder: >
-        e.g. when I run llama-cli with -ngl 99 I get garbled outputs.
-        When I use -ngl 0 it works correctly.
-        Here are the exact commands that I used: ...
-    validations:
-      required: true
-  - type: textarea
-    id: first_bad_commit
-    attributes:
-      label: First Bad Commit
-      description: >
-        If the bug was not present on an earlier version: when did it start appearing?
-        If possible, please do a git bisect and identify the exact commit that introduced the bug.
-    validations:
-      required: false
-  - type: textarea
-    id: logs
-    attributes:
-      label: Relevant log output
-      description: >
-          Please copy and paste any relevant log output, including the command that you entered and any generated text.
-          This will be automatically formatted into code, so no need for backticks.
-      render: shell
-    validations:
-      required: true
--- a/.github/ISSUE_TEMPLATE/019-bug-misc.yml
+++ b/.github/ISSUE_TEMPLATE/019-bug-misc.yml
@@ -1,91 +0,0 @@
-name: Bug (misc.)
-description: Something is not working the way it should (and it's not covered by any of the above cases).
-title: "Misc. bug: "
-labels: ["bug-unconfirmed"]
-body:
-  - type: markdown
-    attributes:
-      value: >
-        Thanks for taking the time to fill out this bug report!
-        This issue template is intended for miscellaneous bugs that don't fit into any other category.
-        If you encountered the issue while using an external UI (e.g. ollama),
-        please reproduce your issue using one of the examples/binaries in this repository.
-  - type: textarea
-    id: version
-    attributes:
-      label: Name and Version
-      description: Which version of our software is affected? (You can use `--version` to get a version string.)
-      placeholder: |
-        $./llama-cli --version
-        version: 2999 (42b4109e)
-        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
-    validations:
-      required: true
-  - type: dropdown
-    id: operating-system
-    attributes:
-      label: Operating systems
-      description: Which operating systems do you know to be affected?
-      multiple: true
-      options:
-        - Linux
-        - Mac
-        - Windows
-        - BSD
-        - Other? (Please let us know in description)
-    validations:
-      required: false
-  - type: dropdown
-    id: module
-    attributes:
-      label: Which llama.cpp modules do you know to be affected?
-      multiple: true
-      options:
-        - Documentation/Github
-        - libllama (core library)
-        - llama-cli
-        - llama-server
-        - llama-bench
-        - llama-quantize
-        - Python/Bash scripts
-        - Test code
-        - Other (Please specify in the next section)
-    validations:
-      required: false
-  - type: textarea
-    id: command
-    attributes:
-      label: Command line
-      description: >
-        Please provide the exact commands you entered, if applicable. For example: `llama-server -m ... -c ...`, `llama-cli -m ...`, etc.
-        This will be automatically formatted into code, so no need for backticks.
-      render: shell
-    validations:
-      required: false
-  - type: textarea
-    id: info
-    attributes:
-      label: Problem description & steps to reproduce
-      description: >
-        Please give us a summary of the problem and tell us how to reproduce it (if applicable).
-    validations:
-      required: true
-  - type: textarea
-    id: first_bad_commit
-    attributes:
-      label: First Bad Commit
-      description: >
-        If the bug was not present on an earlier version and it's not trivial to track down: when did it start appearing?
-        If possible, please do a git bisect and identify the exact commit that introduced the bug.
-    validations:
-      required: false
-  - type: textarea
-    id: logs
-    attributes:
-      label: Relevant log output
-      description: >
-          If applicable, please copy and paste any relevant log output, including any generated text.
-          This will be automatically formatted into code, so no need for backticks.
-      render: shell
-    validations:
-      required: false
--- a/.github/ISSUE_TEMPLATE/02-bug-medium.yml
+++ b/.github/ISSUE_TEMPLATE/02-bug-medium.yml
@@ -0,0 +1,50 @@
+name: Medium Severity Bug
+description: Used to report medium severity bugs in llama.cpp (e.g. malfunctioning features that are generally still usable)
+title: "Bug: "
+labels: ["bug-unconfirmed", "medium severity"]
+body:
+  - type: markdown
+    attributes:
+      value: |
+        Thanks for taking the time to fill out this bug report!
+        Please include information about your system, the steps to reproduce the bug,
+        and the version of llama.cpp that you are using.
+        If possible, please provide a minimal code example that reproduces the bug.
+  - type: textarea
+    id: what-happened
+    attributes:
+      label: What happened?
+      description: Also tell us, what did you expect to happen?
+      placeholder: Tell us what you see!
+    validations:
+      required: true
+  - type: textarea
+    id: version
+    attributes:
+      label: Name and Version
+      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
+      placeholder: |
+        $./llama-cli --version
+        version: 2999 (42b4109e)
+        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
+    validations:
+      required: true
+  - type: dropdown
+    id: operating-system
+    attributes:
+      label: What operating system are you seeing the problem on?
+      multiple: true
+      options:
+        - Linux
+        - Mac
+        - Windows
+        - BSD
+        - Other? (Please let us know in description)
+    validations:
+      required: false
+  - type: textarea
+    id: logs
+    attributes:
+      label: Relevant log output
+      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
+      render: shell
--- a/.github/ISSUE_TEMPLATE/03-bug-high.yml
+++ b/.github/ISSUE_TEMPLATE/03-bug-high.yml
@@ -0,0 +1,50 @@
+name: High Severity Bug
+description: Used to report high severity bugs in llama.cpp (e.g. Malfunctioning features hindering important common workflow)
+title: "Bug: "
+labels: ["bug-unconfirmed", "high severity"]
+body:
+  - type: markdown
+    attributes:
+      value: |
+        Thanks for taking the time to fill out this bug report!
+        Please include information about your system, the steps to reproduce the bug,
+        and the version of llama.cpp that you are using.
+        If possible, please provide a minimal code example that reproduces the bug.
+  - type: textarea
+    id: what-happened
+    attributes:
+      label: What happened?
+      description: Also tell us, what did you expect to happen?
+      placeholder: Tell us what you see!
+    validations:
+      required: true
+  - type: textarea
+    id: version
+    attributes:
+      label: Name and Version
+      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
+      placeholder: |
+        $./llama-cli --version
+        version: 2999 (42b4109e)
+        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
+    validations:
+      required: true
+  - type: dropdown
+    id: operating-system
+    attributes:
+      label: What operating system are you seeing the problem on?
+      multiple: true
+      options:
+        - Linux
+        - Mac
+        - Windows
+        - BSD
+        - Other? (Please let us know in description)
+    validations:
+      required: false
+  - type: textarea
+    id: logs
+    attributes:
+      label: Relevant log output
+      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
+      render: shell
--- a/.github/ISSUE_TEMPLATE/04-bug-critical.yml
+++ b/.github/ISSUE_TEMPLATE/04-bug-critical.yml
@@ -0,0 +1,50 @@
+name: Critical Severity Bug
+description: Used to report critical severity bugs in llama.cpp (e.g. Crashing, Corrupted, Data loss)
+title: "Bug: "
+labels: ["bug-unconfirmed", "critical severity"]
+body:
+  - type: markdown
+    attributes:
+      value: |
+        Thanks for taking the time to fill out this bug report!
+        Please include information about your system, the steps to reproduce the bug,
+        and the version of llama.cpp that you are using.
+        If possible, please provide a minimal code example that reproduces the bug.
+  - type: textarea
+    id: what-happened
+    attributes:
+      label: What happened?
+      description: Also tell us, what did you expect to happen?
+      placeholder: Tell us what you see!
+    validations:
+      required: true
+  - type: textarea
+    id: version
+    attributes:
+      label: Name and Version
+      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
+      placeholder: |
+        $./llama-cli --version
+        version: 2999 (42b4109e)
+        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
+    validations:
+      required: true
+  - type: dropdown
+    id: operating-system
+    attributes:
+      label: What operating system are you seeing the problem on?
+      multiple: true
+      options:
+        - Linux
+        - Mac
+        - Windows
+        - BSD
+        - Other? (Please let us know in description)
+    validations:
+      required: false
+  - type: textarea
+    id: logs
+    attributes:
+      label: Relevant log output
+      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
+      render: shell
--- a/.github/ISSUE_TEMPLATE/020-enhancement.yml
+++ b/.github/ISSUE_TEMPLATE/020-enhancement.yml
@@ -1,12 +1,12 @@
 name: Enhancement
-description: Used to request enhancements for llama.cpp.
+description: Used to request enhancements for llama.cpp
 title: "Feature Request: "
 labels: ["enhancement"]
 body:
  - type: markdown
    attributes:
      value: |
-        [Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggml-org/llama.cpp/discussions/categories/ideas)
+        [Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggerganov/llama.cpp/discussions/categories/ideas)

  - type: checkboxes
    id: prerequisites
@@ -16,11 +16,11 @@ body:
      options:
        - label: I am running the latest code. Mention the version if possible as well.
          required: true
-        - label: I carefully followed the [README.md](https://github.com/ggml-org/llama.cpp/blob/master/README.md).
+        - label: I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
          required: true
        - label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed).
          required: true
-        - label: I reviewed the [Discussions](https://github.com/ggml-org/llama.cpp/discussions), and have a new and useful enhancement to share.
+        - label: I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new and useful enhancement to share.
          required: true

  - type: textarea
--- a/.github/ISSUE_TEMPLATE/030-research.yml
+++ b/.github/ISSUE_TEMPLATE/030-research.yml
@@ -1,12 +1,12 @@
 name: Research
-description: Track new technical research area.
+description: Track new technical research area
 title: "Research: "
 labels: ["research 🔬"]
 body:
  - type: markdown
    attributes:
      value: |
-        Don't forget to check for any [duplicate research issue tickets](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)
+        Don't forget to check for any [duplicate research issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)

  - type: checkboxes
    id: research-stage
--- a/.github/ISSUE_TEMPLATE/040-refactor.yml
+++ b/.github/ISSUE_TEMPLATE/040-refactor.yml
@@ -1,13 +1,13 @@
 name: Refactor (Maintainers)
-description: Used to track refactoring opportunities.
+description: Used to track refactoring opportunities
 title: "Refactor: "
 labels: ["refactor"]
 body:
  - type: markdown
    attributes:
      value: |
-        Don't forget to [check for existing refactor issue tickets](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered.
-        Also you may want to check [Pull request refactor label as well](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too.
+        Don't forget to [check for existing refactor issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered.
+        Also you may want to check [Pull request refactor label as well](https://github.com/ggerganov/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too.

  - type: textarea
    id: background-description
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -1,11 +1,11 @@
 blank_issues_enabled: true
 contact_links:
  - name: Got an idea?
-    url: https://github.com/ggml-org/llama.cpp/discussions/categories/ideas
+    url: https://github.com/ggerganov/llama.cpp/discussions/categories/ideas
    about: Pop it there. It may then become an enhancement ticket.
  - name: Got a question?
-    url: https://github.com/ggml-org/llama.cpp/discussions/categories/q-a
+    url: https://github.com/ggerganov/llama.cpp/discussions/categories/q-a
    about: Ask a question there!
  - name: Want to contribute?
-    url: https://github.com/ggml-org/llama.cpp/wiki/contribute
+    url: https://github.com/ggerganov/llama.cpp/wiki/contribute
    about: Head to the contribution guide page of the wiki for areas you can help with
--- a/.github/actions/get-tag-name/action.yml
+++ b/.github/actions/get-tag-name/action.yml
@@ -1,22 +0,0 @@
-name: "Determine tag name"
-description: "Determine the tag name to use for a release"
-outputs:
-  name:
-    description: "The name of the tag"
-    value: ${{ steps.tag.outputs.name }}
-
-runs:
-  using: "composite"
-  steps:
-    - name: Determine tag name
-      id: tag
-      shell: bash
-      run: |
-        BUILD_NUMBER="$(git rev-list --count HEAD)"
-        SHORT_HASH="$(git rev-parse --short=7 HEAD)"
-        if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
-          echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
-        else
-          SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
-          echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
-        fi
--- a/.github/actions/windows-setup-cuda/action.yml
+++ b/.github/actions/windows-setup-cuda/action.yml
@@ -1,67 +0,0 @@
-name: "Windows - Setup CUDA Toolkit"
-description: "Setup CUDA Toolkit for Windows"
-inputs:
-  cuda_version:
-    description: "CUDA toolkit version"
-    required: true
-
-runs:
-  using: "composite"
-  steps:
-    - name: Install Cuda Toolkit 11.7
-      if: ${{ inputs.cuda_version == '11.7' }}
-      shell: pwsh
-      run: |
-          mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
-          choco install unzip -y
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-11.7.99-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-11.7.99-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-11.7.99-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-11.7.4.6-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-11.7.91-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-11.7.91-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-11.7.101-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-11.7.91-archive.zip"
-          unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cudart-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvcc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvrtc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libcublas-windows-x86_64-11.7.4.6-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvtx-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\visual_studio_integration-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvprof-windows-x86_64-11.7.101-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cccl-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
-          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-          echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
-          echo "CUDA_PATH_V11_7=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
-
-    - name: Install Cuda Toolkit 12.4
-      if: ${{ inputs.cuda_version == '12.4' }}
-      shell: pwsh
-      run: |
-          mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
-          choco install unzip -y
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.4.127-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-12.4.131-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-12.4.127-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.4.5.8-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-12.4.127-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-12.4.127-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-12.4.127-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-12.4.127-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-12.4.127-archive.zip"
-          unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cudart-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvcc-windows-x86_64-12.4.131-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvrtc-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libcublas-windows-x86_64-12.4.5.8-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvtx-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_profiler_api-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\visual_studio_integration-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvprof-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cccl-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
-          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-          echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
-          echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
--- a/.github/actions/windows-setup-curl/action.yml
+++ b/.github/actions/windows-setup-curl/action.yml
@@ -1,25 +0,0 @@
-name: 'Windows - Setup CURL'
-description: 'Composite action, to be reused in other workflow'
-inputs:
-  curl_version:
-    description: 'CURL version'
-    required: false
-    default: '8.6.0_6'
-outputs:
-  curl_path:
-    description: "Path to the downloaded libcurl"
-    value: ${{ steps.get_libcurl.outputs.curl_path }}
-
-runs:
-  using: "composite"
-  steps:
-    - name: libCURL
-      id: get_libcurl
-      shell: powershell
-      env:
-        CURL_VERSION: ${{ inputs.curl_version }}
-      run: |
-        curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-win64-mingw.zip"
-        mkdir $env:RUNNER_TEMP/libcurl
-        tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl
-        echo "curl_path=$env:RUNNER_TEMP/libcurl" >> $env:GITHUB_OUTPUT
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -3,18 +3,19 @@ Kompute:
    - changed-files:
        - any-glob-to-any-file:
            - ggml/include/ggml-kompute.h
-            - ggml/src/ggml-kompute/**
+            - ggml/src/ggml-kompute.cpp
            - README-kompute.md
 Apple Metal:
    - changed-files:
        - any-glob-to-any-file:
            - ggml/include/ggml-metal.h
-            - ggml/src/ggml-metal/**
+            - ggml/src/ggml-metal.cpp
            - README-metal.md
 SYCL:
    - changed-files:
        - any-glob-to-any-file:
            - ggml/include/ggml-sycl.h
+            - ggml/src/ggml-sycl.cpp
            - ggml/src/ggml-sycl/**
            - docs/backend/SYCL.md
            - examples/sycl/**
@@ -26,8 +27,8 @@ Nvidia GPU:
 Vulkan:
    - changed-files:
        - any-glob-to-any-file:
-            - ggml/include/ggml-vulkan.h
-            - ggml/src/ggml-vulkan/**
+            - ggml/ggml_vk_generate_shaders.py
+            - ggml/src/ggml-vulkan*
 documentation:
    - changed-files:
        - any-glob-to-any-file:
@@ -45,9 +46,7 @@ build:
            - CMakePresets.json
 examples:
    - changed-files:
-        - any-glob-to-any-file:
-            - examples/**
-            - tools/**
+        - any-glob-to-any-file: examples/**
 devops:
    - changed-files:
        - any-glob-to-any-file:
@@ -72,11 +71,15 @@ android:
 server:
    - changed-files:
        - any-glob-to-any-file:
-            - tools/server/**
+            - examples/server/**
 ggml:
    - changed-files:
        - any-glob-to-any-file:
-            - ggml/**
+            - ggml/include/ggml*.h
+            - ggml/src/ggml*.c
+            - ggml/src/ggml*.cpp
+            - ggml/src/ggml*.h
+            - ggml-cuda/**
 nix:
    - changed-files:
        - any-glob-to-any-file:
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -1 +1,7 @@
-*Make sure to read the [contributing guidelines](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md) before submitting a PR*
+
+
+- [x] I have read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md)
+- Self-reported review complexity:
+  - [ ] Low
+  - [ ] Medium
+  - [ ] High
--- a/.github/workflows/bench.yml.disabled
+++ b/.github/workflows/bench.yml.disabled
@@ -1,5 +1,5 @@
 # TODO: there have been some issues with the workflow, so disabling for now
-#       https://github.com/ggml-org/llama.cpp/issues/7893
+#       https://github.com/ggerganov/llama.cpp/issues/7893
 #
 # Benchmark
 name: Benchmark
@@ -27,10 +27,10 @@ on:
  push:
    branches:
      - master
-    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'tools/server/*.h*', 'tools/server/*.cpp']
+    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
  pull_request_target:
    types: [opened, synchronize, reopened]
-    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'tools/server/*.h*', 'tools/server/*.cpp']
+    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
  schedule:
    -  cron: '04 2 * * *'

@@ -57,7 +57,17 @@ jobs:

    if: |
      inputs.gpu-series == 'Standard_NC4as_T4_v3'
+      || (
+        github.event_name == 'schedule'
+        && github.ref_name == 'master'
+        && github.repository_owner == 'ggerganov'
+      )
      || github.event_name == 'pull_request_target'
+      || (
+        github.event_name == 'push'
+        && github.event.ref == 'refs/heads/master'
+        && github.repository_owner == 'ggerganov'
+      )
    steps:
      - name: Clone
        id: checkout
@@ -69,7 +79,7 @@ jobs:
      - name: Install python env
        id: pipenv
        run: |
-          cd tools/server/bench
+          cd examples/server/bench
          python3 -m venv venv
          source venv/bin/activate
          pip install -r requirements.txt
@@ -79,7 +89,7 @@ jobs:
        run: |
          wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
          tar xzf prometheus*.tar.gz --strip-components=1
-          ./prometheus --config.file=tools/server/bench/prometheus.yml &
+          ./prometheus --config.file=examples/server/bench/prometheus.yml &
          while ! nc -z localhost 9090; do
            sleep 0.1
          done
@@ -92,7 +102,7 @@ jobs:
      - name: Install k6 and xk6-sse
        id: k6_installation
        run: |
-          cd tools/server/bench
+          cd examples/server/bench
          go install go.k6.io/xk6/cmd/xk6@latest
          xk6 build master \
              --with github.com/phymbert/xk6-sse
@@ -104,6 +114,7 @@ jobs:
          cmake -B build \
              -DGGML_NATIVE=OFF \
              -DLLAMA_BUILD_SERVER=ON \
+              -DLLAMA_CURL=ON \
              -DLLAMA_CUBLAS=ON \
              -DCUDAToolkit_ROOT=/usr/local/cuda \
              -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
@@ -116,7 +127,7 @@ jobs:
      - name: Download the dataset
        id: download_dataset
        run: |
-          cd tools/server/bench
+          cd examples/server/bench
          wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

      - name: Server bench
@@ -126,7 +137,7 @@ jobs:
        run: |
          set -eux

-          cd tools/server/bench
+          cd examples/server/bench
          source venv/bin/activate
          python bench.py \
              --runner-label ${{ env.RUNNER_LABEL }} \
@@ -157,9 +168,9 @@ jobs:
          name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          compression-level: 9
          path: |
-            tools/server/bench/*.jpg
-            tools/server/bench/*.json
-            tools/server/bench/*.log
+            examples/server/bench/*.jpg
+            examples/server/bench/*.json
+            examples/server/bench/*.log

      - name: Commit status
        uses: Sibz/github-status-action@v1
@@ -178,17 +189,17 @@ jobs:
        with:
          client_id: ${{secrets.IMGUR_CLIENT_ID}}
          path: |
-            tools/server/bench/prompt_tokens_seconds.jpg
-            tools/server/bench/predicted_tokens_seconds.jpg
-            tools/server/bench/kv_cache_usage_ratio.jpg
-            tools/server/bench/requests_processing.jpg
+            examples/server/bench/prompt_tokens_seconds.jpg
+            examples/server/bench/predicted_tokens_seconds.jpg
+            examples/server/bench/kv_cache_usage_ratio.jpg
+            examples/server/bench/requests_processing.jpg

      - name: Extract mermaid
        id: set_mermaid
        run: |
          set -eux

-          cd tools/server/bench
+          cd examples/server/bench
          PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid)
          echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
          echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV
--- a/.github/workflows/build-linux-cross.yml
+++ b/.github/workflows/build-linux-cross.yml
@@ -1,142 +0,0 @@
-name: Build on Linux using cross-compiler
-on:
-  workflow_dispatch:
-  workflow_call:
-
-jobs:
-  ubuntu-24-riscv64-cpu-cross:
-    runs-on: ubuntu-24.04
-
-    steps:
-      - uses: actions/checkout@v4
-      - name: Setup Riscv
-        run: |
-          sudo dpkg --add-architecture riscv64
-
-          # Add arch-specific repositories for non-amd64 architectures
-          cat << EOF | sudo tee /etc/apt/sources.list.d/riscv64-ports.list
-          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
-          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
-          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
-          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
-          EOF
-
-          sudo apt-get update || true    ;# Prevent failure due to missing URLs.
-
-          sudo apt-get install -y --no-install-recommends \
-                  build-essential \
-                  gcc-14-riscv64-linux-gnu \
-                  g++-14-riscv64-linux-gnu \
-                  libcurl4-openssl-dev:riscv64
-
-      - name: Build
-        run: |
-          cmake -B build -DCMAKE_BUILD_TYPE=Release \
-                         -DGGML_OPENMP=OFF \
-                         -DLLAMA_BUILD_EXAMPLES=ON \
-                         -DLLAMA_BUILD_TOOLS=ON \
-                         -DLLAMA_BUILD_TESTS=OFF \
-                         -DCMAKE_SYSTEM_NAME=Linux \
-                         -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
-                         -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-                         -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
-                         -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-                         -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
-
-          cmake --build build --config Release -j $(nproc)
-
-  ubuntu-24-riscv64-vulkan-cross:
-    runs-on: ubuntu-24.04
-
-    steps:
-      - uses: actions/checkout@v4
-      - name: Setup Riscv
-        run: |
-          sudo dpkg --add-architecture riscv64
-
-          # Add arch-specific repositories for non-amd64 architectures
-          cat << EOF | sudo tee /etc/apt/sources.list.d/riscv64-ports.list
-          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
-          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
-          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
-          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
-          EOF
-
-          sudo apt-get update || true    ;# Prevent failure due to missing URLs.
-
-          sudo apt-get install -y --no-install-recommends \
-                  build-essential \
-                  glslc \
-                  gcc-14-riscv64-linux-gnu \
-                  g++-14-riscv64-linux-gnu \
-                  libvulkan-dev:riscv64 \
-                  libcurl4-openssl-dev:riscv64
-
-      - name: Build
-        run: |
-          cmake -B build -DCMAKE_BUILD_TYPE=Release \
-                         -DGGML_VULKAN=ON \
-                         -DGGML_OPENMP=OFF \
-                         -DLLAMA_BUILD_EXAMPLES=ON \
-                         -DLLAMA_BUILD_TOOLS=ON \
-                         -DLLAMA_BUILD_TESTS=OFF \
-                         -DCMAKE_SYSTEM_NAME=Linux \
-                         -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
-                         -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-                         -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
-                         -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-                         -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
-
-          cmake --build build --config Release -j $(nproc)
-
-  ubuntu-24-arm64-vulkan-cross:
-    runs-on: ubuntu-24.04
-
-    steps:
-      - uses: actions/checkout@v4
-      - name: Setup Arm64
-        run: |
-          sudo dpkg --add-architecture arm64
-
-          # Add arch-specific repositories for non-amd64 architectures
-          cat << EOF | sudo tee /etc/apt/sources.list.d/arm64-ports.list
-          deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
-          deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
-          deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
-          deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
-          EOF
-
-          sudo apt-get update || true    ;# Prevent failure due to missing URLs.
-
-          sudo apt-get install -y --no-install-recommends \
-                  build-essential \
-                  glslc \
-                  crossbuild-essential-arm64 \
-                  libvulkan-dev:arm64 \
-                  libcurl4-openssl-dev:arm64
-
-      - name: Build
-        run: |
-          cmake -B build -DCMAKE_BUILD_TYPE=Release \
-                         -DGGML_VULKAN=ON \
-                         -DGGML_OPENMP=OFF \
-                         -DLLAMA_BUILD_EXAMPLES=ON \
-                         -DLLAMA_BUILD_TOOLS=ON \
-                         -DLLAMA_BUILD_TESTS=OFF \
-                         -DCMAKE_SYSTEM_NAME=Linux \
-                         -DCMAKE_SYSTEM_PROCESSOR=aarch64 \
-                         -DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc \
-                         -DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++ \
-                         -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-                         -DCMAKE_FIND_ROOT_PATH=/usr/lib/aarch64-linux-gnu \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
-
-          cmake --build build --config Release -j $(nproc)
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
--- a/.github/workflows/close-issue.yml
+++ b/.github/workflows/close-issue.yml
@@ -3,11 +3,6 @@ on:
  schedule:
    - cron: "42 0 * * *"

-# Fine-grant permission
-# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
-permissions:
-  issues: write
-
 jobs:
  close-issues:
    runs-on: ubuntu-latest
@@ -17,7 +12,7 @@ jobs:
    steps:
      - uses: actions/stale@v5
        with:
-          exempt-issue-labels: "refactor,help wanted,good first issue,research,bug,roadmap"
+          exempt-issue-labels: "refactor,help wanted,good first issue,research,bug"
          days-before-issue-stale: 30
          days-before-issue-close: 14
          stale-issue-label: "stale"
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -10,55 +10,48 @@
 name: Publish Docker image

 on:
-  workflow_dispatch: # allows manual triggering
-  schedule:
-    # Rebuild daily rather than on every push because it is expensive
-    - cron: '12 4 * * *'
+  #pull_request:
+  push:
+    branches:
+      - master
+    paths: ['.github/workflows/docker.yml', '.devops/*.Dockerfile', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']

 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

-# Fine-grant permission
-# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
-permissions:
-  packages: write
-
 jobs:
  push_to_registry:
    name: Push Docker image to Docker Hub
+    #if: github.event.pull_request.draft == false

-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
    env:
      COMMIT_SHA: ${{ github.sha }}
    strategy:
-      fail-fast: false
      matrix:
        config:
-          # Multi-stage build
-          # Note: the arm64 images are failing, which prevents the amd64 images from being built
-          # https://github.com/ggml-org/llama.cpp/issues/11888
-          #- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: false }
-          - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
-          - { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
-          - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
-          - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
-          - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
+          - { tag: "light", dockerfile: ".devops/llama-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          - { tag: "server", dockerfile: ".devops/llama-server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          - { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
          # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
-          #- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: true }
+          #- { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          #- { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          #- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          - { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" }
    steps:
      - name: Check out the repo
        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0 # preserve git history, so we can determine the build number

      - name: Set up QEMU
-        uses: docker/setup-qemu-action@v3
-        with:
-          image: tonistiigi/binfmt:qemu-v7.0.0-28
+        uses: docker/setup-qemu-action@v2

      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@v2

      - name: Log in to Docker Hub
        uses: docker/login-action@v2
@@ -67,45 +60,9 @@ jobs:
          username: ${{ github.repository_owner }}
          password: ${{ secrets.GITHUB_TOKEN }}

-      - name: Determine tag name
-        id: tag
-        shell: bash
-        run: |
-          BUILD_NUMBER="$(git rev-list --count HEAD)"
-          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
-          REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}"  # to lower case
-          REPO_NAME="${{ github.event.repository.name }}"
-
-          # determine tag name postfix (build number, commit hash)
-          if [[ "${{ env.GITHUB_BRANCH_NAME }}" == "master" ]]; then
-            TAG_POSTFIX="-b${BUILD_NUMBER}"
-          else
-            SAFE_NAME=$(echo "${{ env.GITHUB_BRANCH_NAME }}" | tr '/' '-')
-            TAG_POSTFIX="-${SAFE_NAME}-${SHORT_HASH}"
-          fi
-          # list all tags possible
-          if [[ "${{ matrix.config.tag }}" == "cpu" ]]; then
-              TYPE=""
-          else
-              TYPE="-${{ matrix.config.tag }}"
-          fi
-          PREFIX="ghcr.io/${REPO_OWNER}/${REPO_NAME}:"
-          FULLTAGS="${PREFIX}full${TYPE},${PREFIX}full${TYPE}${TAG_POSTFIX}"
-          LIGHTTAGS="${PREFIX}light${TYPE},${PREFIX}light${TYPE}${TAG_POSTFIX}"
-          SERVERTAGS="${PREFIX}server${TYPE},${PREFIX}server${TYPE}${TAG_POSTFIX}"
-          echo "full_output_tags=$FULLTAGS" >> $GITHUB_OUTPUT
-          echo "light_output_tags=$LIGHTTAGS" >> $GITHUB_OUTPUT
-          echo "server_output_tags=$SERVERTAGS" >> $GITHUB_OUTPUT
-          echo "full_output_tags=$FULLTAGS"  # print out for debugging
-          echo "light_output_tags=$LIGHTTAGS"  # print out for debugging
-          echo "server_output_tags=$SERVERTAGS"  # print out for debugging
-        env:
-          GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
-          GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
-
+      # https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example
      - name: Free Disk Space (Ubuntu)
-        if: ${{ matrix.config.free_disk_space == true }}
-        uses: ggml-org/free-disk-space@v1.3.1
+        uses: jlumbroso/free-disk-space@main
        with:
          # this might remove tools that are actually needed,
          # if set to "true" but frees about 6 GB
@@ -120,59 +77,31 @@ jobs:
          docker-images: true
          swap-storage: true

-      - name: Build and push Full Docker image (tagged + versioned)
-        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.full == true }}
-        uses: docker/build-push-action@v6
-        with:
-          context: .
-          push: true
-          platforms: ${{ matrix.config.platforms }}
-          # tag list is generated from step above
-          tags: ${{ steps.tag.outputs.full_output_tags }}
-          file: ${{ matrix.config.dockerfile }}
-          target: full
-          provenance: false
-          # using github experimental cache
-          cache-from: type=gha
-          cache-to: type=gha,mode=max
-          # return to this if the experimental github cache is having issues
-          #cache-to: type=local,dest=/tmp/.buildx-cache
-          #cache-from: type=local,src=/tmp/.buildx-cache
+      - name: Determine tag name
+        id: tag  # later steps read the result as steps.tag.outputs.name
+        shell: bash
+        run: |
+          BUILD_NUMBER="$(git rev-list --count HEAD)"  # number of commits reachable from HEAD
+          SHORT_HASH="$(git rev-parse --short=7 HEAD)"  # abbreviated commit id (at least 7 characters)
+          if [[ "${BRANCH_NAME}" == "master" ]]; then  # read from the environment, not expanded into the script text, so a crafted ref name cannot inject shell code
+            echo "name=b${BUILD_NUMBER}" >> "$GITHUB_OUTPUT"
+          else
+            SAFE_NAME=$(echo "${BRANCH_NAME}" | tr '/' '-')  # replace '/' so the branch name can be used inside an image tag
+            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> "$GITHUB_OUTPUT"
+          fi

-      - name: Build and push Light Docker image (tagged + versioned)
-        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.light == true }}
-        uses: docker/build-push-action@v6
-        with:
-          context: .
-          push: true
-          platforms: ${{ matrix.config.platforms }}
-          # tag list is generated from step above
-          tags: ${{ steps.tag.outputs.light_output_tags }}
-          file: ${{ matrix.config.dockerfile }}
-          target: light
-          provenance: false
-          # using github experimental cache
-          cache-from: type=gha
-          cache-to: type=gha,mode=max
-          # return to this if the experimental github cache is having issues
-          #cache-to: type=local,dest=/tmp/.buildx-cache
-          #cache-from: type=local,src=/tmp/.buildx-cache
+      - name: Downcase github.repository_owner
+        run: |
+          echo "repository_owner_lowercase=${GITHUB_REPOSITORY_OWNER@L}" >> $GITHUB_ENV  # @L is the bash lower-case parameter transformation; the result is exported for later steps as env.repository_owner_lowercase
+        env:
+          GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'  # handed to the script as an environment variable so the shell expansion above can transform it

-      - name: Build and push Server Docker image (tagged + versioned)
-        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.server == true }}
+      - name: Build and push Docker image (tagged + versioned)
+        if: github.event_name == 'push'
        uses: docker/build-push-action@v6
        with:
          context: .
          push: true
          platforms: ${{ matrix.config.platforms }}
-          # tag list is generated from step above
-          tags: ${{ steps.tag.outputs.server_output_tags }}
+          tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
          file: ${{ matrix.config.dockerfile }}
-          target: server
-          provenance: false
-          # using github experimental cache
-          cache-from: type=gha
-          cache-to: type=gha,mode=max
-          # return to this if the experimental github cache is having issues
-          #cache-to: type=local,dest=/tmp/.buildx-cache
-          #cache-from: type=local,src=/tmp/.buildx-cache
--- a/.github/workflows/editorconfig.yml
+++ b/.github/workflows/editorconfig.yml
@@ -23,7 +23,5 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
-      - uses: editorconfig-checker/action-editorconfig-checker@v2
-        with:
-          version: v3.0.3
+      - uses: editorconfig-checker/action-editorconfig-checker@main
      - run: editorconfig-checker
--- a/.github/workflows/labeler.yml
+++ b/.github/workflows/labeler.yml
@@ -11,7 +11,7 @@ jobs:
    steps:
    - uses: actions/checkout@v4
      with:
-        repository: "ggml-org/llama.cpp"
+        repository: "ggerganov/llama.cpp"
    - uses: actions/labeler@v5
      with:
        configuration-path: '.github/labeler.yml'
--- a/.github/workflows/nix-ci-aarch64.yml
+++ b/.github/workflows/nix-ci-aarch64.yml
@@ -0,0 +1,65 @@
+name: Nix aarch64 builds
+
+on:
+  workflow_dispatch: # allows manual triggering
+  schedule:
+    # Rebuild daily rather than on every push because QEMU is expensive (e.g.
+    # 1.5h instead of minutes with the cold cache).
+    #
+    # randint(0, 59), randint(0, 23)
+    - cron: '26 12 * * *'
+  # But also rebuild if we touched any of the Nix expressions:
+  push:
+    branches:
+      - master
+    paths: ['**/*.nix', 'flake.lock']
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths: ['**/*.nix', 'flake.lock']
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  nix-build-aarch64:
+    runs-on: ubuntu-latest
+    steps:
+    - name: Checkout repository
+      uses: actions/checkout@v4
+    - name: Install QEMU
+      # Copy-paste from https://github.com/orgs/community/discussions/8305#discussioncomment-5888654
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y qemu-user-static qemu-system-aarch64
+        sudo usermod -a -G kvm $USER
+    - name: Install Nix
+      uses: DeterminateSystems/nix-installer-action@v9
+      with:
+        github-token: ${{ secrets.GITHUB_TOKEN }}
+        extra-conf: |
+          extra-platforms = aarch64-linux
+          extra-system-features = nixos-test kvm
+          extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
+          extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
+    - uses: DeterminateSystems/magic-nix-cache-action@v2
+      with:
+        upstream-cache: https://llama-cpp.cachix.org # this job has no strategy.matrix, so matrix.cachixName expanded to an empty host name; use the llama-cpp cache this job already configures
+    - name: Set-up cachix to push the results to
+      uses: cachix/cachix-action@v13
+      with:
+        authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
+        name: llama-cpp
+    - name: Show all output paths
+      run: >
+          nix run github:nix-community/nix-eval-jobs
+          -- --gc-roots-dir gcroot
+          --flake
+          ".#packages.aarch64-linux"
+    - name: Build
+      run: >
+          nix run github:Mic92/nix-fast-build
+          -- --skip-cached --no-nom
+          --systems aarch64-linux
+          --flake
+          ".#checks.aarch64-linux"
--- a/.github/workflows/nix-ci.yml
+++ b/.github/workflows/nix-ci.yml
@@ -0,0 +1,72 @@
+name: Nix CI
+
+on:
+  workflow_dispatch: # allows manual triggering
+  push:
+    branches:
+      - master
+  pull_request:
+    types: [opened, synchronize, reopened]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  nix-eval:
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ ubuntu-latest, macos-latest ]
+    runs-on: ${{ matrix.os }}
+    steps:
+    - name: Checkout repository
+      uses: actions/checkout@v4
+    - name: Install Nix
+      uses: DeterminateSystems/nix-installer-action@v9
+      with:
+        github-token: ${{ secrets.GITHUB_TOKEN }}
+        extra-conf: |
+          extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
+          extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
+    - uses: DeterminateSystems/magic-nix-cache-action@v2
+      with:
+        upstream-cache: https://llama-cpp.cachix.org # the matrix only defines `os`, so matrix.cachixName expanded to an empty host name; use the llama-cpp cache listed in extra-substituters
+    - name: List all flake outputs
+      run: nix flake show --all-systems
+    - name: Show all output paths
+      run: >
+          nix run github:nix-community/nix-eval-jobs
+          -- --gc-roots-dir gcroot
+          --flake
+          ".#packages.$(nix eval --raw --impure --expr builtins.currentSystem)"
+  nix-build:
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ ubuntu-latest, macos-latest ]
+    runs-on: ${{ matrix.os }}
+    steps:
+    - name: Checkout repository
+      uses: actions/checkout@v4
+    - name: Install Nix
+      uses: DeterminateSystems/nix-installer-action@v9
+      with:
+        github-token: ${{ secrets.GITHUB_TOKEN }}
+        extra-conf: |
+          extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
+          extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
+    - uses: DeterminateSystems/magic-nix-cache-action@v2
+      with:
+        upstream-cache: https://llama-cpp.cachix.org # the matrix only defines `os`, so matrix.cachixName expanded to an empty host name; use the llama-cpp cache this job already configures
+    - name: Set-up cachix to push the results to
+      uses: cachix/cachix-action@v13
+      with:
+        authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
+        name: llama-cpp
+    - name: Build
+      run: >
+          nix run github:Mic92/nix-fast-build
+          -- --skip-cached --no-nom
+          --flake
+          ".#checks.$(nix eval --raw --impure --expr builtins.currentSystem)"
--- a/.github/workflows/nix-flake-update.yml
+++ b/.github/workflows/nix-flake-update.yml
@@ -0,0 +1,22 @@
+name: update-flake-lock
+on:
+  workflow_dispatch:
+  schedule:
+    - cron: '0 0 * * 0' # runs weekly on Sunday at 00:00
+
+jobs:
+  lockfile: # single job of the update-flake-lock workflow (manual dispatch or the weekly cron above)
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+      - name: Install Nix
+        uses: DeterminateSystems/nix-installer-action@main # NOTE(review): follows a moving branch, unlike the @v9 pin used by the Nix CI workflows; confirm this is intended
+      - name: Update flake.lock
+        uses: DeterminateSystems/update-flake-lock@main # NOTE(review): also follows a moving branch; presumably opens a pull request using the pr-* inputs below, verify against the action's docs
+        with:
+          pr-title: "nix: update flake.lock"
+          pr-labels: |
+            nix
+          pr-reviewers: philiptaron,SomeoneSerge
+          token: ${{ secrets.FLAKE_TOKEN }} # repository secret; NOTE(review): required scopes are not visible here, confirm in repository settings
--- a/.github/workflows/nix-publish-flake.yml
+++ b/.github/workflows/nix-publish-flake.yml
@@ -0,0 +1,36 @@
+# Make the flake discoverable on https://flakestry.dev and https://flakehub.com/flakes
+name: "Publish a flake to flakestry & flakehub"
+on:
+    push:
+        tags:
+        - "*"
+    workflow_dispatch:
+        inputs:
+            tag:
+                description: "The existing tag to publish"
+                type: "string"
+                required: true
+jobs:
+    flakestry-publish:
+        runs-on: ubuntu-latest
+        permissions:
+            id-token: "write"
+            contents: "read"
+        steps:
+            - uses: flakestry/flakestry-publish@main
+              with:
+                version: "${{ inputs.tag || github.ref_name }}"
+    flakehub-publish:
+      runs-on: "ubuntu-latest"
+      permissions:
+        id-token: "write"
+        contents: "read"
+      steps:
+        - uses: "actions/checkout@v4"
+          with:
+            ref: "${{ (inputs.tag != null) && format('refs/tags/{0}', inputs.tag) || '' }}"
+        - uses: "DeterminateSystems/nix-installer-action@main"
+        - uses: "DeterminateSystems/flakehub-push@main"
+          with:
+            visibility: "public"
+            tag: "${{ inputs.tag }}"
--- a/.github/workflows/python-lint.yml
+++ b/.github/workflows/python-lint.yml
@@ -1,13 +1,6 @@
 name: flake8 Lint

-on:
-  push:
-    branches:
-      - master
-    paths: ['.github/workflows/python-lint.yml', '**/*.py']
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: ['.github/workflows/python-lint.yml', '**/*.py']
+on: [push, pull_request]

 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
--- a/.github/workflows/python-type-check.yml
+++ b/.github/workflows/python-type-check.yml
@@ -4,13 +4,11 @@ on:
  push:
    paths:
      - '.github/workflows/python-type-check.yml'
-      - 'pyrightconfig.json'
      - '**.py'
      - '**/requirements*.txt'
  pull_request:
    paths:
      - '.github/workflows/python-type-check.yml'
-      - 'pyrightconfig.json'
      - '**.py'
      - '**/requirements*.txt'

@@ -35,6 +33,6 @@ jobs:
      - name: Type-check with Pyright
        uses: jakebailey/pyright-action@v2
        with:
-          version: 1.1.382
+          version: 1.1.370
          level: warning
          warnings: true
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -1,709 +0,0 @@
-name: Create Release
-
-on:
-  workflow_dispatch: # allows manual triggering
-    inputs:
-      create_release:
-        description: 'Create new release'
-        required: true
-        type: boolean
-  push:
-    branches:
-      - master
-    paths: ['.github/workflows/release.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-env:
-  BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
-  CMAKE_ARGS: "-DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=ON -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON"
-
-jobs:
-  macOS-arm64:
-    runs-on: macos-14
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
-        with:
-          key: macOS-latest-cmake-arm64
-          evict-old-files: 1d
-
-      - name: Dependencies
-        id: depends
-        continue-on-error: true
-        run: |
-          brew update
-          brew install curl
-
-      - name: Build
-        id: cmake_build
-        run: |
-          sysctl -a
-          cmake -B build \
-            -DCMAKE_BUILD_RPATH="@loader_path" \
-            -DLLAMA_FATAL_WARNINGS=ON \
-            -DGGML_METAL_USE_BF16=ON \
-            -DGGML_METAL_EMBED_LIBRARY=ON \
-            -DGGML_RPC=ON \
-            ${{ env.CMAKE_ARGS }}
-          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        run: |
-          cp LICENSE ./build/bin/
-          zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
-          name: llama-bin-macos-arm64.zip
-
-  macOS-x64:
-    runs-on: macos-13
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
-        with:
-          key: macOS-latest-cmake-x64
-          evict-old-files: 1d
-
-      - name: Dependencies
-        id: depends
-        continue-on-error: true
-        run: |
-          brew update
-          brew install curl
-
-      - name: Build
-        id: cmake_build
-        run: |
-          sysctl -a
-          # Metal is disabled due to intermittent failures with Github runners not having a GPU:
-          # https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
-          cmake -B build \
-            -DCMAKE_BUILD_RPATH="@loader_path" \
-            -DLLAMA_FATAL_WARNINGS=ON \
-            -DGGML_METAL=OFF \
-            -DGGML_RPC=ON
-          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        run: |
-          cp LICENSE ./build/bin/
-          zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
-          name: llama-bin-macos-x64.zip
-
-  ubuntu-22-cpu:
-    strategy:
-      matrix:
-        include:
-          - build: 'x64'
-            os: ubuntu-22.04
-          - build: 'arm64'
-            os: ubuntu-22.04-arm
-
-    runs-on: ${{ matrix.os }}
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
-        with:
-          key: ubuntu-cpu-cmake
-          evict-old-files: 1d
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential libcurl4-openssl-dev
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cmake -B build \
-            -DLLAMA_FATAL_WARNINGS=ON \
-            ${{ env.CMAKE_ARGS }}
-          cmake --build build --config Release -j $(nproc)
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        run: |
-          cp LICENSE ./build/bin/
-          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/*
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip
-          name: llama-bin-ubuntu-${{ matrix.build }}.zip
-
-  ubuntu-22-vulkan:
-    runs-on: ubuntu-22.04
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
-        with:
-          key: ubuntu-22-cmake-vulkan
-          evict-old-files: 1d
-
-      - name: Dependencies
-        id: depends
-        run: |
-          wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
-          sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
-          sudo apt-get update -y
-          sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libcurl4-openssl-dev
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cmake -B build \
-            -DGGML_VULKAN=ON \
-            ${{ env.CMAKE_ARGS }}
-          cmake --build build --config Release -j $(nproc)
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        run: |
-          cp LICENSE ./build/bin/
-          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip ./build/bin/*
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip
-          name: llama-bin-ubuntu-vulkan-x64.zip
-
-  windows:
-    runs-on: windows-latest
-
-    env:
-      OPENBLAS_VERSION: 0.3.23
-      VULKAN_VERSION: 1.4.309.0
-
-    strategy:
-      matrix:
-        include:
-          - build: 'cpu-x64'
-            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF'
-          #- build: 'openblas-x64'
-          #  defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
-          - build: 'vulkan-x64'
-            defines: '-DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
-          - build: 'cpu-arm64'
-            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF'
-          - build: 'opencl-adreno-arm64'
-            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
-        with:
-          key: windows-latest-cmake-${{ matrix.build }}
-          variant: ccache
-          evict-old-files: 1d
-
-      - name: Download OpenBLAS
-        id: get_openblas
-        if: ${{ matrix.build == 'openblas-x64' }}
-        run: |
-          curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip"
-          curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE"
-          mkdir $env:RUNNER_TEMP/openblas
-          tar.exe -xvf $env:RUNNER_TEMP/openblas.zip -C $env:RUNNER_TEMP/openblas
-          $vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath)
-          $msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim()))
-          $lib =  $(join-path $msvc 'bin\Hostx64\x64\lib.exe')
-          & $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll
-
-      - name: Install Vulkan SDK
-        id: get_vulkan
-        if: ${{ matrix.build == 'vulkan-x64' }}
-        run: |
-          curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
-          & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
-          Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
-          Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
-
-      - name: Install Ninja
-        id: install_ninja
-        run: |
-          choco install ninja
-
-      - name: Install OpenCL Headers and Libs
-        id: install_opencl
-        if: ${{ matrix.build == 'opencl-adreno-arm64' }}
-        run: |
-          git clone https://github.com/KhronosGroup/OpenCL-Headers
-          cd OpenCL-Headers
-          cmake -B build `
-            -DBUILD_TESTING=OFF `
-            -DOPENCL_HEADERS_BUILD_TESTING=OFF `
-            -DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
-            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
-          cmake --build build --target install
-          git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
-          cd OpenCL-ICD-Loader
-          cmake -B build-arm64-release `
-            -A arm64 `
-            -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" `
-            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
-          cmake --build build-arm64-release --target install --config release
-
-      - name: libCURL
-        id: get_libcurl
-        uses: ./.github/actions/windows-setup-curl
-
-      - name: Build
-        id: cmake_build
-        env:
-          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
-        run: |
-          cmake -S . -B build ${{ matrix.defines }} `
-            -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" `
-            ${{ env.CMAKE_ARGS }}
-          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
-
-      - name: Add libopenblas.dll
-        id: add_libopenblas_dll
-        if: ${{ matrix.build == 'openblas-x64' }}
-        run: |
-          cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll
-          cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        env:
-          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
-        run: |
-          Copy-Item $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll
-          7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
-          name: llama-bin-win-${{ matrix.build }}.zip
-
-  windows-cuda:
-    runs-on: windows-2019
-
-    strategy:
-      matrix:
-        cuda: ['12.4', '11.7']
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-            fetch-depth: 0
-
-      - name: Install ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
-        with:
-          key: windows-cuda-${{ matrix.cuda }}
-          variant: ccache
-          evict-old-files: 1d
-
-      - name: Install Cuda Toolkit
-        uses: ./.github/actions/windows-setup-cuda
-        with:
-          cuda_version: ${{ matrix.cuda }}
-
-      - name: Install Ninja
-        id: install_ninja
-        run: |
-          choco install ninja
-
-      - name: libCURL
-        id: get_libcurl
-        uses: ./.github/actions/windows-setup-curl
-
-      - name: Build
-        id: cmake_build
-        shell: cmd
-        env:
-          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
-        run: |
-          call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
-          cmake -S . -B build -G "Ninja Multi-Config" ^
-            -DGGML_NATIVE=OFF ^
-            -DGGML_BACKEND_DL=ON ^
-            -DGGML_CPU_ALL_VARIANTS=ON ^
-            -DGGML_CUDA=ON ^
-            -DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include" ^
-            ${{ env.CMAKE_ARGS }}
-          set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
-          cmake --build build --config Release -j %NINJA_JOBS% -t ggml
-          cmake --build build --config Release
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        env:
-          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
-        run: |
-          cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll
-          7z a llama-${{ steps.tag.outputs.name }}-bin-win-cuda${{ matrix.cuda }}-x64.zip .\build\bin\Release\*
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-win-cuda${{ matrix.cuda }}-x64.zip
-          name: llama-bin-win-cuda${{ matrix.cuda }}-x64.zip
-
-      - name: Copy and pack Cuda runtime
-        run: |
-          echo "Cuda install location: ${{ env.CUDA_PATH }}"
-          $dst='.\build\bin\cudart\'
-          robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
-          robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
-          7z a cudart-llama-bin-win-cuda${{ matrix.cuda }}-x64.zip $dst\*
-
-      - name: Upload Cuda runtime
-        uses: actions/upload-artifact@v4
-        with:
-          path: cudart-llama-bin-win-cuda${{ matrix.cuda }}-x64.zip
-          name: cudart-llama-bin-win-cuda${{ matrix.cuda }}-x64.zip
-
-  windows-sycl:
-    runs-on: windows-latest
-
-    defaults:
-      run:
-        shell: bash
-
-    env:
-      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b380d914-366b-4b77-a74a-05e3c38b3514/intel-oneapi-base-toolkit-2025.0.0.882_offline.exe
-      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
-      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
-        with:
-          key: windows-latest-cmake-sycl
-          variant: ccache
-          evict-old-files: 1d
-
-      - name: Install
-        run:  |
-          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
-
-      # TODO: add libcurl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
-
-      - name: Build
-        id: cmake_build
-        run:  examples/sycl/win-build-sycl.bat
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-
-      - name: Build the release package
-        id: pack_artifacts
-        run: |
-          echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
-
-          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
-
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
-
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
-
-          cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
-
-          echo "cp oneAPI running time dll files to ./build/bin done"
-          7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
-
-      - name: Upload the release package
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
-          name: llama-bin-win-sycl-x64.zip
-
-  windows-hip:
-    runs-on: windows-latest
-
-    strategy:
-      matrix:
-        gpu_target: [gfx1100, gfx1101, gfx1030]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-            fetch-depth: 0
-
-      - name: Clone rocWMMA repository
-        id: clone_rocwmma
-        run: |
-          git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1
-
-      - name: ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
-        with:
-          key: windows-latest-cmake-hip-release
-          evict-old-files: 1d
-
-      - name: Install
-        id: depends
-        run: |
-          $ErrorActionPreference = "Stop"
-          write-host "Downloading AMD HIP SDK Installer"
-          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
-          write-host "Installing AMD HIP SDK"
-          Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
-          write-host "Completed AMD HIP SDK installation"
-
-      - name: Verify ROCm
-        id: verify
-        run: |
-          & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
-
-      - name: libCURL
-        id: get_libcurl
-        uses: ./.github/actions/windows-setup-curl
-
-      - name: Build
-        id: cmake_build
-        env:
-          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
-        run: |
-          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
-          $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
-          cmake -G "Unix Makefiles" -B build -S . `
-            -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
-            -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-            -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/" `
-            -DCMAKE_BUILD_TYPE=Release `
-            -DAMDGPU_TARGETS=${{ matrix.gpu_target }} `
-            -DGGML_HIP_ROCWMMA_FATTN=ON `
-            -DGGML_HIP=ON `
-            -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" `
-            ${{ env.CMAKE_ARGS }}
-          cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
-          md "build\bin\rocblas\library\"
-          cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
-          cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
-          cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        env:
-          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
-        run: |
-          cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\libcurl-x64.dll
-          7z a llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\*
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
-          name: llama-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
-
-  ios-xcode-build:
-    runs-on: macos-latest
-
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: Build
-        id: cmake_build
-        run: |
-          sysctl -a
-          cmake -B build -G Xcode \
-            -DGGML_METAL_USE_BF16=ON \
-            -DGGML_METAL_EMBED_LIBRARY=ON \
-            -DLLAMA_CURL=OFF \
-            -DLLAMA_BUILD_EXAMPLES=OFF \
-            -DLLAMA_BUILD_TOOLS=OFF \
-            -DLLAMA_BUILD_TESTS=OFF \
-            -DLLAMA_BUILD_SERVER=OFF \
-            -DCMAKE_SYSTEM_NAME=iOS \
-            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
-            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
-          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
-
-      - name: xcodebuild for swift package
-        id: xcodebuild
-        run: |
-          ./build-xcframework.sh
-
-      - name: Build Xcode project
-        run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        run: |
-          zip --symlinks -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-xcframework.zip
-          name: llama-${{ steps.tag.outputs.name }}-xcframework
-
-  release:
-    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-
-    # Fine-grant permission
-    # https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
-    permissions:
-        contents: write # for creating release
-
-    runs-on: ubuntu-latest
-
-    needs:
-      - ubuntu-22-cpu
-      - ubuntu-22-vulkan
-      - windows
-      - windows-cuda
-      - windows-sycl
-      - windows-hip
-      - macOS-arm64
-      - macOS-x64
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-
-      - name: Download artifacts
-        id: download-artifact
-        uses: actions/download-artifact@v4
-        with:
-          path: ./artifact
-
-      - name: Move artifacts
-        id: move_artifacts
-        run: mkdir -p ./artifact/release && mv ./artifact/*/*.zip ./artifact/release
-
-      - name: Create release
-        id: create_release
-        uses: ggml-org/action-create-release@v1
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        with:
-          tag_name: ${{ steps.tag.outputs.name }}
-
-      - name: Upload release
-        id: upload_release
-        uses: actions/github-script@v3
-        with:
-          github-token: ${{secrets.GITHUB_TOKEN}}
-          script: |
-            const path = require('path');
-            const fs = require('fs');
-            const release_id = '${{ steps.create_release.outputs.id }}';
-            for (let file of await fs.readdirSync('./artifact/release')) {
-              if (path.extname(file) === '.zip') {
-                console.log('uploadReleaseAsset', file);
-                await github.repos.uploadReleaseAsset({
-                  owner: context.repo.owner,
-                  repo: context.repo.repo,
-                  release_id: release_id,
-                  name: file,
-                  data: await fs.readFileSync(`./artifact/release/${file}`)
-                });
-              }
-            }
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -15,10 +15,10 @@ on:
  push:
    branches:
      - master
-    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*']
+    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
  pull_request:
    types: [opened, synchronize, reopened]
-    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*']
+    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']

 env:
  LLAMA_LOG_COLORS: 1
@@ -74,51 +74,22 @@ jobs:
      - name: Tests dependencies
        id: test_dependencies
        run: |
-          pip install -r tools/server/tests/requirements.txt
+          pip install -r examples/server/tests/requirements.txt

-      # Setup nodejs (to be used for verifying bundled index.html)
-      - uses: actions/setup-node@v4
-        with:
-          node-version: '22.11.0'
-
-      - name: WebUI - Install dependencies
-        id: webui_lint
-        run: |
-          cd tools/server/webui
-          npm ci
-
-      - name: WebUI - Check code format
-        id: webui_format
+      - name: Verify server deps
+        id: verify_server_deps
        run: |
          git config --global --add safe.directory $(realpath .)
-          cd tools/server/webui
+          cd examples/server
+          git ls-files --others --modified
          git status
-
-          npm run format
+          ./deps.sh
          git status
-          modified_files="$(git status -s)"
-          echo "Modified files: ${modified_files}"
-          if [ -n "${modified_files}" ]; then
-            echo "Files do not follow coding style. To fix: npm run format"
-            echo "${modified_files}"
-            exit 1
-          fi
-
-      - name: Verify bundled index.html
-        id: verify_server_index_html
-        run: |
-          git config --global --add safe.directory $(realpath .)
-          cd tools/server/webui
-          git status
-
-          npm run build
-          git status
-          modified_files="$(git status -s)"
-          echo "Modified files: ${modified_files}"
-          if [ -n "${modified_files}" ]; then
-            echo "Repository is dirty or server/webui is not built as expected"
-            echo "Hint: You may need to follow Web UI build guide in server/README.md"
-            echo "${modified_files}"
+          not_ignored_files="$(git ls-files --others --modified)"
+          echo "Modified files: ${not_ignored_files}"
+          if [ -n "${not_ignored_files}" ]; then
+            echo "Repository is dirty or server deps are not built as expected"
+            echo "${not_ignored_files}"
            exit 1
          fi

@@ -129,54 +100,36 @@ jobs:
          cmake -B build \
              -DGGML_NATIVE=OFF \
              -DLLAMA_BUILD_SERVER=ON \
+              -DLLAMA_CURL=ON \
              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
              -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
              -DGGML_OPENMP=OFF ;
          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server

-      - name: Build (sanitizers)
-        id: cmake_build_sanitizers
-        if: ${{ matrix.sanitizer != '' && matrix.sanitizer != 'THREAD' }}
+      - name: Build
+        id: cmake_build
+        if: ${{ matrix.sanitizer != 'THREAD' }}
        run: |
          cmake -B build \
              -DGGML_NATIVE=OFF \
              -DLLAMA_BUILD_SERVER=ON \
+              -DLLAMA_CURL=ON \
              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
              -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server

-      - name: Build (sanitizers)
-        id: cmake_build
-        if: ${{ matrix.sanitizer == '' }}
-        run: |
-          cmake -B build \
-              -DGGML_NATIVE=OFF \
-              -DLLAMA_BUILD_SERVER=ON \
-              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
-
      - name: Tests
        id: server_integration_tests
-        if: ${{ matrix.sanitizer == '' }}
-        env:
-          GITHUB_ACTIONS: "true"
        run: |
-          cd tools/server/tests
-          ./tests.sh
-
-      - name: Tests (sanitizers)
-        id: server_integration_tests_sanitizers
-        if: ${{ matrix.sanitizer != '' }}
-        run: |
-          cd tools/server/tests
-          LLAMA_SANITIZE=1 ./tests.sh
+          cd examples/server/tests
+          PORT=8888 ./tests.sh

      - name: Slow tests
        id: server_integration_tests_slow
        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
        run: |
-          cd tools/server/tests
-          SLOW_TESTS=1 ./tests.sh
+          cd examples/server/tests
+          PORT=8888 ./tests.sh --stop --no-skipped --no-capture --tags slow


  server-windows:
@@ -192,14 +145,17 @@ jobs:

      - name: libCURL
        id: get_libcurl
-        uses: ./.github/actions/windows-setup-curl
+        env:
+          CURL_VERSION: 8.6.0_6
+        run: |
+          curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-win64-mingw.zip"
+          mkdir $env:RUNNER_TEMP/libcurl
+          tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl

      - name: Build
        id: cmake_build
-        env:
-          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
        run: |
-          cmake -B build -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
+          cmake -B build -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include"
          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server

      - name: Python setup
@@ -211,27 +167,24 @@ jobs:
      - name: Tests dependencies
        id: test_dependencies
        run: |
-          pip install -r tools/server/tests/requirements.txt
+          pip install -r examples/server/tests/requirements.txt

      - name: Copy Libcurl
        id: prepare_libcurl
-        env:
-          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
        run: |
-          cp $env:CURL_PATH/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll
+          cp $env:RUNNER_TEMP/libcurl/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll

      - name: Tests
        id: server_integration_tests
        if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
        run: |
-          cd tools/server/tests
+          cd examples/server/tests
          $env:PYTHONIOENCODING = ":replace"
-          pytest -v -x -m "not slow"
+          behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp

      - name: Slow tests
        id: server_integration_tests_slow
        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
        run: |
-          cd tools/server/tests
-          $env:SLOW_TESTS = "1"
-          pytest -v -x
+          cd examples/server/tests
+          behave.exe --stop --no-skipped --no-capture --tags slow
--- a/.gitignore
+++ b/.gitignore
@@ -3,7 +3,6 @@
 *.a
 *.bat
 *.bin
-*.d
 *.dll
 *.dot
 *.etag
@@ -18,7 +17,6 @@
 *.metallib
 *.o
 *.so
-*.swp
 *.tmp

 # IDE / OS
@@ -45,8 +43,6 @@ lcov-report/
 tags
 .build/
 build*
-release
-debug
 !build-info.cmake
 !build-info.cpp.in
 !build-info.sh
@@ -96,11 +92,10 @@ perf-*.txt
 # Examples

 examples/jeopardy/results.txt
-tools/server/*.css.hpp
-tools/server/*.html.hpp
-tools/server/*.js.hpp
-tools/server/*.mjs.hpp
-tools/server/*.gz.hpp
+examples/server/*.css.hpp
+examples/server/*.html.hpp
+examples/server/*.js.hpp
+examples/server/*.mjs.hpp
 !build_64.sh
 !examples/*.bat
 !examples/*/*.kts
@@ -108,10 +103,6 @@ tools/server/*.gz.hpp
 !examples/sycl/*.bat
 !examples/sycl/*.sh

-# Server Web UI temporary files
-node_modules
-tools/server/webui/dist
-
 # Python

 /.venv
@@ -142,7 +133,3 @@ poetry.toml

 # Test models for lora adapters
 /lora-tests
-
-# Local scripts
-/run-vim.sh
-/run-chat.sh
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,3 @@
 [submodule "kompute"]
-	path = ggml/src/ggml-kompute/kompute
+	path = ggml/src/kompute
 	url = https://github.com/nomic-ai/kompute.git
--- a/326
+++ b/326
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -16,7 +16,6 @@ endif()
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")

 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
-set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)

 if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
    set(LLAMA_STANDALONE ON)
@@ -29,8 +28,6 @@ else()
    set(LLAMA_STANDALONE OFF)
 endif()

-option(LLAMA_USE_SYSTEM_GGML "Use system libggml" OFF)
-
 if (EMSCRIPTEN)
    set(BUILD_SHARED_LIBS_DEFAULT OFF)

@@ -49,13 +46,6 @@ if (WIN32)
    add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
 endif()

-if (MSVC)
-    add_compile_options("$<$<COMPILE_LANGUAGE:C>:/utf-8>")
-    add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/utf-8>")
-    add_compile_options("$<$<COMPILE_LANGUAGE:C>:/bigobj>")
-    add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/bigobj>")
-endif()
-
 #
 # option list
 #
@@ -72,26 +62,23 @@ option(LLAMA_SANITIZE_THREAD    "llama: enable thread sanitizer"    OFF)
 option(LLAMA_SANITIZE_ADDRESS   "llama: enable address sanitizer"   OFF)
 option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)

-# utils
-option(LLAMA_BUILD_COMMON "llama: build common utils library" ${LLAMA_STANDALONE})
-
 # extra artifacts
 option(LLAMA_BUILD_TESTS    "llama: build tests"          ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_TOOLS    "llama: build tools"          ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples"       ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_SERVER   "llama: build server example" ${LLAMA_STANDALONE})

 # 3rd party libs
-option(LLAMA_CURL       "llama: use libcurl to download model from an URL" ON)
-option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF)
+option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)

 # Required for relocatable CMake package
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
-include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake)

 # override ggml options
-set(GGML_ALL_WARNINGS   ${LLAMA_ALL_WARNINGS})
-set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})
+set(GGML_SANITIZE_THREAD    ${LLAMA_SANITIZE_THREAD})
+set(GGML_SANITIZE_ADDRESS   ${LLAMA_SANITIZE_ADDRESS})
+set(GGML_SANITIZE_UNDEFINED ${LLAMA_SANITIZE_UNDEFINED})
+set(GGML_ALL_WARNINGS       ${LLAMA_ALL_WARNINGS})
+set(GGML_FATAL_WARNINGS     ${LLAMA_FATAL_WARNINGS})

 # change the default for these ggml options
 if (NOT DEFINED GGML_LLAMAFILE)
@@ -121,77 +108,16 @@ llama_option_depr(WARNING     LLAMA_SYCL                GGML_SYCL)
 llama_option_depr(WARNING     LLAMA_SYCL_F16            GGML_SYCL_F16)
 llama_option_depr(WARNING     LLAMA_CANN                GGML_CANN)

-if (NOT MSVC)
-    if (LLAMA_SANITIZE_THREAD)
-        message(STATUS "Using -fsanitize=thread")
-
-        add_compile_options(-fsanitize=thread)
-        link_libraries     (-fsanitize=thread)
-    endif()
-
-    if (LLAMA_SANITIZE_ADDRESS)
-        message(STATUS "Using -fsanitize=address")
-
-        add_compile_options(-fsanitize=address -fno-omit-frame-pointer)
-        link_libraries     (-fsanitize=address)
-    endif()
-
-    if (LLAMA_SANITIZE_UNDEFINED)
-        message(STATUS "Using -fsanitize=undefined")
-
-        add_compile_options(-fsanitize=undefined)
-        link_libraries     (-fsanitize=undefined)
-    endif()
-endif()
-
-#
-# 3rd-party
-#
-
-if (LLAMA_USE_SYSTEM_GGML)
-    message(STATUS "Using system-provided libggml, skipping ggml build")
-    find_package(ggml REQUIRED)
-    add_library(ggml ALIAS ggml::ggml)
-endif()
-
-if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML)
-    add_subdirectory(ggml)
-    # ... otherwise assume ggml is added by a parent CMakeLists.txt
-endif()
-
 #
 # build the library
 #

+if (NOT TARGET ggml)
+    add_subdirectory(ggml)
+    # ... otherwise assume ggml is added by a parent CMakeLists.txt
+endif()
 add_subdirectory(src)

-#
-# utils, programs, examples and tests
-#
-
-if (NOT LLAMA_BUILD_COMMON)
-    message(STATUS "LLAMA_BUILD_COMMON is OFF, disabling LLAMA_CURL")
-    set(LLAMA_CURL OFF)
-endif()
-
-if (LLAMA_BUILD_COMMON)
-    add_subdirectory(common)
-endif()
-
-if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
-    include(CTest)
-    add_subdirectory(tests)
-endif()
-
-if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES)
-    add_subdirectory(examples)
-    add_subdirectory(pocs)
-endif()
-
-if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TOOLS)
-    add_subdirectory(tools)
-endif()
-
 #
 # install
 #
@@ -207,14 +133,25 @@ set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location o
 set(LLAMA_LIB_INSTALL_DIR     ${CMAKE_INSTALL_LIBDIR}     CACHE PATH "Location of library files")
 set(LLAMA_BIN_INSTALL_DIR     ${CMAKE_INSTALL_BINDIR}     CACHE PATH "Location of binary  files")

-set(LLAMA_PUBLIC_HEADERS
-    ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h
-    ${CMAKE_CURRENT_SOURCE_DIR}/include/llama-cpp.h)

-set_target_properties(llama
-    PROPERTIES
-        PUBLIC_HEADER "${LLAMA_PUBLIC_HEADERS}")
+# At the moment some compile definitions are placed within the ggml/src
+# directory but not exported on the `ggml` target. This could be improved by
+# determining _precisely_ which defines are necessary for the llama-config
+# package.
+#
+set(GGML_TRANSIENT_DEFINES)
+get_target_property(GGML_DIRECTORY ggml SOURCE_DIR)
+get_directory_property(GGML_DIR_DEFINES DIRECTORY ${GGML_DIRECTORY} COMPILE_DEFINITIONS)
+if (GGML_DIR_DEFINES)
+    list(APPEND GGML_TRANSIENT_DEFINES ${GGML_DIR_DEFINES})
+endif()
+get_target_property(GGML_TARGET_DEFINES ggml COMPILE_DEFINITIONS)
+if (GGML_TARGET_DEFINES)
+    list(APPEND GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES})
+endif()
+get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)

+set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h)
 install(TARGETS llama LIBRARY PUBLIC_HEADER)

 configure_package_config_file(
@@ -251,4 +188,20 @@ configure_file(cmake/llama.pc.in
        @ONLY)

 install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
-        DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
+        DESTINATION lib/pkgconfig)
+
+#
+# programs, examples and tests
+#
+
+add_subdirectory(common)
+
+if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
+    include(CTest)
+    add_subdirectory(tests)
+endif ()
+
+if (LLAMA_BUILD_EXAMPLES)
+    add_subdirectory(examples)
+    add_subdirectory(pocs)
+endif()
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -24,17 +24,18 @@
            "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
        }
    },
-    { "name": "debug",    "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
-    { "name": "release",  "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
-    { "name": "reldbg",   "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
-    { "name": "static",   "hidden": true, "cacheVariables": { "GGML_STATIC":      "ON" } },
-    { "name": "sycl_f16", "hidden": true, "cacheVariables": { "GGML_SYCL_F16":    "ON" } },
-    { "name": "vulkan",   "hidden": true, "cacheVariables": { "GGML_VULKAN":      "ON" } },
+    { "name": "debug",   "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
+    { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
+    { "name": "reldbg",  "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
+    { "name": "static",  "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } },
+    { "name": "sycl_f16",  "hidden": true, "cacheVariables": { "GGML_SYCL_F16": "ON" } },

    {
-        "name": "x64-windows-llvm", "hidden": true,
+        "name": "arm64-windows-msvc", "hidden": true,
+        "architecture": { "value": "arm64",    "strategy": "external" },
+        "toolset":      { "value": "host=x64", "strategy": "external" },
        "cacheVariables": {
-            "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/x64-windows-llvm.cmake"
+            "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake"
        }
    },

@@ -47,38 +48,21 @@
        }
    },

-    {
-        "name": "arm64-apple-clang", "hidden": true,
-        "architecture": { "value": "arm64",    "strategy": "external" },
-        "toolset":      { "value": "host=x64", "strategy": "external" },
-        "cacheVariables": {
-            "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-apple-clang.cmake"
-        }
-    },
+    { "name": "arm64-windows-llvm-debug"  , "inherits": [ "base", "arm64-windows-llvm",  "debug"   ] },
+    { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm",  "reldbg" ] },
+    { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm",  "reldbg", "static" ] },

-    { "name": "arm64-windows-llvm-debug", "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
-    { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] },
-    { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg", "static" ] },
+    { "name": "arm64-windows-msvc-debug"  , "inherits": [ "base", "arm64-windows-msvc",  "debug"   ] },
+    { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc",  "reldbg" ] },
+    { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc",  "reldbg", "static" ] },

-    { "name": "arm64-apple-clang-debug", "inherits": [ "base", "arm64-apple-clang", "debug" ] },
-    { "name": "arm64-apple-clang-release", "inherits": [ "base", "arm64-apple-clang", "reldbg" ] },
-    { "name": "arm64-apple-clang+static-release", "inherits": [ "base", "arm64-apple-clang",  "reldbg", "static" ] },
-
-    { "name": "x64-windows-llvm-debug", "inherits": [ "base", "x64-windows-llvm", "debug" ] },
-    { "name": "x64-windows-llvm-release", "inherits": [ "base", "x64-windows-llvm", "release" ] },
-    { "name": "x64-windows-llvm-reldbg", "inherits": [ "base", "x64-windows-llvm", "reldbg" ] },
-    { "name": "x64-windows-llvm+static-release", "inherits": [ "base", "x64-windows-llvm", "reldbg", "static" ] },
-
-    { "name": "x64-windows-msvc-debug", "inherits": [ "base", "debug" ] },
+    { "name": "x64-windows-msvc-debug"  , "inherits": [ "base", "debug"   ] },
    { "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] },
    { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },

-    { "name": "x64-windows-sycl-debug", "inherits": [ "sycl-base", "debug" ] },
+    { "name": "x64-windows-sycl-debug"  , "inherits": [ "sycl-base", "debug"   ] },
    { "name": "x64-windows-sycl-debug-f16", "inherits": [ "sycl-base", "debug", "sycl_f16" ] },
    { "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] },
-    { "name": "x64-windows-sycl-release-f16", "inherits": [ "sycl-base", "release", "sycl_f16" ] },
-
-    { "name": "x64-windows-vulkan-debug", "inherits": [ "base", "vulkan", "debug" ] },
-    { "name": "x64-windows-vulkan-release", "inherits": [ "base", "vulkan", "release" ] }
+    { "name": "x64-windows-sycl-release-f16", "inherits": [ "sycl-base", "release", "sycl_f16" ] }
  ]
 }
--- a/11
+++ b/11
@@ -1,11 +0,0 @@
-# collaborators can optionally add themselves here to indicate their availability for reviewing related PRs
-
-/ci/ @ggerganov
-/.devops/*.Dockerfile @ngxson
-/tools/server/ @ngxson
-/ggml/src/ggml-cuda/fattn* @JohannesGaessler
-/ggml/src/ggml-cuda/mmq.* @JohannesGaessler
-/ggml/src/ggml-cuda/mmv.* @JohannesGaessler
-/ggml/src/ggml-cuda/mmvq.* @JohannesGaessler
-/ggml/src/ggml-opt.cpp @JohannesGaessler
-/ggml/src/gguf.cpp @JohannesGaessler
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,127 +1,29 @@
 # Pull requests (for contributors)

- llama.cpp uses the ggml tensor library for model evaluation. If you are unfamiliar with ggml, consider taking a look at the [examples in the ggml repository](https://github.com/ggml-org/ggml/tree/master/examples/). [simple](https://github.com/ggml-org/ggml/tree/master/examples/simple) shows the bare minimum for using ggml. [gpt-2](https://github.com/ggml-org/ggml/tree/master/examples/gpt-2) has minimal implementations for language model inference using GPT-2. [mnist](https://github.com/ggml-org/ggml/tree/master/examples/mnist) demonstrates how to train and evaluate a simple image classifier
 - Test your changes:
-    - Execute [the full CI locally on your machine](ci/README.md) before publishing
-    - Verify that the perplexity and the performance are not affected negatively by your changes (use `llama-perplexity` and `llama-bench`)
-    - If you modified the `ggml` source, run the `test-backend-ops` tool to check whether different backend implementations of the `ggml` operators produce consistent results (this requires access to at least two different `ggml` backends)
-    - If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops`
- Create separate PRs for each feature or fix. Avoid combining unrelated changes in a single PR
- Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
+  - Using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the GGML library
+  - Execute [the full CI locally on your machine](ci/README.md) before publishing
+- Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs.
+  - The PR template has a series of review complexity checkboxes `[ ]` that [you can mark as](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) `[X]` for your convenience
+- Consider allowing write access to your branch for faster review
 - If your PR becomes stale, don't hesitate to ping the maintainers in the comments

 # Pull requests (for collaborators)

 - Squash-merge PRs
 - Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
- Optionally pick a `<module>` from here: https://github.com/ggml-org/llama.cpp/wiki/Modules
- Consider adding yourself to [CODEOWNERS](CODEOWNERS)
+- Optionally, pick a `<module>` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules

 # Coding guidelines

 - Avoid adding third-party dependencies, extra files, extra headers, etc.
 - Always consider cross-compatibility with other operating systems and architectures
- Avoid fancy-looking modern STL constructs, use basic `for` loops, avoid templates, keep it simple
- Vertical alignment makes things more readable and easier to batch edit
+- Avoid fancy-looking modern STL constructs, use basic `for` loops, avoid templates, keep it simple
+- There are no strict rules for the code style, but try to follow the patterns in the code (indentation, spaces, etc.). Vertical alignment makes things more readable and easier to batch edit
 - Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
- Use sized integer types such as `int32_t` in the public API, e.g. `size_t` may also be appropriate for allocation sizes or byte offsets
- Declare structs with `struct foo {}` instead of `typedef struct foo {} foo`
-    - In C++ code omit optional `struct` and `enum` keyword whenever they are not necessary
-    ```cpp
-    // OK
-    llama_context * ctx;
-    const llama_rope_type rope_type;
-
-    // not OK
-    struct llama_context * ctx;
-    const enum llama_rope_type rope_type;
-    ```
-
-    _(NOTE: this guideline is yet to be applied to the `llama.cpp` codebase. New code should follow this guideline.)_
-
- Try to follow the existing patterns in the code (indentation, spaces, etc.). In case of doubt use `clang-format` (from clang-tools v15+) to format the added code
- For anything not covered in the current guidelines, refer to the [C++ Core Guidelines](https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines)
+- Naming usually optimizes for common prefix (see https://github.com/ggerganov/ggml/pull/302#discussion_r1243240963)
 - Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggml-org/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$
+- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$

 ![matmul](media/matmul.png)

-# Naming guidelines
-
- Use `snake_case` for function, variable and type names
- Naming usually optimizes for longest common prefix (see https://github.com/ggml-org/ggml/pull/302#discussion_r1243240963)
-
-    ```cpp
-    // not OK
-    int small_number;
-    int big_number;
-
-    // OK
-    int number_small;
-    int number_big;
-    ```
-
- Enum values are always in upper case and prefixed with the enum name
-
-    ```cpp
-    enum llama_vocab_type {
-        LLAMA_VOCAB_TYPE_NONE = 0,
-        LLAMA_VOCAB_TYPE_SPM  = 1,
-        LLAMA_VOCAB_TYPE_BPE  = 2,
-        LLAMA_VOCAB_TYPE_WPM  = 3,
-        LLAMA_VOCAB_TYPE_UGM  = 4,
-        LLAMA_VOCAB_TYPE_RWKV = 5,
-    };
-    ```
-
- The general naming pattern is `<class>_<method>`, with `<method>` being `<action>_<noun>`
-
-    ```cpp
-    llama_model_init();           // class: "llama_model",         method: "init"
-    llama_sampler_chain_remove(); // class: "llama_sampler_chain", method: "remove"
-    llama_sampler_get_seed();     // class: "llama_sampler",       method: "get_seed"
-    llama_set_embeddings();       // class: "llama_context",       method: "set_embeddings"
-    llama_n_threads();            // class: "llama_context",       method: "n_threads"
-    llama_adapter_lora_free();    // class: "llama_adapter_lora",  method: "free"
-    ```
-
-    - The `get` `<action>` can be omitted
-    - The `<noun>` can be omitted if not necessary
-    - The `_context` suffix of the `<class>` is optional. Use it to disambiguate symbols when needed
-    - Use `init`/`free` for constructor/destructor `<action>`
-
- Use the `_t` suffix when a type is supposed to be opaque to the user - it's not relevant to them if it is a struct or anything else
-
-    ```cpp
-    typedef struct llama_context * llama_context_t;
-
-    enum llama_pooling_type llama_pooling_type(const llama_context_t ctx);
-    ```
-
-    _(NOTE: this guideline is yet to be applied to the `llama.cpp` codebase. New code should follow this guideline)_
-
- C/C++ filenames are all lowercase with dashes. Headers use the `.h` extension. Source files use the `.c` or `.cpp` extension
- Python filenames are all lowercase with underscores
-
- _(TODO: abbreviations usage)_
-
-# Preprocessor directives
-
- _(TODO: add guidelines with examples and apply them to the codebase)_
-
-    ```cpp
-    #ifdef FOO
-    #endif // FOO
-    ```
-
-# Documentation
-
- Documentation is a community effort
- When you need to look into the source code to figure out how to use an API consider adding a short summary to the header file for future reference
- When you notice incorrect or outdated documentation, please update it
-
-# Resources
-
-The Github issues, PRs and discussions contain a lot of information that can be useful to get familiar with the codebase. For convenience, some of the more important information is referenced from Github projects:
-
-https://github.com/ggml-org/llama.cpp/projects
--- a/775
+++ b/775
--- a/Package.swift
+++ b/Package.swift
@@ -0,0 +1,79 @@
+// swift-tools-version:5.5
+
+import PackageDescription
+
+var sources = [
+    "src/llama.cpp",
+    "src/llama-vocab.cpp",
+    "src/llama-grammar.cpp",
+    "src/llama-sampling.cpp",
+    "src/unicode.cpp",
+    "src/unicode-data.cpp",
+    "ggml/src/ggml.c",
+    "ggml/src/ggml-alloc.c",
+    "ggml/src/ggml-backend.c",
+    "ggml/src/ggml-quants.c",
+    "ggml/src/ggml-aarch64.c",
+]
+
+var resources: [Resource] = []
+var linkerSettings: [LinkerSetting] = []
+var cSettings: [CSetting] =  [
+    .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
+    .unsafeFlags(["-fno-objc-arc"]),
+    // NOTE: NEW_LAPACK will require iOS version 16.4+
+    // We should consider adding this in the future when we drop support for iOS 14
+    // (ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
+    // .define("ACCELERATE_NEW_LAPACK"),
+    // .define("ACCELERATE_LAPACK_ILP64")
+]
+
+#if canImport(Darwin)
+sources.append("ggml/src/ggml-metal.m")
+resources.append(.process("ggml/src/ggml-metal.metal"))
+linkerSettings.append(.linkedFramework("Accelerate"))
+cSettings.append(
+    contentsOf: [
+        .define("GGML_USE_ACCELERATE"),
+        .define("GGML_USE_METAL")
+    ]
+)
+#endif
+
+#if os(Linux)
+    cSettings.append(.define("_GNU_SOURCE"))
+#endif
+
+let package = Package(
+    name: "llama",
+    platforms: [
+        .macOS(.v12),
+        .iOS(.v14),
+        .watchOS(.v4),
+        .tvOS(.v14)
+    ],
+    products: [
+        .library(name: "llama", targets: ["llama"]),
+    ],
+    targets: [
+        .target(
+            name: "llama",
+            path: ".",
+            exclude: [
+               "cmake",
+               "examples",
+               "scripts",
+               "models",
+               "tests",
+               "CMakeLists.txt",
+               "Makefile"
+            ],
+            sources: sources,
+            resources: resources,
+            publicHeadersPath: "spm-headers",
+            cSettings: cSettings,
+            linkerSettings: linkerSettings
+        )
+    ],
+    cxxLanguageStandard: .cxx11
+)
--- a/README.md
+++ b/README.md
@@ -3,55 +3,45 @@
 ![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)

 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
-[![Server](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml)
+[![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
+[![Conan Center](https://shields.io/conan/v/llama-cpp)](https://conan.io/center/llama-cpp)

-[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggml-org/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml)
+[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)

 Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++

 ## Recent API changes

- [Changelog for `libllama` API](https://github.com/ggml-org/llama.cpp/issues/9289)
- [Changelog for `llama-server` REST API](https://github.com/ggml-org/llama.cpp/issues/9291)
+- [Changelog for `libllama` API](https://github.com/ggerganov/llama.cpp/issues/9289)
+- [Changelog for `llama-server` REST API](https://github.com/ggerganov/llama.cpp/issues/9291)

 ## Hot topics

- 🔥 Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
- **GGML developer experience survey (organized and reviewed by NVIDIA):** [link](https://forms.gle/Gasw3cRgyhNEnrwK9)
- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141](https://github.com/ggml-org/llama.cpp/pull/13141)), `libllava` will be deprecated
- VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
- Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639
- Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
- Introducing GGUF-my-LoRA https://github.com/ggml-org/llama.cpp/discussions/10123
- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggml-org/llama.cpp/discussions/9669
- Hugging Face GGUF editor: [discussion](https://github.com/ggml-org/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
+- Hugging Face GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)

 ----

 ## Description

 The main goal of `llama.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide
-range of hardware - locally and in the cloud.
+variety of hardware - locally and in the cloud.

 - Plain C/C++ implementation without any dependencies
 - Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
- AVX, AVX2, AVX512 and AMX support for x86 architectures
+- AVX, AVX2 and AVX512 support for x86 architectures
 - 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads MTT GPUs via MUSA)
+- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP)
 - Vulkan and SYCL backend support
 - CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity

-The `llama.cpp` project is the main playground for developing new features for the [ggml](https://github.com/ggml-org/ggml) library.
+Since its [inception](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022), the project has
+improved significantly thanks to many contributions. It is the main playground for developing new features for the
+[ggml](https://github.com/ggerganov/ggml) library.

-<details>
-<summary>Models</summary>
+**Supported models:**

 Typically finetunes of the base models below are supported as well.

-Instructions for adding support for new models: [HOWTO-add-model.md](docs/development/HOWTO-add-model.md)
-
-#### Text-only
-
 - [X] LLaMA 🦙
 - [x] LLaMA 2 🦙🦙
 - [x] LLaMA 3 🦙🦙🦙
@@ -61,23 +51,22 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon)
 - [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
 - [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
- [X] [BERT](https://github.com/ggml-org/llama.cpp/pull/5423)
+- [X] [BERT](https://github.com/ggerganov/llama.cpp/pull/5423)
 - [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
 - [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft)
 - [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila)
- [X] [Starcoder models](https://github.com/ggml-org/llama.cpp/pull/3187)
+- [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187)
 - [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim)
- [X] [MPT](https://github.com/ggml-org/llama.cpp/pull/3417)
- [X] [Bloom](https://github.com/ggml-org/llama.cpp/pull/3553)
+- [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417)
+- [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553)
 - [x] [Yi models](https://huggingface.co/models?search=01-ai/Yi)
 - [X] [StableLM models](https://huggingface.co/stabilityai)
 - [x] [Deepseek models](https://huggingface.co/models?search=deepseek-ai/deepseek)
 - [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen)
- [x] [PLaMo-13B](https://github.com/ggml-org/llama.cpp/pull/3557)
+- [x] [PLaMo-13B](https://github.com/ggerganov/llama.cpp/pull/3557)
 - [x] [Phi models](https://huggingface.co/models?search=microsoft/phi)
- [x] [PhiMoE](https://github.com/ggml-org/llama.cpp/pull/11003)
 - [x] [GPT-2](https://huggingface.co/gpt2)
- [x] [Orion 14B](https://github.com/ggml-org/llama.cpp/pull/5118)
+- [x] [Orion 14B](https://github.com/ggerganov/llama.cpp/pull/5118)
 - [x] [InternLM2](https://huggingface.co/models?search=internlm2)
 - [x] [CodeShell](https://github.com/WisdomShell/codeshell)
 - [x] [Gemma](https://ai.google.dev/gemma)
@@ -88,7 +77,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
 - [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
 - [x] [OLMo](https://allenai.org/olmo)
- [x] [OLMo 2](https://allenai.org/olmo)
 - [x] [OLMoE](https://huggingface.co/allenai/OLMoE-1B-7B-0924)
 - [x] [Granite models](https://huggingface.co/collections/ibm-granite/granite-code-models-6624c5cec322e4c148c8b330)
 - [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia)
@@ -98,20 +86,15 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [x] [Bitnet b1.58 models](https://huggingface.co/1bitLLM)
 - [x] [Flan T5](https://huggingface.co/models?search=flan-t5)
 - [x] [Open Elm models](https://huggingface.co/collections/apple/openelm-instruct-models-6619ad295d7ae9f868b759ca)
- [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b) + [GLMEdge-1.5b](https://huggingface.co/THUDM/glm-edge-1.5b-chat) + [GLMEdge-4b](https://huggingface.co/THUDM/glm-edge-4b-chat)
- [x] [GLM-4-0414](https://huggingface.co/collections/THUDM/glm-4-0414-67f3cbcb34dd9d252707cb2e)
+- [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b)
 - [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)
 - [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct)
 - [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a)
 - [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat)
- [x] [Bielik-11B-v2.3](https://huggingface.co/collections/speakleash/bielik-11b-v23-66ee813238d9b526a072408a)
- [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM)
- [x] [QRWKV-6](https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1)
- [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct)
- [X] [Trillion-7B-preview](https://huggingface.co/trillionlabs/Trillion-7B-preview)
- [x] [Ling models](https://huggingface.co/collections/inclusionAI/ling-67c51c85b34a7ea0aba94c32)

-#### Multimodal
+(instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))
+
+**Multimodal models:**

 - [x] [LLaVA 1.5 models](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e), [LLaVA 1.6 models](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2)
 - [x] [BakLLaVA](https://huggingface.co/models?search=SkunkworksAI/Bakllava)
@@ -122,402 +105,363 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
 - [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
 - [x] [Bunny](https://github.com/BAAI-DCAI/Bunny)
- [x] [GLM-EDGE](https://huggingface.co/models?search=glm-edge)
- [x] [Qwen2-VL](https://huggingface.co/collections/Qwen/qwen2-vl-66cee7455501d7126940800d)

-</details>
-
-<details>
-<summary>Bindings</summary>
+**Bindings:**

 - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
 - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
 - Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
 - JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp)
- JS/TS (Programmable Prompt Engine CLI): [offline-ai/cli](https://github.com/offline-ai/cli)
 - JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm)
 - Typescript/Wasm (nicer API, available on npm): [ngxson/wllama](https://github.com/ngxson/wllama)
 - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
 - Rust (more features): [edgenai/llama_cpp-rs](https://github.com/edgenai/llama_cpp-rs)
 - Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
 - Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs)
- Rust (automated build from crates.io): [ShelbyJenkins/llm_client](https://github.com/ShelbyJenkins/llm_client)
 - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
- C#/VB.NET (more features - community license): [LM-Kit.NET](https://docs.lm-kit.com/lm-kit-net/index.html)
 - Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
 - Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)
 - React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn)
 - Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)
 - Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
 - Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
- Flutter: [xuegao-tzx/Fllama](https://github.com/xuegao-tzx/Fllama)
- PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggml-org/llama.cpp/pull/6326)
+- PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326)
 - Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp)
- Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift)
- Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama)
- Delphi [Embarcadero/llama-cpp-delphi](https://github.com/Embarcadero/llama-cpp-delphi)

-</details>
+**UI:**

-<details>
-<summary>UIs</summary>
+Unless otherwise noted these projects are open-source with permissive licensing:
+
+- [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT)
+- [iohub/collama](https://github.com/iohub/coLLaMA)
+- [janhq/jan](https://github.com/janhq/jan) (AGPL)
+- [nat/openplayground](https://github.com/nat/openplayground)
+- [Faraday](https://faraday.dev/) (proprietary)
+- [LMStudio](https://lmstudio.ai/) (proprietary)
+- [Layla](https://play.google.com/store/apps/details?id=com.laylalite) (proprietary)
+- [ramalama](https://github.com/containers/ramalama) (MIT)
+- [LocalAI](https://github.com/mudler/LocalAI) (MIT)
+- [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
+- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile)
+- [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all)
+- [ollama/ollama](https://github.com/ollama/ollama)
+- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) (AGPL)
+- [psugihara/FreeChat](https://github.com/psugihara/FreeChat)
+- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
+- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal)
+- [pythops/tenere](https://github.com/pythops/tenere) (AGPL)
+- [RAGNA Desktop](https://ragna.app/) (proprietary)
+- [RecurseChat](https://recurse.chat/) (proprietary)
+- [semperai/amica](https://github.com/semperai/amica)
+- [withcatai/catai](https://github.com/withcatai/catai)
+- [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT)
+- [Msty](https://msty.app) (proprietary)
+- [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)
+- [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file) (Apache-2.0 or later)
+- [Dot](https://github.com/alexpinel/Dot) (GPL)
+- [MindMac](https://mindmac.app) (proprietary)
+- [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
+- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
+- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
+- [AIKit](https://github.com/sozercan/aikit) (MIT)
+- [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL)
+- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)

 *(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*

- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
- [Dot](https://github.com/alexpinel/Dot) (GPL)
- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
- [iohub/collama](https://github.com/iohub/coLLaMA) (Apache-2.0)
- [janhq/jan](https://github.com/janhq/jan) (AGPL)
- [johnbean393/Sidekick](https://github.com/johnbean393/Sidekick) (MIT)
- [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file) (Apache-2.0)
- [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
- [llama.vim](https://github.com/ggml-org/llama.vim) (MIT)
- [LARS](https://github.com/abgulati/LARS) (AGPL)
- [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL)
- [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)
- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)
- [LMStudio](https://lmstudio.ai/) (proprietary)
- [LocalAI](https://github.com/mudler/LocalAI) (MIT)
- [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
- [MindMac](https://mindmac.app) (proprietary)
- [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT)
- [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT)
- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile) (Apache-2.0)
- [nat/openplayground](https://github.com/nat/openplayground) (MIT)
- [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all) (MIT)
- [ollama/ollama](https://github.com/ollama/ollama) (MIT)
- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) (AGPL)
- [PocketPal AI](https://github.com/a-ghorbani/pocketpal-ai) (MIT)
- [psugihara/FreeChat](https://github.com/psugihara/FreeChat) (MIT)
- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal) (MIT)
- [pythops/tenere](https://github.com/pythops/tenere) (AGPL)
- [ramalama](https://github.com/containers/ramalama) (MIT)
- [semperai/amica](https://github.com/semperai/amica) (MIT)
- [withcatai/catai](https://github.com/withcatai/catai) (MIT)
- [Autopen](https://github.com/blackhole89/autopen) (GPL)
-
-</details>
-
-<details>
-<summary>Tools</summary>
+**Tools:**

 - [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML
- [akx/ollama-dl](https://github.com/akx/ollama-dl) – download models from the Ollama library to be used directly with llama.cpp
 - [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
 - [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage
- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with pre-built Mobile and Web platform wrappers and a model example)
+- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with pre-built Mobile and Web platform wrappers and a model example)

-</details>
-
-<details>
-<summary>Infrastructure</summary>
+**Infrastructure:**

 - [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
 - [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs
- [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly
- [llama-swap](https://github.com/mostlygeek/llama-swap) - transparent proxy that adds automatic model switching with llama-server
- [Kalavai](https://github.com/kalavai-net/kalavai-client) - Crowdsource end to end LLM deployment at any scale
- [llmaz](https://github.com/InftyAI/llmaz) - ☸️ Easy, advanced inference platform for large language models on Kubernetes.
+
+**Games:**
+- [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you.
+
+## Demo
+
+<details>
+<summary>Typical run using LLaMA v2 13B on M2 Ultra</summary>
+
+```
+$ make -j && ./llama-cli -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e
+I llama.cpp build info:
+I UNAME_S:  Darwin
+I UNAME_P:  arm
+I UNAME_M:  arm64
+I CFLAGS:   -I.            -O3 -std=c11   -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -pthread -DGGML_USE_K_QUANTS -DGGML_USE_ACCELERATE
+I CXXFLAGS: -I. -I./common -O3 -std=c++11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -DGGML_USE_K_QUANTS
+I LDFLAGS:   -framework Accelerate
+I CC:       Apple clang version 14.0.3 (clang-1403.0.22.14.1)
+I CXX:      Apple clang version 14.0.3 (clang-1403.0.22.14.1)
+
+make: Nothing to be done for `default'.
+main: build = 1041 (cf658ad)
+main: seed  = 1692823051
+llama_model_loader: loaded meta data with 16 key-value pairs and 363 tensors from models/llama-13b-v2/ggml-model-q4_0.gguf (version GGUF V1 (latest))
+llama_model_loader: - type  f32:   81 tensors
+llama_model_loader: - type q4_0:  281 tensors
+llama_model_loader: - type q6_K:    1 tensors
+llm_load_print_meta: format         = GGUF V1 (latest)
+llm_load_print_meta: arch           = llama
+llm_load_print_meta: vocab type     = SPM
+llm_load_print_meta: n_vocab        = 32000
+llm_load_print_meta: n_merges       = 0
+llm_load_print_meta: n_ctx_train    = 4096
+llm_load_print_meta: n_ctx          = 512
+llm_load_print_meta: n_embd         = 5120
+llm_load_print_meta: n_head         = 40
+llm_load_print_meta: n_head_kv      = 40
+llm_load_print_meta: n_layer        = 40
+llm_load_print_meta: n_rot          = 128
+llm_load_print_meta: n_gqa          = 1
+llm_load_print_meta: f_norm_eps     = 1.0e-05
+llm_load_print_meta: f_norm_rms_eps = 1.0e-05
+llm_load_print_meta: n_ff           = 13824
+llm_load_print_meta: freq_base      = 10000.0
+llm_load_print_meta: freq_scale     = 1
+llm_load_print_meta: model type     = 13B
+llm_load_print_meta: model ftype    = mostly Q4_0
+llm_load_print_meta: model size     = 13.02 B
+llm_load_print_meta: general.name   = LLaMA v2
+llm_load_print_meta: BOS token = 1 '<s>'
+llm_load_print_meta: EOS token = 2 '</s>'
+llm_load_print_meta: UNK token = 0 '<unk>'
+llm_load_print_meta: LF token  = 13 '<0x0A>'
+llm_load_tensors: ggml ctx size =    0.11 MB
+llm_load_tensors: mem required  = 7024.01 MB (+  400.00 MB per state)
+...................................................................................................
+llama_new_context_with_model: kv self size  =  400.00 MB
+llama_new_context_with_model: compute buffer total size =   75.41 MB
+
+system_info: n_threads = 16 / 24 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 |
+sampling: repeat_last_n = 64, repeat_penalty = 1.100000, presence_penalty = 0.000000, frequency_penalty = 0.000000, top_k = 40, tfs_z = 1.000000, top_p = 0.950000, typical_p = 1.000000, temp = 0.800000, mirostat = 0, mirostat_lr = 0.100000, mirostat_ent = 5.000000
+generate: n_ctx = 512, n_batch = 512, n_predict = 400, n_keep = 0
+
+
+ Building a website can be done in 10 simple steps:
+Step 1: Find the right website platform.
+Step 2: Choose your domain name and hosting plan.
+Step 3: Design your website layout.
+Step 4: Write your website content and add images.
+Step 5: Install security features to protect your site from hackers or spammers
+Step 6: Test your website on multiple browsers, mobile devices, operating systems etc…
+Step 7: Test it again with people who are not related to you personally – friends or family members will work just fine!
+Step 8: Start marketing and promoting the website via social media channels or paid ads
+Step 9: Analyze how many visitors have come to your site so far, what type of people visit more often than others (e.g., men vs women) etc…
+Step 10: Continue to improve upon all aspects mentioned above by following trends in web design and staying up-to-date on new technologies that can enhance user experience even further!
+How does a Website Work?
+A website works by having pages, which are made of HTML code. This code tells your computer how to display the content on each page you visit – whether it’s an image or text file (like PDFs). In order for someone else’s browser not only be able but also want those same results when accessing any given URL; some additional steps need taken by way of programming scripts that will add functionality such as making links clickable!
+The most common type is called static HTML pages because they remain unchanged over time unless modified manually (either through editing files directly or using an interface such as WordPress). They are usually served up via HTTP protocols – this means anyone can access them without having any special privileges like being part of a group who is allowed into restricted areas online; however, there may still exist some limitations depending upon where one lives geographically speaking.
+How to
+llama_print_timings:        load time =   576.45 ms
+llama_print_timings:      sample time =   283.10 ms /   400 runs   (    0.71 ms per token,  1412.91 tokens per second)
+llama_print_timings: prompt eval time =   599.83 ms /    19 tokens (   31.57 ms per token,    31.68 tokens per second)
+llama_print_timings:        eval time = 24513.59 ms /   399 runs   (   61.44 ms per token,    16.28 tokens per second)
+llama_print_timings:       total time = 25431.49 ms
+```
+
 </details>

 <details>
-<summary>Games</summary>
+<summary>Demo of running both LLaMA-7B and whisper.cpp on a single M1 Pro MacBook</summary>

- [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you.
+And here is another demo of running both LLaMA-7B and [whisper.cpp](https://github.com/ggerganov/whisper.cpp) on a single M1 Pro MacBook:
+
+https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8b4f-add84093ffff.mp4

 </details>

+## Usage
+
+Here are the end-to-end binary build and model conversion steps for most supported models.
+
+### Basic usage
+
+Firstly, you need to get the binary. There are different methods that you can follow:
+- Method 1: Clone this repository and build locally, see [how to build](./docs/build.md)
+- Method 2: If you are using macOS or Linux, you can install llama.cpp via [brew, flox or nix](./docs/install.md)
+- Method 3: Use a Docker image, see [documentation for Docker](./docs/docker.md)
+- Method 4: Download pre-built binary from [releases](https://github.com/ggerganov/llama.cpp/releases)
+
+You can run a basic completion using this command:
+
+```bash
+llama-cli -m your_model.gguf -p "I believe the meaning of life is" -n 128
+
+# Output:
+# I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga – it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey.
+```
+
+See [this page](./examples/main/README.md) for a full list of parameters.
+
+### Conversation mode
+
+If you want a more ChatGPT-like experience, you can run in conversation mode by passing `-cnv` as a parameter:
+
+```bash
+llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv
+
+# Output:
+# > hi, who are you?
+# Hi there! I'm your helpful assistant! I'm an AI-powered chatbot designed to assist and provide information to users like you. I'm here to help answer your questions, provide guidance, and offer support on a wide range of topics. I'm a friendly and knowledgeable AI, and I'm always happy to help with anything you need. What's on your mind, and how can I assist you today?
+#
+# > what is 1+1?
+# Easy peasy! The answer to 1+1 is... 2!
+```
+
+By default, the chat template will be taken from the input model. If you want to use another chat template, pass `--chat-template NAME` as a parameter. See the list of [supported templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template).
+
+```bash
+./llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --chat-template chatml
+```
+
+You can also use your own template via in-prefix, in-suffix and reverse-prompt parameters:
+
+```bash
+./llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --in-prefix 'User: ' --reverse-prompt 'User:'
+```
+
+### Web server
+
+[llama.cpp web server](./examples/server/README.md) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients.
+
+Example usage:
+
+```bash
+./llama-server -m your_model.gguf --port 8080
+
+# Basic web UI can be accessed via browser: http://localhost:8080
+# Chat completion endpoint: http://localhost:8080/v1/chat/completions
+```
+
+### Interactive mode
+
+> [!NOTE]
+> If you prefer basic usage, please consider using conversation mode instead of interactive mode
+
+In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt that makes LLaMA emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`.
+
+Here is an example of a few-shot interaction, invoked with the command:
+
+```bash
+# default arguments using a 7B model
+./examples/chat.sh
+
+# advanced chat with a 13B model
+./examples/chat-13B.sh
+
+# custom arguments using a 13B model
+./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
+```
+
+Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `llama-cli` example program.
+
+![image](https://user-images.githubusercontent.com/1991296/224575029-2af3c7dc-5a65-4f64-a6bb-517a532aea38.png)
+
+### Persistent Interaction
+
+The prompt, user inputs, and model generations can be saved and resumed across calls to `./llama-cli` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file.
+
+```bash
+# Start a new chat
+PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/default ./examples/chat-persistent.sh
+
+# Resume that chat
+PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/default ./examples/chat-persistent.sh
+
+# Start a different chat with the same prompt/model
+PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/another ./examples/chat-persistent.sh
+
+# Different prompt cache for different prompt/model
+PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \
+    CHAT_SAVE_DIR=./chat/bob ./examples/chat-persistent.sh
+```
+
+### Constrained output with grammars
+
+`llama.cpp` supports grammars to constrain model output. For example, you can force the model to output JSON only:
+
+```bash
+./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
+```
+
+The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).
+
+For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community; please file any issues or feature requests on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one.
+
+## Build
+
+Please refer to [Build llama.cpp locally](./docs/build.md)
+
 ## Supported backends

 | Backend | Target devices |
 | --- | --- |
-| [Metal](docs/build.md#metal-build) | Apple Silicon |
-| [BLAS](docs/build.md#blas-build) | All |
-| [BLIS](docs/backend/BLIS.md) | All |
-| [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU |
-| [MUSA](docs/build.md#musa) | Moore Threads MTT GPU |
-| [CUDA](docs/build.md#cuda) | Nvidia GPU |
-| [HIP](docs/build.md#hip) | AMD GPU |
-| [Vulkan](docs/build.md#vulkan) | GPU |
-| [CANN](docs/build.md#cann) | Ascend NPU |
-| [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
-| [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |
+| [Metal](./docs/build.md#metal-build) | Apple Silicon |
+| [BLAS](./docs/build.md#blas-build) | All |
+| [BLIS](./docs/backend/BLIS.md) | All |
+| [SYCL](./docs/backend/SYCL.md) | Intel and Nvidia GPU |
+| [MUSA](./docs/build.md#musa) | Moore Threads GPU |
+| [CUDA](./docs/build.md#cuda) | Nvidia GPU |
+| [hipBLAS](./docs/build.md#hipblas) | AMD GPU |
+| [Vulkan](./docs/build.md#vulkan) | GPU |
+| [CANN](./docs/build.md#cann) | Ascend NPU |

-## Building the project
+## Tools

-The main product of this project is the `llama` library. Its C-style interface can be found in [include/llama.h](include/llama.h).
-The project also includes many example programs and tools using the `llama` library. The examples range from simple, minimal code snippets to sophisticated sub-projects such as an OpenAI-compatible HTTP server. Possible methods for obtaining the binaries:
+### Prepare and Quantize

- Clone this repository and build locally, see [how to build](docs/build.md)
- On MacOS or Linux, install `llama.cpp` via [brew, flox or nix](docs/install.md)
- Use a Docker image, see [documentation for Docker](docs/docker.md)
- Download pre-built binaries from [releases](https://github.com/ggml-org/llama.cpp/releases)
+> [!NOTE]
+> You can use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to quantise your model weights without any setup too. It is synced from `llama.cpp` main every 6 hours.

-## Obtaining and quantizing models
+To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.

-The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](https://huggingface.co/models?library=gguf&sort=trending) compatible with `llama.cpp`:
+Note: `convert.py` has been moved to `examples/convert_legacy_llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derivatives.
+It does not support LLaMA 3; instead, use `convert_hf_to_gguf.py` with LLaMA 3 downloaded from Hugging Face.

- [Trending](https://huggingface.co/models?library=gguf&sort=trending)
- [LLaMA](https://huggingface.co/models?sort=trending&search=llama+gguf)
+To learn more about quantizing models, [read this documentation](./examples/quantize/README.md).

-You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from [Hugging Face](https://huggingface.co/) or other model hosting sites, such as [ModelScope](https://modelscope.cn/), by using this CLI argument: `-hf <user>/<model>[:quant]`.
+### Perplexity (measuring model quality)

-By default, the CLI would download from Hugging Face, you can switch to other options with the environment variable `MODEL_ENDPOINT`. For example, you may opt to downloading model checkpoints from ModelScope or other model sharing communities by setting the environment variable, e.g. `MODEL_ENDPOINT=https://www.modelscope.cn/`.
-
-After downloading a model, use the CLI tools to run it locally - see below.
-
-`llama.cpp` requires the model to be stored in the [GGUF](https://github.com/ggml-org/ggml/blob/master/docs/gguf.md) file format. Models in other data formats can be converted to GGUF using the `convert_*.py` Python scripts in this repo.
-
-The Hugging Face platform provides a variety of online tools for converting, quantizing and hosting models with `llama.cpp`:
-
- Use the [GGUF-my-repo space](https://huggingface.co/spaces/ggml-org/gguf-my-repo) to convert to GGUF format and quantize model weights to smaller sizes
- Use the [GGUF-my-LoRA space](https://huggingface.co/spaces/ggml-org/gguf-my-lora) to convert LoRA adapters to GGUF format (more info: https://github.com/ggml-org/llama.cpp/discussions/10123)
- Use the [GGUF-editor space](https://huggingface.co/spaces/CISCai/gguf-editor) to edit GGUF meta data in the browser (more info: https://github.com/ggml-org/llama.cpp/discussions/9268)
- Use the [Inference Endpoints](https://ui.endpoints.huggingface.co/) to directly host `llama.cpp` in the cloud (more info: https://github.com/ggml-org/llama.cpp/discussions/9669)
-
-To learn more about model quantization, [read this documentation](tools/quantize/README.md)
-
-## [`llama-cli`](tools/main)
-
-#### A CLI tool for accessing and experimenting with most of `llama.cpp`'s functionality.
-
- <details open>
-    <summary>Run in conversation mode</summary>
-
-    Models with a built-in chat template will automatically activate conversation mode. If this doesn't occur, you can manually enable it by adding `-cnv` and specifying a suitable chat template with `--chat-template NAME`
-
-    ```bash
-    llama-cli -m model.gguf
-
-    # > hi, who are you?
-    # Hi there! I'm your helpful assistant! I'm an AI-powered chatbot designed to assist and provide information to users like you. I'm here to help answer your questions, provide guidance, and offer support on a wide range of topics. I'm a friendly and knowledgeable AI, and I'm always happy to help with anything you need. What's on your mind, and how can I assist you today?
-    #
-    # > what is 1+1?
-    # Easy peasy! The answer to 1+1 is... 2!
-    ```
-
-    </details>
-
- <details>
-    <summary>Run in conversation mode with custom chat template</summary>
-
-    ```bash
-    # use the "chatml" template (use -h to see the list of supported templates)
-    llama-cli -m model.gguf -cnv --chat-template chatml
-
-    # use a custom template
-    llama-cli -m model.gguf -cnv --in-prefix 'User: ' --reverse-prompt 'User:'
-    ```
-
-    </details>
-
- <details>
-    <summary>Run simple text completion</summary>
-
-    To disable conversation mode explicitly, use `-no-cnv`
-
-    ```bash
-    llama-cli -m model.gguf -p "I believe the meaning of life is" -n 128 -no-cnv
-
-    # I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga – it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey.
-    ```
-
-    </details>
-
- <details>
-    <summary>Constrain the output with a custom grammar</summary>
-
-    ```bash
-    llama-cli -m model.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
-
-    # {"appointmentTime": "8pm", "appointmentDetails": "schedule a a call"}
-    ```
-
-    The [grammars/](grammars/) folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](grammars/README.md).
-
-    For authoring more complex JSON grammars, check out https://grammar.intrinsiclabs.ai/
-
-    </details>
-
-
-## [`llama-server`](tools/server)
-
-#### A lightweight, [OpenAI API](https://github.com/openai/openai-openapi) compatible, HTTP server for serving LLMs.
-
- <details open>
-    <summary>Start a local HTTP server with default configuration on port 8080</summary>
-
-    ```bash
-    llama-server -m model.gguf --port 8080
-
-    # Basic web UI can be accessed via browser: http://localhost:8080
-    # Chat completion endpoint: http://localhost:8080/v1/chat/completions
-    ```
-
-    </details>
-
- <details>
-    <summary>Support multiple-users and parallel decoding</summary>
-
-    ```bash
-    # up to 4 concurrent requests, each with 4096 max context
-    llama-server -m model.gguf -c 16384 -np 4
-    ```
-
-    </details>
-
- <details>
-    <summary>Enable speculative decoding</summary>
-
-    ```bash
-    # the draft.gguf model should be a small variant of the target model.gguf
-    llama-server -m model.gguf -md draft.gguf
-    ```
-
-    </details>
-
- <details>
-    <summary>Serve an embedding model</summary>
-
-    ```bash
-    # use the /embedding endpoint
-    llama-server -m model.gguf --embedding --pooling cls -ub 8192
-    ```
-
-    </details>
-
- <details>
-    <summary>Serve a reranking model</summary>
-
-    ```bash
-    # use the /reranking endpoint
-    llama-server -m model.gguf --reranking
-    ```
-
-    </details>
-
- <details>
-    <summary>Constrain all outputs with a grammar</summary>
-
-    ```bash
-    # custom grammar
-    llama-server -m model.gguf --grammar-file grammar.gbnf
-
-    # JSON
-    llama-server -m model.gguf --grammar-file grammars/json.gbnf
-    ```
-
-    </details>
-
-
-## [`llama-perplexity`](tools/perplexity)
-
-#### A tool for measuring the perplexity [^1][^2] (and other quality metrics) of a model over a given text.
-
- <details open>
-    <summary>Measure the perplexity over a text file</summary>
-
-    ```bash
-    llama-perplexity -m model.gguf -f file.txt
-
-    # [1]15.2701,[2]5.4007,[3]5.3073,[4]6.2965,[5]5.8940,[6]5.6096,[7]5.7942,[8]4.9297, ...
-    # Final estimate: PPL = 5.4007 +/- 0.67339
-    ```
-
-    </details>
-
- <details>
-    <summary>Measure KL divergence</summary>
-
-    ```bash
-    # TODO
-    ```
-
-    </details>
-
-[^1]: [tools/perplexity/README.md](./tools/perplexity/README.md)
-[^2]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity)
-
-## [`llama-bench`](tools/llama-bench)
-
-#### Benchmark the performance of the inference for various parameters.
-
- <details open>
-    <summary>Run default benchmark</summary>
-
-    ```bash
-    llama-bench -m model.gguf
-
-    # Output:
-    # | model               |       size |     params | backend    | threads |          test |                  t/s |
-    # | ------------------- | ---------: | ---------: | ---------- | ------: | ------------: | -------------------: |
-    # | qwen2 1.5B Q4_0     | 885.97 MiB |     1.54 B | Metal,BLAS |      16 |         pp512 |      5765.41 ± 20.55 |
-    # | qwen2 1.5B Q4_0     | 885.97 MiB |     1.54 B | Metal,BLAS |      16 |         tg128 |        197.71 ± 0.81 |
-    #
-    # build: 3e0ba0e60 (4229)
-    ```
-
-    </details>
-
-## [`llama-run`](tools/run)
-
-#### A comprehensive example for running `llama.cpp` models. Useful for inferencing. Used with RamaLama [^3].
-
- <details>
-    <summary>Run a model with a specific prompt (by default it's pulled from Ollama registry)</summary>
-
-    ```bash
-    llama-run granite-code
-    ```
-
-    </details>
-
-[^3]: [RamaLama](https://github.com/containers/ramalama)
-
-## [`llama-simple`](examples/simple)
-
-#### A minimal example for implementing apps with `llama.cpp`. Useful for developers.
-
- <details>
-    <summary>Basic text completion</summary>
-
-    ```bash
-    llama-simple -m model.gguf
-
-    # Hello my name is Kaitlyn and I am a 16 year old girl. I am a junior in high school and I am currently taking a class called "The Art of
-    ```
-
-    </details>
+You can use the `perplexity` example to measure perplexity over a given prompt (lower perplexity is better).
+For more information, see [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity).

+To learn more about how to measure perplexity using llama.cpp, [read this documentation](./examples/perplexity/README.md).

 ## Contributing

 - Contributors can open PRs
 - Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch
 - Collaborators will be invited based on contributions
- Any help with managing issues, PRs and projects is very appreciated!
- See [good first issues](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
+- Any help with managing issues and PRs is very appreciated!
+- See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
 - Read the [CONTRIBUTING.md](CONTRIBUTING.md) for more information
- Make sure to read this: [Inference at the edge](https://github.com/ggml-org/llama.cpp/discussions/205)
+- Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205)
 - A bit of backstory for those who are interested: [Changelog podcast](https://changelog.com/podcast/532)

-## Other documentation
+## Other documentation

- [main (cli)](tools/main/README.md)
- [server](tools/server/README.md)
- [GBNF grammars](grammars/README.md)
+- [main (cli)](./examples/main/README.md)
+- [server](./examples/server/README.md)
+- [jeopardy](./examples/jeopardy/README.md)
+- [GBNF grammars](./grammars/README.md)

-#### Development documentation
+**Development documentation**

- [How to build](docs/build.md)
- [Running on Docker](docs/docker.md)
- [Build on Android](docs/android.md)
- [Performance troubleshooting](docs/development/token_generation_performance_tips.md)
- [GGML tips & tricks](https://github.com/ggml-org/llama.cpp/wiki/GGML-Tips-&-Tricks)
+- [How to build](./docs/build.md)
+- [Running on Docker](./docs/docker.md)
+- [Build on Android](./docs/android.md)
+- [Performance troubleshooting](./docs/development/token_generation_performance_tips.md)
+- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)

-#### Seminal papers and background on the models
+**Seminal papers and background on the models**

 If your issue is with model generation quality, then please at least scan the following links and papers to understand the limitations of LLaMA models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT:
 - LLaMA:
@@ -528,48 +472,3 @@ If your issue is with model generation quality, then please at least scan the fo
 - GPT-3.5 / InstructGPT / ChatGPT:
    - [Aligning language models to follow instructions](https://openai.com/research/instruction-following)
    - [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155)
-
-## XCFramework
-The XCFramework is a precompiled version of the library for iOS, visionOS, tvOS,
-and macOS. It can be used in Swift projects without the need to compile the
-library from source. For example:
-```swift
-// swift-tools-version: 5.10
-// The swift-tools-version declares the minimum version of Swift required to build this package.
-
-import PackageDescription
-
-let package = Package(
-    name: "MyLlamaPackage",
-    targets: [
-        .executableTarget(
-            name: "MyLlamaPackage",
-            dependencies: [
-                "LlamaFramework"
-            ]),
-        .binaryTarget(
-            name: "LlamaFramework",
-            url: "https://github.com/ggml-org/llama.cpp/releases/download/b5046/llama-b5046-xcframework.zip",
-            checksum: "c19be78b5f00d8d29a25da41042cb7afa094cbf6280a225abe614b03b20029ab"
-        )
-    ]
-)
-```
-The above example is using an intermediate build `b5046` of the library. This can be modified
-to use a different version by changing the URL and checksum.
-
-## Completions
-Command-line completion is available for some environments.
-
-#### Bash Completion
-```bash
-$ build/bin/llama-cli --completion-bash > ~/.llama-completion.bash
-$ source ~/.llama-completion.bash
-```
-Optionally this can be added to your `.bashrc` or `.bash_profile` to load it
-automatically. For example:
-```console
-$ echo "source ~/.llama-completion.bash" >> ~/.bashrc
-```
-
-## References
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -40,8 +40,7 @@ To protect sensitive data from potential leaks or unauthorized access, it is cru
 ### Untrusted environments or networks

 If you can't run your models in a secure and isolated environment or if it must be exposed to an untrusted network, make sure to take the following security precautions:
-* Do not use the RPC backend, [rpc-server](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) and [llama-server](https://github.com/ggml-org/llama.cpp/tree/master/tools/server) functionality (see https://github.com/ggml-org/llama.cpp/pull/13061).
-* Confirm the hash of any downloaded artifact (e.g. pre-trained model weights) matches a known-good value.
+* Confirm the hash of any downloaded artifact (e.g. pre-trained model weights) matches a known-good value.
 * Encrypt your data if sending it over the network.

 ### Multi-Tenant environments
@@ -63,6 +62,6 @@ Beware that none of the topics under [Using llama.cpp securely](#using-llamacpp-
 <!-- normal version -->
 However, If you have discovered a security vulnerability in this project, please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released.

-Please disclose it as a private [security advisory](https://github.com/ggml-org/llama.cpp/security/advisories/new).
+Please disclose it as a private [security advisory](https://github.com/ggerganov/llama.cpp/security/advisories/new).

 A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure.
--- a/build-xcframework.sh
+++ b/build-xcframework.sh
@@ -1,541 +0,0 @@
-#!/bin/bash
-#
-# Options
-IOS_MIN_OS_VERSION=16.4
-MACOS_MIN_OS_VERSION=13.3
-VISIONOS_MIN_OS_VERSION=1.0
-TVOS_MIN_OS_VERSION=16.4
-
-BUILD_SHARED_LIBS=OFF
-LLAMA_BUILD_EXAMPLES=OFF
-LLAMA_BUILD_TOOLS=OFF
-LLAMA_BUILD_TESTS=OFF
-LLAMA_BUILD_SERVER=OFF
-GGML_METAL=ON
-GGML_METAL_EMBED_LIBRARY=ON
-GGML_BLAS_DEFAULT=ON
-GGML_METAL_USE_BF16=ON
-GGML_OPENMP=OFF
-
-COMMON_C_FLAGS="-Wno-macro-redefined -Wno-shorten-64-to-32 -Wno-unused-command-line-argument -g"
-COMMON_CXX_FLAGS="-Wno-macro-redefined -Wno-shorten-64-to-32 -Wno-unused-command-line-argument -g"
-
-# Common options for all builds
-COMMON_CMAKE_ARGS=(
-    -DCMAKE_XCODE_ATTRIBUTE_CODE_SIGNING_REQUIRED=NO
-    -DCMAKE_XCODE_ATTRIBUTE_CODE_SIGN_IDENTITY=""
-    -DCMAKE_XCODE_ATTRIBUTE_CODE_SIGNING_ALLOWED=NO
-    -DCMAKE_XCODE_ATTRIBUTE_DEBUG_INFORMATION_FORMAT="dwarf-with-dsym"
-    -DCMAKE_XCODE_ATTRIBUTE_GCC_GENERATE_DEBUGGING_SYMBOLS=YES
-    -DCMAKE_XCODE_ATTRIBUTE_COPY_PHASE_STRIP=NO
-    -DCMAKE_XCODE_ATTRIBUTE_STRIP_INSTALLED_PRODUCT=NO
-    -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
-    -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS}
-    -DLLAMA_BUILD_EXAMPLES=${LLAMA_BUILD_EXAMPLES}
-    -DLLAMA_BUILD_TOOLS=${LLAMA_BUILD_TOOLS}
-    -DLLAMA_BUILD_TESTS=${LLAMA_BUILD_TESTS}
-    -DLLAMA_BUILD_SERVER=${LLAMA_BUILD_SERVER}
-    -DGGML_METAL_EMBED_LIBRARY=${GGML_METAL_EMBED_LIBRARY}
-    -DGGML_BLAS_DEFAULT=${GGML_BLAS_DEFAULT}
-    -DGGML_METAL=${GGML_METAL}
-    -DGGML_METAL_USE_BF16=${GGML_METAL_USE_BF16}
-    -DGGML_NATIVE=OFF
-    -DGGML_OPENMP=${GGML_OPENMP}
-)
-
-XCODE_VERSION=$(xcodebuild -version 2>/dev/null | head -n1 | awk '{ print $2 }')
-MAJOR_VERSION=$(echo $XCODE_VERSION | cut -d. -f1)
-MINOR_VERSION=$(echo $XCODE_VERSION | cut -d. -f2)
-echo "Detected Xcode version: $XCODE_VERSION"
-
-check_required_tool() {
-    local tool=$1
-    local install_message=$2
-
-    if ! command -v $tool &> /dev/null; then
-        echo "Error: $tool is required but not found."
-        echo "$install_message"
-        exit 1
-    fi
-}
-echo "Checking for required tools..."
-check_required_tool "cmake" "Please install CMake 3.28.0 or later (brew install cmake)"
-check_required_tool "xcodebuild" "Please install Xcode and Xcode Command Line Tools (xcode-select --install)"
-check_required_tool "libtool" "Please install libtool which should be available with Xcode Command Line Tools (CLT). Make sure Xcode CLT is installed (xcode-select --install)"
-check_required_tool "dsymutil" "Please install Xcode and Xcode Command Line Tools (xcode-select --install)"
-
-set -e
-
-## Clean up previous builds
-rm -rf build-apple
-rm -rf build-ios-sim
-rm -rf build-ios-device
-rm -rf build-macos
-rm -rf build-visionos
-rm -rf build-visionos-sim
-rm -rf build-tvos-sim
-rm -rf build-tvos-device
-
-# Setup the xcframework build directory structure
-setup_framework_structure() {
-    local build_dir=$1
-    local min_os_version=$2
-    local platform=$3  # "ios", "macos", "visionos", or "tvos"
-    local framework_name="llama"
-
-    echo "Creating ${platform}-style framework structure for ${build_dir}"
-
-    if [[ "$platform" == "macos" ]]; then
-        # macOS versioned structure uses versioned directories
-        mkdir -p ${build_dir}/framework/${framework_name}.framework/Versions/A/Headers
-        mkdir -p ${build_dir}/framework/${framework_name}.framework/Versions/A/Modules
-        mkdir -p ${build_dir}/framework/${framework_name}.framework/Versions/A/Resources
-
-        # Create symbolic links
-        ln -sf A ${build_dir}/framework/${framework_name}.framework/Versions/Current
-        ln -sf Versions/Current/Headers ${build_dir}/framework/${framework_name}.framework/Headers
-        ln -sf Versions/Current/Modules ${build_dir}/framework/${framework_name}.framework/Modules
-        ln -sf Versions/Current/Resources ${build_dir}/framework/${framework_name}.framework/Resources
-        ln -sf Versions/Current/${framework_name} ${build_dir}/framework/${framework_name}.framework/${framework_name}
-
-        # Set header and module paths
-        local header_path=${build_dir}/framework/${framework_name}.framework/Versions/A/Headers/
-        local module_path=${build_dir}/framework/${framework_name}.framework/Versions/A/Modules/
-    else
-        # iOS/VisionOS/tvOS use a flat structure
-        mkdir -p ${build_dir}/framework/${framework_name}.framework/Headers
-        mkdir -p ${build_dir}/framework/${framework_name}.framework/Modules
-
-        # Remove any existing structure to ensure clean build
-        rm -rf ${build_dir}/framework/${framework_name}.framework/Versions
-
-        # Set header and module paths
-        local header_path=${build_dir}/framework/${framework_name}.framework/Headers/
-        local module_path=${build_dir}/framework/${framework_name}.framework/Modules/
-    fi
-
-    # Copy all required headers (common for all platforms)
-    cp include/llama.h             ${header_path}
-    cp ggml/include/ggml.h         ${header_path}
-    cp ggml/include/ggml-opt.h     ${header_path}
-    cp ggml/include/ggml-alloc.h   ${header_path}
-    cp ggml/include/ggml-backend.h ${header_path}
-    cp ggml/include/ggml-metal.h   ${header_path}
-    cp ggml/include/ggml-cpu.h     ${header_path}
-    cp ggml/include/ggml-blas.h    ${header_path}
-    cp ggml/include/gguf.h         ${header_path}
-
-    # Create module map (common for all platforms)
-    cat > ${module_path}module.modulemap << EOF
-framework module llama {
-    header "llama.h"
-    header "ggml.h"
-    header "ggml-alloc.h"
-    header "ggml-backend.h"
-    header "ggml-metal.h"
-    header "ggml-cpu.h"
-    header "ggml-blas.h"
-    header "gguf.h"
-
-    link "c++"
-    link framework "Accelerate"
-    link framework "Metal"
-    link framework "Foundation"
-
-    export *
-}
-EOF
-
-    # Platform-specific settings for Info.plist
-    local platform_name=""
-    local sdk_name=""
-    local supported_platform=""
-
-    case "$platform" in
-        "ios")
-            platform_name="iphoneos"
-            sdk_name="iphoneos${min_os_version}"
-            supported_platform="iPhoneOS"
-            local plist_path="${build_dir}/framework/${framework_name}.framework/Info.plist"
-            local device_family='    <key>UIDeviceFamily</key>
-    <array>
-        <integer>1</integer>
-        <integer>2</integer>
-    </array>'
-            ;;
-        "macos")
-            platform_name="macosx"
-            sdk_name="macosx${min_os_version}"
-            supported_platform="MacOSX"
-            local plist_path="${build_dir}/framework/${framework_name}.framework/Versions/A/Resources/Info.plist"
-            local device_family=""
-            ;;
-        "visionos")
-            platform_name="xros"
-            sdk_name="xros${min_os_version}"
-            supported_platform="XRPlatform"
-            local plist_path="${build_dir}/framework/${framework_name}.framework/Info.plist"
-            local device_family=""
-            ;;
-        "tvos")
-            platform_name="appletvos"
-            sdk_name="appletvos${min_os_version}"
-            supported_platform="AppleTVOS"
-            local plist_path="${build_dir}/framework/${framework_name}.framework/Info.plist"
-            local device_family='    <key>UIDeviceFamily</key>
-    <array>
-        <integer>3</integer>
-    </array>'
-            ;;
-    esac
-
-    # Create Info.plist
-    cat > ${plist_path} << EOF
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
-<plist version="1.0">
-<dict>
-    <key>CFBundleDevelopmentRegion</key>
-    <string>en</string>
-    <key>CFBundleExecutable</key>
-    <string>llama</string>
-    <key>CFBundleIdentifier</key>
-    <string>org.ggml.llama</string>
-    <key>CFBundleInfoDictionaryVersion</key>
-    <string>6.0</string>
-    <key>CFBundleName</key>
-    <string>llama</string>
-    <key>CFBundlePackageType</key>
-    <string>FMWK</string>
-    <key>CFBundleShortVersionString</key>
-    <string>1.0</string>
-    <key>CFBundleVersion</key>
-    <string>1</string>
-    <key>MinimumOSVersion</key>
-    <string>${min_os_version}</string>
-    <key>CFBundleSupportedPlatforms</key>
-    <array>
-        <string>${supported_platform}</string>
-    </array>${device_family}
-    <key>DTPlatformName</key>
-    <string>${platform_name}</string>
-    <key>DTSDKName</key>
-    <string>${sdk_name}</string>
-</dict>
-</plist>
-EOF
-}
-
-# Create dynamic libraries from static libraries.
-combine_static_libraries() {
-    local build_dir="$1"
-    local release_dir="$2"
-    local platform="$3"  # "ios", "macos", "visionos", or "tvos"
-    local is_simulator="$4"
-    local base_dir="$(pwd)"
-    local framework_name="llama"
-
-    # Determine output path based on platform
-    local output_lib=""
-    if [[ "$platform" == "macos" ]]; then
-        # macOS uses versioned structure
-        output_lib="${build_dir}/framework/${framework_name}.framework/Versions/A/${framework_name}"
-    else
-        # iOS, visionOS, and tvOS use a directory flat structure
-        output_lib="${build_dir}/framework/${framework_name}.framework/${framework_name}"
-    fi
-
-    local libs=(
-        "${base_dir}/${build_dir}/src/${release_dir}/libllama.a"
-        "${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml.a"
-        "${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml-base.a"
-        "${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml-cpu.a"
-        "${base_dir}/${build_dir}/ggml/src/ggml-metal/${release_dir}/libggml-metal.a"
-        "${base_dir}/${build_dir}/ggml/src/ggml-blas/${release_dir}/libggml-blas.a"
-    )
-
-    # Create temporary directory for processing
-    local temp_dir="${base_dir}/${build_dir}/temp"
-    mkdir -p "${temp_dir}"
-
-    # Since we have multiple architectures libtool will find object files that do not
-    # match the target architecture. We suppress these warnings.
-    libtool -static -o "${temp_dir}/combined.a" "${libs[@]}" 2> /dev/null
-
-    # Determine SDK, architectures, and install_name based on platform and simulator flag.
-    local sdk=""
-    local archs=""
-    local min_version_flag=""
-    local install_name=""
-
-    case "$platform" in
-        "ios")
-            if [[ "$is_simulator" == "true" ]]; then
-                sdk="iphonesimulator"
-                archs="arm64 x86_64"
-                min_version_flag="-mios-simulator-version-min=${IOS_MIN_OS_VERSION}"
-            else
-                sdk="iphoneos"
-                archs="arm64"
-                min_version_flag="-mios-version-min=${IOS_MIN_OS_VERSION}"
-            fi
-            install_name="@rpath/llama.framework/llama"
-            ;;
-        "macos")
-            sdk="macosx"
-            archs="arm64 x86_64"
-            min_version_flag="-mmacosx-version-min=${MACOS_MIN_OS_VERSION}"
-            install_name="@rpath/llama.framework/Versions/Current/llama"
-            ;;
-        "visionos")
-            if [[ "$is_simulator" == "true" ]]; then
-                sdk="xrsimulator"
-                archs="arm64 x86_64"
-                min_version_flag="-mtargetos=xros${VISIONOS_MIN_OS_VERSION}-simulator"
-            else
-                sdk="xros"
-                archs="arm64"
-                min_version_flag="-mtargetos=xros${VISIONOS_MIN_OS_VERSION}"
-            fi
-            # Use flat structure for visionOS, same as iOS
-            install_name="@rpath/llama.framework/llama"
-            ;;
-        "tvos")
-            if [[ "$is_simulator" == "true" ]]; then
-                sdk="appletvsimulator"
-                archs="arm64 x86_64"
-                min_version_flag="-mtvos-simulator-version-min=${TVOS_MIN_OS_VERSION}"
-            else
-                sdk="appletvos"
-                archs="arm64"
-                min_version_flag="-mtvos-version-min=${TVOS_MIN_OS_VERSION}"
-            fi
-            install_name="@rpath/llama.framework/llama"
-            ;;
-    esac
-
-    # Build architecture flags
-    local arch_flags=""
-    for arch in $archs; do
-        arch_flags+=" -arch $arch"
-    done
-
-    # Create dynamic library
-    echo "Creating dynamic library for ${platform}."
-    xcrun -sdk $sdk clang++ -dynamiclib \
-        -isysroot $(xcrun --sdk $sdk --show-sdk-path) \
-        $arch_flags \
-        $min_version_flag \
-        -Wl,-force_load,"${temp_dir}/combined.a" \
-        -framework Foundation -framework Metal -framework Accelerate \
-        -install_name "$install_name" \
-        -o "${base_dir}/${output_lib}"
-
-    # Platform-specific post-processing for device builds
-    if [[ "$is_simulator" == "false" ]]; then
-        if command -v xcrun vtool &>/dev/null; then
-            case "$platform" in
-                "ios")
-                    echo "Marking binary as a framework binary for iOS..."
-                    xcrun vtool -set-build-version ios ${IOS_MIN_OS_VERSION} ${IOS_MIN_OS_VERSION} -replace \
-                        -output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}"
-                    ;;
-                "visionos")
-                    echo "Marking binary as a framework binary for visionOS..."
-                    if [[ "$MAJOR_VERSION" -gt 16 ]] || [[ "$MAJOR_VERSION" -eq 16 && "$MINOR_VERSION" -gt 2 ]]; then
-                        echo "Xcode version greater than 16.2, using visionOS."
-                        VISION_OS_BUILD_VERSION="visionos"
-                    else
-                        echo "Xcode version less than or equal to 16.2, using xros."
-                        VISION_OS_BUILD_VERSION="xros"
-                    fi
-                    xcrun vtool -set-build-version ${VISION_OS_BUILD_VERSION} ${VISIONOS_MIN_OS_VERSION} ${VISIONOS_MIN_OS_VERSION} -replace \
-                        -output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}"
-                    ;;
-                "tvos")
-                    echo "Marking binary as a framework binary for tvOS..."
-                    xcrun vtool -set-build-version tvos ${TVOS_MIN_OS_VERSION} ${TVOS_MIN_OS_VERSION} -replace \
-                        -output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}"
-                    ;;
-            esac
-        else
-            echo "Warning: vtool not found. Binary may not pass App Store validation."
-        fi
-    fi
-
-    echo "Creating properly formatted dSYM..."
-    # Create a separate directory for dSYMs for all platforms
-    mkdir -p "${base_dir}/${build_dir}/dSYMs"
-
-    # iOS and visionOS style dSYM (flat structure)
-    if [[ "$platform" == "ios" || "$platform" == "visionos" || "$platform" == "tvos" ]]; then
-        # Generate dSYM in the dSYMs directory
-        xcrun dsymutil "${base_dir}/${output_lib}" -o "${base_dir}/${build_dir}/dSYMs/llama.dSYM"
-
-        # Create a copy of the binary that will be stripped
-        cp "${base_dir}/${output_lib}" "${temp_dir}/binary_to_strip"
-
-        # Strip debug symbols from the copy
-        xcrun strip -S "${temp_dir}/binary_to_strip" -o "${temp_dir}/stripped_lib"
-
-        # Replace the original with the stripped version
-        mv "${temp_dir}/stripped_lib" "${base_dir}/${output_lib}"
-    else
-        # macOS style dSYM
-        # First strip debug info to a separate file
-        xcrun strip -S "${base_dir}/${output_lib}" -o "${temp_dir}/stripped_lib"
-
-        # Generate dSYM in the dSYMs directory
-        xcrun dsymutil "${base_dir}/${output_lib}" -o "${base_dir}/${build_dir}/dSYMs/llama.dSYM"
-
-        # Replace original binary with stripped version
-        mv "${temp_dir}/stripped_lib" "${base_dir}/${output_lib}"
-    fi
-
-    # Remove any automatically generated dSYM files in the framework structure as they will
-    # otherwise case Invalid Bundle Structure validation errors.
-    if [ -d "${base_dir}/${output_lib}.dSYM" ]; then
-        echo "Removing generated dSYM file in framework structure: ${base_dir}/${output_lib}.dSYM"
-        rm -rf "${base_dir}/${output_lib}.dSYM"
-    fi
-
-    # Clean up
-    rm -rf "${temp_dir}"
-}
-
-echo "Building for iOS simulator..."
-cmake -B build-ios-sim -G Xcode \
-    "${COMMON_CMAKE_ARGS[@]}" \
-    -DCMAKE_OSX_DEPLOYMENT_TARGET=${IOS_MIN_OS_VERSION} \
-    -DIOS=ON \
-    -DCMAKE_SYSTEM_NAME=iOS \
-    -DCMAKE_OSX_SYSROOT=iphonesimulator \
-    -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \
-    -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphonesimulator \
-    -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
-    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-    -DLLAMA_CURL=OFF \
-    -S .
-cmake --build build-ios-sim --config Release -- -quiet
-
-echo "Building for iOS devices..."
-cmake -B build-ios-device -G Xcode \
-    "${COMMON_CMAKE_ARGS[@]}" \
-    -DCMAKE_OSX_DEPLOYMENT_TARGET=${IOS_MIN_OS_VERSION} \
-    -DCMAKE_OSX_SYSROOT=iphoneos \
-    -DCMAKE_OSX_ARCHITECTURES="arm64" \
-    -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphoneos \
-    -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
-    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-    -DLLAMA_CURL=OFF \
-    -S .
-cmake --build build-ios-device --config Release -- -quiet
-
-echo "Building for macOS..."
-cmake -B build-macos -G Xcode \
-    "${COMMON_CMAKE_ARGS[@]}" \
-    -DCMAKE_OSX_DEPLOYMENT_TARGET=${MACOS_MIN_OS_VERSION} \
-    -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \
-    -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
-    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-    -DLLAMA_CURL=OFF \
-    -S .
-cmake --build build-macos --config Release -- -quiet
-
-echo "Building for visionOS..."
-cmake -B build-visionos -G Xcode \
-    "${COMMON_CMAKE_ARGS[@]}" \
-    -DCMAKE_OSX_DEPLOYMENT_TARGET=${VISIONOS_MIN_OS_VERSION} \
-    -DCMAKE_OSX_ARCHITECTURES="arm64" \
-    -DCMAKE_SYSTEM_NAME=visionOS \
-    -DCMAKE_OSX_SYSROOT=xros \
-    -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xros \
-    -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
-    -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
-    -DLLAMA_CURL=OFF \
-    -S .
-cmake --build build-visionos --config Release -- -quiet
-
-echo "Building for visionOS simulator..."
-cmake -B build-visionos-sim -G Xcode \
-    "${COMMON_CMAKE_ARGS[@]}" \
-    -DCMAKE_OSX_DEPLOYMENT_TARGET=${VISIONOS_MIN_OS_VERSION} \
-    -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \
-    -DCMAKE_SYSTEM_NAME=visionOS \
-    -DCMAKE_OSX_SYSROOT=xrsimulator \
-    -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xrsimulator \
-    -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
-    -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
-    -DLLAMA_CURL=OFF \
-    -S .
-cmake --build build-visionos-sim --config Release -- -quiet
-
-# Add tvOS builds (might need the same u_int definitions as watchOS and visionOS)
-echo "Building for tvOS simulator..."
-cmake -B build-tvos-sim -G Xcode \
-    "${COMMON_CMAKE_ARGS[@]}" \
-    -DCMAKE_OSX_DEPLOYMENT_TARGET=${TVOS_MIN_OS_VERSION} \
-    -DCMAKE_SYSTEM_NAME=tvOS \
-    -DCMAKE_OSX_SYSROOT=appletvsimulator \
-    -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \
-    -DGGML_METAL=ON \
-    -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=appletvsimulator \
-    -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
-    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-    -DLLAMA_CURL=OFF \
-    -S .
-cmake --build build-tvos-sim --config Release -- -quiet
-
-echo "Building for tvOS devices..."
-cmake -B build-tvos-device -G Xcode \
-    "${COMMON_CMAKE_ARGS[@]}" \
-    -DCMAKE_OSX_DEPLOYMENT_TARGET=${TVOS_MIN_OS_VERSION} \
-    -DCMAKE_SYSTEM_NAME=tvOS \
-    -DCMAKE_OSX_SYSROOT=appletvos \
-    -DCMAKE_OSX_ARCHITECTURES="arm64" \
-    -DGGML_METAL=ON \
-    -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=appletvos \
-    -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
-    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-    -DLLAMA_CURL=OFF \
-    -S .
-cmake --build build-tvos-device --config Release -- -quiet
-
-# Setup frameworks and copy binaries and headers
-echo "Setting up framework structures..."
-setup_framework_structure "build-ios-sim" ${IOS_MIN_OS_VERSION} "ios"
-setup_framework_structure "build-ios-device" ${IOS_MIN_OS_VERSION} "ios"
-setup_framework_structure "build-macos" ${MACOS_MIN_OS_VERSION} "macos"
-setup_framework_structure "build-visionos" ${VISIONOS_MIN_OS_VERSION} "visionos"
-setup_framework_structure "build-visionos-sim" ${VISIONOS_MIN_OS_VERSION} "visionos"
-setup_framework_structure "build-tvos-sim" ${TVOS_MIN_OS_VERSION} "tvos"
-setup_framework_structure "build-tvos-device" ${TVOS_MIN_OS_VERSION} "tvos"
-
-# Create dynamic libraries from static libraries
-echo "Creating dynamic libraries from static libraries..."
-combine_static_libraries "build-ios-sim" "Release-iphonesimulator" "ios" "true"
-combine_static_libraries "build-ios-device" "Release-iphoneos" "ios" "false"
-combine_static_libraries "build-macos" "Release" "macos" "false"
-combine_static_libraries "build-visionos" "Release-xros" "visionos" "false"
-combine_static_libraries "build-visionos-sim" "Release-xrsimulator" "visionos" "true"
-combine_static_libraries "build-tvos-sim" "Release-appletvsimulator" "tvos" "true"
-combine_static_libraries "build-tvos-device" "Release-appletvos" "tvos" "false"
-
-# Create XCFramework with correct debug symbols paths
-echo "Creating XCFramework..."
-xcodebuild -create-xcframework \
-    -framework $(pwd)/build-ios-sim/framework/llama.framework \
-    -debug-symbols $(pwd)/build-ios-sim/dSYMs/llama.dSYM \
-    -framework $(pwd)/build-ios-device/framework/llama.framework \
-    -debug-symbols $(pwd)/build-ios-device/dSYMs/llama.dSYM \
-    -framework $(pwd)/build-macos/framework/llama.framework \
-    -debug-symbols $(pwd)/build-macos/dSYMS/llama.dSYM \
-    -framework $(pwd)/build-visionos/framework/llama.framework \
-    -debug-symbols $(pwd)/build-visionos/dSYMs/llama.dSYM \
-    -framework $(pwd)/build-visionos-sim/framework/llama.framework \
-    -debug-symbols $(pwd)/build-visionos-sim/dSYMs/llama.dSYM \
-    -framework $(pwd)/build-tvos-device/framework/llama.framework \
-    -debug-symbols $(pwd)/build-tvos-device/dSYMs/llama.dSYM \
-    -framework $(pwd)/build-tvos-sim/framework/llama.framework \
-    -debug-symbols $(pwd)/build-tvos-sim/dSYMs/llama.dSYM \
-    -output $(pwd)/build-apple/llama.xcframework
--- a/ci/README.md
+++ b/ci/README.md
@@ -1,11 +1,11 @@
 # CI

-In addition to [Github Actions](https://github.com/ggml-org/llama.cpp/actions) `llama.cpp` uses a custom CI framework:
+In addition to [Github Actions](https://github.com/ggerganov/llama.cpp/actions) `llama.cpp` uses a custom CI framework:

 https://github.com/ggml-org/ci

 It monitors the `master` branch for new commits and runs the
-[ci/run.sh](https://github.com/ggml-org/llama.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us
+[ci/run.sh](https://github.com/ggerganov/llama.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us
 to execute heavier workloads compared to just using Github Actions. Also with time, the cloud instances will be scaled
 to cover various hardware architectures, including GPU and Apple Silicon instances.

@@ -26,43 +26,4 @@ GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 # with SYCL support
 source /opt/intel/oneapi/setvars.sh
 GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
-
-# with MUSA support
-GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 ```
-
-## Running MUSA CI in a Docker Container
-
-Assuming `$PWD` is the root of the `llama.cpp` repository, follow these steps to set up and run MUSA CI in a Docker container:
-
-### 1. Create a local directory to store cached models, configuration files and venv:
-
-```bash
-mkdir -p $HOME/llama.cpp/ci-cache
-```
-
-### 2. Create a local directory to store CI run results:
-
-```bash
-mkdir -p $HOME/llama.cpp/ci-results
-```
-
-### 3. Start a Docker container and run the CI:
-
-```bash
-docker run --privileged -it \
-    -v $HOME/llama.cpp/ci-cache:/ci-cache \
-    -v $HOME/llama.cpp/ci-results:/ci-results \
-    -v $PWD:/ws -w /ws \
-    mthreads/musa:rc3.1.1-devel-ubuntu22.04
-```
-
-Inside the container, execute the following commands:
-
-```bash
-apt update -y && apt install -y bc cmake ccache git python3.10-venv time unzip wget
-git config --global --add safe.directory /ws
-GG_BUILD_MUSA=1 bash ./ci/run.sh /ci-results /ci-cache
-```
-
-This setup ensures that the CI runs within an isolated Docker environment while maintaining cached files and results across runs.
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/bin/bash
 #
 # sample usage:
 #
@@ -16,9 +16,6 @@
 # # with VULKAN support
 # GG_BUILD_VULKAN=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 #
-# # with MUSA support
-# GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
-#

 if [ -z "$2" ]; then
    echo "usage: $0 <output-dir> <mnt-dir>"
@@ -39,10 +36,10 @@ sd=`dirname $0`
 cd $sd/../
 SRC=`pwd`

-CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=OFF"
+CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON"

 if [ ! -z ${GG_BUILD_METAL} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON -DGGML_METAL_USE_BF16=ON"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
 fi

 if [ ! -z ${GG_BUILD_CUDA} ]; then
@@ -55,24 +52,13 @@ if [ ! -z ${GG_BUILD_SYCL} ]; then
        echo "source /opt/intel/oneapi/setvars.sh"
        exit 1
    fi
-    # Use only main GPU
-    export ONEAPI_DEVICE_SELECTOR="level_zero:0"
-    # Enable sysman for correct memory reporting
-    export ZES_ENABLE_SYSMAN=1
-    # to circumvent precision issues on CPY operations
-    export SYCL_PROGRAM_COMPILE_OPTIONS="-cl-fp32-correctly-rounded-divide-sqrt"
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
+
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
 fi

 if [ ! -z ${GG_BUILD_VULKAN} ]; then
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
 fi
-
-if [ ! -z ${GG_BUILD_MUSA} ]; then
-    # Use qy1 by default (MTT S80)
-    MUSA_ARCH=${MUSA_ARCH:-21}
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_MUSA=ON -DMUSA_ARCHITECTURES=${MUSA_ARCH}"
-fi
 ## helpers

 # download a file if it does not exist or if it is outdated
@@ -187,8 +173,8 @@ function gg_run_test_scripts_debug {

    set -e

-    (cd ./tools/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
-    (cd ./tools/quantize   && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+    (cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+    (cd ./examples/quantize   && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log

    set +e
 }
@@ -211,8 +197,8 @@ function gg_run_test_scripts_release {

    set -e

-    (cd ./tools/gguf-split && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
-    (cd ./tools/quantize   && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+    (cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+    (cd ./examples/quantize   && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log

    set +e
 }
@@ -340,36 +326,36 @@ function gg_run_open_llama_7b_v2 {
    ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
    ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k

-    (time ./bin/llama-cli -no-cnv --model ${model_f16}  -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/llama-cli --model ${model_f16}  -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

-    (time ./bin/llama-perplexity --model ${model_f16}  -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/llama-perplexity --model ${model_f16}  -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

-    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
+    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0     ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0     ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state     -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state     -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log

    function check_ppl {
        qnt="$1"
@@ -474,34 +460,34 @@ function gg_run_pythia_1_4b {
    ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
    ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k

-    (time ./bin/llama-cli -no-cnv --model ${model_f16}  -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q8_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q4_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q4_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q5_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q5_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q2_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q3_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q4_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q5_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q6_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/llama-cli --model ${model_f16}  -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-cli --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-cli --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/llama-cli --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/llama-cli --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/llama-cli --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/llama-cli --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/llama-cli --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/llama-cli --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/llama-cli --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/llama-cli --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

-    (time ./bin/llama-perplexity --model ${model_f16}  -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/llama-perplexity --model ${model_f16}  -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

-    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
+    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0     ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state     --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log

    function check_ppl {
        qnt="$1"
@@ -605,36 +591,36 @@ function gg_run_pythia_2_8b {
    ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
    ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k

-    (time ./bin/llama-cli -no-cnv --model ${model_f16}  -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/llama-cli --model ${model_f16}  -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

-    (time ./bin/llama-perplexity --model ${model_f16}  -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/llama-perplexity --model ${model_f16}  -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

-    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
+    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0     ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0     ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state     -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state     -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log

    function check_ppl {
        qnt="$1"
@@ -720,88 +706,12 @@ function gg_run_embd_bge_small {

    ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0

-    (time ./bin/llama-embedding --model ${model_f16}  -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-embedding --model ${model_f16}  -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log

    set +e
 }

-function gg_sum_embd_bge_small {
-    gg_printf '### %s\n\n' "${ci}"
-
-    gg_printf 'BGE Small (BERT):\n'
-    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
-    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
-    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
-}
-
-# rerank_tiny
-
-function gg_run_rerank_tiny {
-    cd ${SRC}
-
-    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/config.json
-    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/tokenizer.json
-    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/tokenizer_config.json
-    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/special_tokens_map.json
-    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/resolve/main/pytorch_model.bin
-    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/sentence_bert_config.json
-    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/vocab.txt
-    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/modules.json
-    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/config.json
-
-    gg_wget models-mnt/rerank-tiny/1_Pooling https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/1_Pooling/config.json
-
-    path_models="../models-mnt/rerank-tiny"
-
-    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
-
-    set -e
-
-    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log
-
-    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
-
-    model_f16="${path_models}/ggml-model-f16.gguf"
-
-    # for this model, the SEP token is "</s>"
-    (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?</s></s>hi\nwhat is panda?</s></s>it's a bear\nwhat is panda?</s></s>The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
-
-    # sample output
-    # rerank score 0:    0.029
-    # rerank score 1:    0.029
-    # rerank score 2:    0.135
-
-    # check that the score is in the range [$3, $4]
-    function check_score {
-        qnt="$1"
-        score=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
-
-        if [ $(echo "$score < $3" | bc) -eq 1 ] || [ $(echo "$score > $4" | bc) -eq 1 ]; then
-            printf '  - %s @ %s (FAIL: score not in range [%s, %s])\n' "$qnt" "$score" "$3" "$4"
-            return 20
-        fi
-
-        printf '  - %s @ %s OK\n' "$qnt" "$score"
-        return 0
-    }
-
-    check_score "rerank score 0" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 0")" "0.00" "0.05" | tee -a $OUT/${ci}-rk-f16.log
-    check_score "rerank score 1" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 1")" "0.00" "0.05" | tee -a $OUT/${ci}-rk-f16.log
-    check_score "rerank score 2" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 2")" "0.10" "0.30" | tee -a $OUT/${ci}-rk-f16.log
-
-    set +e
-}
-
-function gg_sum_rerank_tiny {
-    gg_printf '### %s\n\n' "${ci}"
-
-    gg_printf 'Rerank Tiny (Jina):\n'
-    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
-    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-rk-f16.log)"
-}
-
 function gg_check_build_requirements {
    if ! command -v cmake &> /dev/null; then
        gg_printf 'cmake not found, please install'
@@ -816,23 +726,29 @@ function gg_check_build_requirements {
    fi
 }

+function gg_sum_embd_bge_small {
+    gg_printf '### %s\n\n' "${ci}"
+
+    gg_printf 'BGE Small (BERT):\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
+    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
+}
+
 ## main

 export LLAMA_LOG_PREFIX=1
 export LLAMA_LOG_TIMESTAMPS=1

 if [ -z ${GG_BUILD_LOW_PERF} ]; then
-    # Create symlink: ./llama.cpp/models-mnt -> $MNT/models
+    # Create symlink: ${SRC}/models-mnt -> ${MNT}/models
    rm -rf ${SRC}/models-mnt
    mnt_models=${MNT}/models
    mkdir -p ${mnt_models}
    ln -sfn ${mnt_models} ${SRC}/models-mnt

    # Create a fresh python3 venv and enter it
-    if ! python3 -m venv "$MNT/venv"; then
-        echo "Error: Failed to create Python virtual environment at $MNT/venv."
-        exit 1
-    fi
+    python3 -m venv "$MNT/venv" || { echo "Error: Failed to create Python virtual environment at $MNT/venv."; exit 1; }  # stop before sourcing a venv that does not exist
    source "$MNT/venv/bin/activate"

    pip install -r ${SRC}/requirements.txt --disable-pip-version-check
@@ -840,20 +756,15 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
 fi

 ret=0
-if [ -z ${GG_BUILD_SYCL} ]; then
-    # SYCL build breaks with debug build flags
-    test $ret -eq 0 && gg_run ctest_debug
-fi
+
+test $ret -eq 0 && gg_run ctest_debug
 test $ret -eq 0 && gg_run ctest_release

 if [ -z ${GG_BUILD_LOW_PERF} ]; then
    test $ret -eq 0 && gg_run embd_bge_small
-    test $ret -eq 0 && gg_run rerank_tiny

    if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then
-        if [ -z ${GG_BUILD_SYCL} ]; then
-            test $ret -eq 0 && gg_run test_scripts_debug
-        fi
+        test $ret -eq 0 && gg_run test_scripts_debug
        test $ret -eq 0 && gg_run test_scripts_release
    fi

@@ -864,9 +775,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
            test $ret -eq 0 && gg_run pythia_2_8b
            #test $ret -eq 0 && gg_run open_llama_7b_v2
        fi
-        if [ -z ${GG_BUILD_SYCL} ]; then
-            test $ret -eq 0 && gg_run ctest_with_model_debug
-        fi
+        test $ret -eq 0 && gg_run ctest_with_model_debug
        test $ret -eq 0 && gg_run ctest_with_model_release
    fi
 fi
--- a/cmake/arm64-apple-clang.cmake
+++ b/cmake/arm64-apple-clang.cmake
@@ -1,16 +0,0 @@
-set( CMAKE_SYSTEM_NAME Darwin )
-set( CMAKE_SYSTEM_PROCESSOR arm64 )
-
-set( target arm64-apple-darwin-macho )
-
-set( CMAKE_C_COMPILER    clang )
-set( CMAKE_CXX_COMPILER  clang++ )
-
-set( CMAKE_C_COMPILER_TARGET   ${target} )
-set( CMAKE_CXX_COMPILER_TARGET ${target} )
-
-set( arch_c_flags "-march=armv8.4-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
-set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function" )
-
-set( CMAKE_C_FLAGS_INIT   "${arch_c_flags} ${warn_c_flags}" )
-set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
--- a/cmake/arm64-windows-msvc.cmake
+++ b/cmake/arm64-windows-msvc.cmake
@@ -0,0 +1,6 @@
+set( CMAKE_SYSTEM_NAME Windows )
+set( CMAKE_SYSTEM_PROCESSOR arm64 )
+
+set( target arm64-pc-windows-msvc )
+set( CMAKE_C_COMPILER_TARGET   ${target} )
+set( CMAKE_CXX_COMPILER_TARGET ${target} )
--- a/cmake/build-info.cmake
+++ b/cmake/build-info.cmake
@@ -41,20 +41,14 @@ endif()

 if(MSVC)
    set(BUILD_COMPILER "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
-    if (CMAKE_VS_PLATFORM_NAME)
-        set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME})
-    else()
-        set(BUILD_TARGET "${CMAKE_SYSTEM_NAME} ${CMAKE_SYSTEM_PROCESSOR}")
-    endif()
+    set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME})
 else()
    execute_process(
-        COMMAND ${CMAKE_C_COMPILER} --version
+        COMMAND sh -c "$@ --version | head -1" _ ${CMAKE_C_COMPILER}
        OUTPUT_VARIABLE OUT
        OUTPUT_STRIP_TRAILING_WHITESPACE
    )
-    string(REGEX REPLACE " *\n.*" "" OUT "${OUT}")
    set(BUILD_COMPILER ${OUT})
-
    execute_process(
        COMMAND ${CMAKE_C_COMPILER} -dumpmachine
        OUTPUT_VARIABLE OUT
--- a/cmake/common.cmake
+++ b/cmake/common.cmake
@@ -1,35 +0,0 @@
-include("ggml/cmake/common.cmake")
-
-function(llama_add_compile_flags)
-    if (LLAMA_FATAL_WARNINGS)
-        if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-            list(APPEND C_FLAGS   -Werror)
-            list(APPEND CXX_FLAGS -Werror)
-        elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
-            add_compile_options(/WX)
-        endif()
-    endif()
-
-    if (LLAMA_ALL_WARNINGS)
-        if (NOT MSVC)
-            list(APPEND C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes
-                                -Werror=implicit-int -Werror=implicit-function-declaration)
-
-            list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn)
-
-            list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
-
-            list(APPEND C_FLAGS   ${WARNING_FLAGS})
-            list(APPEND CXX_FLAGS ${WARNING_FLAGS})
-
-            ggml_get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION})
-
-            add_compile_options("$<$<COMPILE_LANGUAGE:C>:${C_FLAGS};${GF_C_FLAGS}>"
-                                "$<$<COMPILE_LANGUAGE:CXX>:${CXX_FLAGS};${GF_CXX_FLAGS}>")
-        else()
-            # todo : msvc
-            set(C_FLAGS   "" PARENT_SCOPE)
-            set(CXX_FLAGS "" PARENT_SCOPE)
-        endif()
-    endif()
-endfunction()
--- a/cmake/llama-config.cmake.in
+++ b/cmake/llama-config.cmake.in
@@ -3,28 +3,88 @@ set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
 set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
 set(LLAMA_SHARED_LIB   @BUILD_SHARED_LIBS@)

+set(GGML_BLAS       @GGML_BLAS@)
+set(GGML_CUDA       @GGML_CUDA@)
+set(GGML_METAL      @GGML_METAL@)
+set(GGML_HIPBLAS    @GGML_HIPBLAS@)
+set(GGML_ACCELERATE @GGML_ACCELERATE@)
+set(GGML_VULKAN @GGML_VULKAN@)
+set(GGML_VULKAN_CHECK_RESULTS @GGML_VULKAN_CHECK_RESULTS@)
+set(GGML_VULKAN_DEBUG @GGML_VULKAN_DEBUG@)
+set(GGML_VULKAN_MEMORY_DEBUG @GGML_VULKAN_MEMORY_DEBUG@)
+set(GGML_VULKAN_VALIDATE @GGML_VULKAN_VALIDATE@)
+set(GGML_SYCL @GGML_SYCL@)
+set(GGML_OPENMP @GGML_OPENMP@)
+
@PACKAGE_INIT@

 set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
 set_and_check(LLAMA_LIB_DIR     "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
 set_and_check(LLAMA_BIN_DIR     "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")

-find_package(ggml REQUIRED HINTS ${LLAMA_LIB_DIR}/cmake)
+# Ensure transitive dependencies are satisfied
+
+find_package(Threads REQUIRED)
+
+if (APPLE AND GGML_ACCELERATE)
+    find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED)
+endif()
+
+if (GGML_BLAS)
+    find_package(BLAS REQUIRED)
+endif()
+
+if (GGML_CUDA)
+    find_package(CUDAToolkit REQUIRED)
+endif()
+
+if (GGML_METAL)
+    find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
+    find_library(METAL_FRAMEWORK Metal REQUIRED)
+    find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
+endif()
+
+if (GGML_VULKAN)
+    find_package(Vulkan REQUIRED)
+endif()
+
+if (GGML_HIPBLAS)
+    find_package(hip REQUIRED)
+    find_package(hipblas REQUIRED)
+    find_package(rocblas REQUIRED)
+endif()
+
+if (GGML_SYCL)
+    find_package(IntelSYCL REQUIRED)
+    find_package(MKL REQUIRED)
+endif()
+
+if (GGML_OPENMP)
+    find_package(OpenMP REQUIRED)
+endif()
+
+
+find_library(ggml_LIBRARY ggml
+    REQUIRED
+    HINTS ${LLAMA_LIB_DIR})

 find_library(llama_LIBRARY llama
    REQUIRED
-    HINTS ${LLAMA_LIB_DIR}
-    NO_CMAKE_FIND_ROOT_PATH
-)
+    HINTS ${LLAMA_LIB_DIR})
+
+set(_llama_link_deps "${ggml_LIBRARY}" "@GGML_LINK_LIBRARIES@")
+set(_llama_transient_defines "@GGML_TRANSIENT_DEFINES@")

 add_library(llama UNKNOWN IMPORTED)
+
 set_target_properties(llama
    PROPERTIES
        INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
-        INTERFACE_LINK_LIBRARIES "ggml::ggml;ggml::ggml-base;"
+        INTERFACE_LINK_LIBRARIES "${_llama_link_deps}"
+        INTERFACE_COMPILE_DEFINITIONS "${_llama_transient_defines}"
        IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
        IMPORTED_LOCATION "${llama_LIBRARY}"
-        INTERFACE_COMPILE_FEATURES c_std_90
-        POSITION_INDEPENDENT_CODE ON)
+        INTERFACE_COMPILE_FEATURES cxx_std_11
+        POSITION_INDEPENDENT_CODE ON )

 check_required_components(Llama)
--- a/cmake/llama.pc.in
+++ b/cmake/llama.pc.in
@@ -1,10 +1,10 @@
 prefix=@CMAKE_INSTALL_PREFIX@
-exec_prefix=@CMAKE_INSTALL_PREFIX@
-libdir=@CMAKE_INSTALL_FULL_LIBDIR@
-includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
+exec_prefix=${prefix}
+libdir=${exec_prefix}/lib
+includedir=${prefix}/include

 Name: llama
 Description: Port of Facebook's LLaMA model in C/C++
-Version: @LLAMA_INSTALL_VERSION@
-Libs: -L${libdir} -lggml -lggml-base -lllama
+Version: @PROJECT_VERSION@
+Libs: -L${libdir} -lllama
 Cflags: -I${includedir}
--- a/cmake/x64-windows-llvm.cmake
+++ b/cmake/x64-windows-llvm.cmake
@@ -1,5 +0,0 @@
-set( CMAKE_SYSTEM_NAME Windows )
-set( CMAKE_SYSTEM_PROCESSOR x86_64 )
-
-set( CMAKE_C_COMPILER    clang )
-set( CMAKE_CXX_COMPILER  clang++ )
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -2,8 +2,6 @@

 find_package(Threads REQUIRED)

-llama_add_compile_flags()
-
 # Build info header
 #

@@ -39,9 +37,7 @@ add_custom_command(
    COMMENT "Generating build details from Git"
    COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
            -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
-            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-            -DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME} -DCMAKE_SYSTEM_PROCESSOR=${CMAKE_SYSTEM_PROCESSOR}
-            -P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info-gen-cpp.cmake"
+            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info-gen-cpp.cmake"
    WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
    DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
    VERBATIM
@@ -58,25 +54,20 @@ add_library(${TARGET} STATIC
    arg.cpp
    arg.h
    base64.hpp
-    chat.cpp
-    chat.h
    common.cpp
    common.h
    console.cpp
    console.h
    json-schema-to-grammar.cpp
    json.hpp
-    llguidance.cpp
    log.cpp
    log.h
-    minja/chat-template.hpp
-    minja/minja.hpp
    ngram-cache.cpp
    ngram-cache.h
    sampling.cpp
    sampling.h
-    speculative.cpp
-    speculative.h
+    train.cpp
+    train.h
    )

 if (BUILD_SHARED_LIBS)
@@ -87,84 +78,13 @@ set(LLAMA_COMMON_EXTRA_LIBS build_info)

 # Use curl to download model url
 if (LLAMA_CURL)
-    find_package(CURL)
-    if (NOT CURL_FOUND)
-        message(FATAL_ERROR "Could NOT find CURL. Hint: to disable this feature, set -DLLAMA_CURL=OFF")
-    endif()
-    target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
+    find_package(CURL REQUIRED)
+    add_definitions(-DLLAMA_USE_CURL)
    include_directories(${CURL_INCLUDE_DIRS})
    find_library(CURL_LIBRARY curl REQUIRED)
    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
 endif ()

-if (LLAMA_LLGUIDANCE)
-    include(ExternalProject)
-    set(LLGUIDANCE_SRC ${CMAKE_BINARY_DIR}/llguidance/source)
-    set(LLGUIDANCE_PATH ${LLGUIDANCE_SRC}/target/release)
-
-    # Set the correct library file extension based on platform
-    if (WIN32)
-        set(LLGUIDANCE_LIB_NAME "llguidance.lib")
-        # Add Windows-specific libraries
-        set(LLGUIDANCE_PLATFORM_LIBS
-            ws2_32    # Windows Sockets API
-            userenv   # For GetUserProfileDirectoryW
-            ntdll     # For NT functions
-            bcrypt    # For BCryptGenRandom
-        )
-    else()
-        set(LLGUIDANCE_LIB_NAME "libllguidance.a")
-        set(LLGUIDANCE_PLATFORM_LIBS "")
-    endif()
-
-    ExternalProject_Add(llguidance_ext
-        GIT_REPOSITORY https://github.com/guidance-ai/llguidance
-        # v0.7.19 (+ fancy-regex build fix):
-        GIT_TAG b59f98f85269892a7de3d3641ad155366f13daa6
-        PREFIX ${CMAKE_BINARY_DIR}/llguidance
-        SOURCE_DIR ${LLGUIDANCE_SRC}
-        BUILD_IN_SOURCE TRUE
-        CONFIGURE_COMMAND ""
-        BUILD_COMMAND cargo build --release
-        INSTALL_COMMAND ""
-        BUILD_BYPRODUCTS ${LLGUIDANCE_PATH}/${LLGUIDANCE_LIB_NAME} ${LLGUIDANCE_PATH}/llguidance.h
-        UPDATE_COMMAND ""
-    )
-    target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_LLGUIDANCE)
-
-    add_library(llguidance STATIC IMPORTED)
-    set_target_properties(llguidance PROPERTIES IMPORTED_LOCATION ${LLGUIDANCE_PATH}/${LLGUIDANCE_LIB_NAME})
-    add_dependencies(llguidance llguidance_ext)
-
-    target_include_directories(${TARGET} PRIVATE ${LLGUIDANCE_PATH})
-    # Add platform libraries to the main target
-    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance ${LLGUIDANCE_PLATFORM_LIBS})
-endif ()
-
 target_include_directories(${TARGET} PUBLIC .)
-target_compile_features   (${TARGET} PUBLIC cxx_std_17)
+target_compile_features   (${TARGET} PUBLIC cxx_std_11)
 target_link_libraries     (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
-
-
-#
-# copy the license files
-#
-
-# Check if running in GitHub Actions
-if (DEFINED ENV{GITHUB_ACTIONS} AND "$ENV{GITHUB_ACTIONS}" STREQUAL "true")
-    message(STATUS "Running inside GitHub Actions - copying license files")
-
-    # Copy all files from licenses/ to build/bin/
-    file(GLOB LICENSE_FILES "${CMAKE_SOURCE_DIR}/licenses/*")
-    foreach(LICENSE_FILE ${LICENSE_FILES})
-        get_filename_component(FILENAME ${LICENSE_FILE} NAME)
-        add_custom_command(
-            POST_BUILD
-            TARGET ${TARGET}
-            COMMAND ${CMAKE_COMMAND} -E copy_if_different
-                "${LICENSE_FILE}"
-                "$<TARGET_FILE_DIR:llama>/${FILENAME}"
-            COMMENT "Copying ${FILENAME} to ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}")
-        message(STATUS "Copying ${LICENSE_FILE} to ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${FILENAME}")
-    endforeach()
-endif()
--- a/common/arg.cpp
+++ b/common/arg.cpp
--- a/common/arg.h
+++ b/common/arg.h
@@ -10,80 +10,68 @@
 // CLI argument parsing
 //

-struct common_arg {
+struct llama_arg {
    std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
-    std::set<enum llama_example> excludes = {};
    std::vector<const char *> args;
    const char * value_hint   = nullptr; // help text or example for arg value
    const char * value_hint_2 = nullptr; // for second arg value
    const char * env          = nullptr;
    std::string help;
    bool is_sparam = false; // is current arg a sampling param?
-    void (*handler_void)   (common_params & params) = nullptr;
-    void (*handler_string) (common_params & params, const std::string &) = nullptr;
-    void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr;
-    void (*handler_int)    (common_params & params, int) = nullptr;
+    void (*handler_void)   (gpt_params & params) = nullptr;
+    void (*handler_string) (gpt_params & params, const std::string &) = nullptr;
+    void (*handler_str_str)(gpt_params & params, const std::string &, const std::string &) = nullptr;
+    void (*handler_int)    (gpt_params & params, int) = nullptr;

-    common_arg(
+    llama_arg(
        const std::initializer_list<const char *> & args,
        const char * value_hint,
        const std::string & help,
-        void (*handler)(common_params & params, const std::string &)
+        void (*handler)(gpt_params & params, const std::string &)
    ) : args(args), value_hint(value_hint), help(help), handler_string(handler) {}

-    common_arg(
+    llama_arg(
        const std::initializer_list<const char *> & args,
        const char * value_hint,
        const std::string & help,
-        void (*handler)(common_params & params, int)
+        void (*handler)(gpt_params & params, int)
    ) : args(args), value_hint(value_hint), help(help), handler_int(handler) {}

-    common_arg(
+    llama_arg(
        const std::initializer_list<const char *> & args,
        const std::string & help,
-        void (*handler)(common_params & params)
+        void (*handler)(gpt_params & params)
    ) : args(args), help(help), handler_void(handler) {}

    // support 2 values for arg
-    common_arg(
+    llama_arg(
        const std::initializer_list<const char *> & args,
        const char * value_hint,
        const char * value_hint_2,
        const std::string & help,
-        void (*handler)(common_params & params, const std::string &, const std::string &)
+        void (*handler)(gpt_params & params, const std::string &, const std::string &)
    ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}

-    common_arg & set_examples(std::initializer_list<enum llama_example> examples);
-    common_arg & set_excludes(std::initializer_list<enum llama_example> excludes);
-    common_arg & set_env(const char * env);
-    common_arg & set_sparam();
+    llama_arg & set_examples(std::initializer_list<enum llama_example> examples);
+    llama_arg & set_env(const char * env);
+    llama_arg & set_sparam();
    bool in_example(enum llama_example ex);
-    bool is_exclude(enum llama_example ex);
    bool get_value_from_env(std::string & output);
    bool has_value_from_env();
    std::string to_string();
 };

-struct common_params_context {
+struct gpt_params_context {
    enum llama_example ex = LLAMA_EXAMPLE_COMMON;
-    common_params & params;
-    std::vector<common_arg> options;
+    gpt_params & params;
+    std::vector<llama_arg> options;
    void(*print_usage)(int, char **) = nullptr;
-    common_params_context(common_params & params) : params(params) {}
+    gpt_params_context(gpt_params & params) : params(params) {}
 };

 // parse input arguments from CLI
 // if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
-bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
+bool gpt_params_parse(int argc, char ** argv, gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);

 // function to be used by test-arg-parser
-common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
-bool common_has_curl();
-
-struct common_remote_params {
-    std::vector<std::string> headers;
-    long timeout = 0; // CURLOPT_TIMEOUT, in seconds ; 0 means no timeout
-    long max_size = 0; // max size of the response ; unlimited if 0 ; max is 2GB
-};
-// get remote file content, returns <http_code, raw_response_body>
-std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params);
+gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
--- a/common/chat.cpp
+++ b/common/chat.cpp
--- a/common/chat.h
+++ b/common/chat.h
@@ -1,135 +0,0 @@
-// Chat support (incl. tool call grammar constraining & output parsing) w/ generic & custom template handlers.
-
-#pragma once
-
-#include "common.h"
-#include <string>
-#include <vector>
-
-struct common_chat_templates;
-
-struct common_chat_tool_call {
-    std::string name;
-    std::string arguments;
-    std::string id;
-};
-
-struct common_chat_msg_content_part {
-    std::string type;
-    std::string text;
-};
-
-struct common_chat_msg {
-    std::string role;
-    std::string content;
-    std::vector<common_chat_msg_content_part> content_parts = {};
-    std::vector<common_chat_tool_call> tool_calls = {};
-    std::string reasoning_content;
-    std::string tool_name;
-    std::string tool_call_id;
-};
-
-struct common_chat_tool {
-    std::string name;
-    std::string description;
-    std::string parameters;
-};
-
-enum common_chat_tool_choice {
-    COMMON_CHAT_TOOL_CHOICE_AUTO,
-    COMMON_CHAT_TOOL_CHOICE_REQUIRED,
-    COMMON_CHAT_TOOL_CHOICE_NONE,
-};
-
-enum common_chat_format {
-    COMMON_CHAT_FORMAT_CONTENT_ONLY,
-    COMMON_CHAT_FORMAT_GENERIC,
-    COMMON_CHAT_FORMAT_MISTRAL_NEMO,
-    COMMON_CHAT_FORMAT_LLAMA_3_X,
-    COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
-    COMMON_CHAT_FORMAT_DEEPSEEK_R1,
-    COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING,
-    COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
-    COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
-    COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
-    COMMON_CHAT_FORMAT_HERMES_2_PRO,
-    COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING,
-    COMMON_CHAT_FORMAT_COMMAND_R7B,
-    COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING,
-
-    COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
-};
-
-struct common_chat_templates_inputs {
-    std::vector<common_chat_msg> messages;
-    std::string grammar;
-    std::string json_schema;
-    bool add_generation_prompt = true;
-    bool use_jinja = true;
-    // Parameters below only supported when use_jinja is true
-    std::vector<common_chat_tool> tools;
-    common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
-    bool parallel_tool_calls = false;
-    bool extract_reasoning     = true;
-};
-
-struct common_chat_params {
-    common_chat_format                  format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
-    std::string                         prompt;
-    std::string                         grammar;
-    bool                                grammar_lazy = false;
-    std::vector<common_grammar_trigger> grammar_triggers;
-    std::vector<std::string>            preserved_tokens;
-    std::vector<std::string>            additional_stops;
-};
-
-// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
-bool common_chat_verify_template(const std::string & tmpl, bool use_jinja);
-
-void common_chat_templates_free(struct common_chat_templates * tmpls);
-
-struct common_chat_templates_deleter { void operator()(common_chat_templates * tmpls) { common_chat_templates_free(tmpls); } };
-
-typedef std::unique_ptr<struct common_chat_templates, common_chat_templates_deleter> common_chat_templates_ptr;
-
-common_chat_templates_ptr common_chat_templates_init(
-                                    const struct llama_model * model,
-                                           const std::string & chat_template_override,
-                                           const std::string & bos_token_override = "",
-                                           const std::string & eos_token_override = "");
-
-bool         common_chat_templates_was_explicit(const struct common_chat_templates * tmpls);
-const char * common_chat_templates_source(const struct common_chat_templates * tmpls, const char * variant = nullptr);
-
-
-struct common_chat_params      common_chat_templates_apply(
-    const struct common_chat_templates * tmpls,
-    const struct common_chat_templates_inputs & inputs);
-
-// Format single message, while taking into account the position of that message in chat history
-std::string common_chat_format_single(
-        const struct common_chat_templates * tmpls,
-        const std::vector<common_chat_msg> & past_msg,
-        const common_chat_msg & new_msg,
-        bool add_ass,
-        bool use_jinja);
-
-// Returns an example of formatted chat
-std::string common_chat_format_example(
-    const struct common_chat_templates * tmpls,
-    bool use_jinja);
-
-std::string               common_chat_format_name(common_chat_format format);
-common_chat_msg           common_chat_parse(      const std::string & input, common_chat_format format);
-
-common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
-
-// Parses a JSON array of messages in OpenAI's chat completion API format.
-// T can be std::string containing JSON or nlohmann::ordered_json
-template <class T> std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const T & messages);
-template <class T> T common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msgs, bool concat_typed_text = false);
-
-// Parses a JSON array of tools in OpenAI's chat completion tool call API format.
-// T can be std::string containing JSON or nlohmann::ordered_json
-template <class T> std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const T & tools);
-template <class T> T common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools);
--- a/common/common.cpp
+++ b/common/common.cpp
--- a/common/common.h
+++ b/common/common.h
@@ -2,9 +2,8 @@

 #pragma once

-#include "llama-cpp.h"
+#include "llama.h"

-#include <set>
 #include <string>
 #include <vector>
 #include <sstream>
@@ -25,22 +24,22 @@

 #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"

-struct common_adapter_lora_info {
+struct llama_lora_adapter_info {
    std::string path;
    float scale;
-
-    struct llama_adapter_lora * ptr;
 };

-using llama_tokens = std::vector<llama_token>;
+struct llama_lora_adapter_container : llama_lora_adapter_info {
+    struct llama_lora_adapter * adapter;
+};

 // build info
 extern int LLAMA_BUILD_NUMBER;
-extern const char * LLAMA_COMMIT;
-extern const char * LLAMA_COMPILER;
-extern const char * LLAMA_BUILD_TARGET;
+extern char const * LLAMA_COMMIT;
+extern char const * LLAMA_COMPILER;
+extern char const * LLAMA_BUILD_TARGET;

-struct common_control_vector_load_info;
+struct llama_control_vector_load_info;

 //
 // CPU utils
@@ -66,6 +65,7 @@ enum llama_example {
    LLAMA_EXAMPLE_COMMON,
    LLAMA_EXAMPLE_SPECULATIVE,
    LLAMA_EXAMPLE_MAIN,
+    LLAMA_EXAMPLE_INFILL,
    LLAMA_EXAMPLE_EMBEDDING,
    LLAMA_EXAMPLE_PERPLEXITY,
    LLAMA_EXAMPLE_RETRIEVAL,
@@ -78,24 +78,18 @@ enum llama_example {
    LLAMA_EXAMPLE_LLAVA,
    LLAMA_EXAMPLE_LOOKUP,
    LLAMA_EXAMPLE_PARALLEL,
-    LLAMA_EXAMPLE_TTS,

    LLAMA_EXAMPLE_COUNT,
 };

-enum common_sampler_type {
-    COMMON_SAMPLER_TYPE_NONE        = 0,
-    COMMON_SAMPLER_TYPE_DRY         = 1,
-    COMMON_SAMPLER_TYPE_TOP_K       = 2,
-    COMMON_SAMPLER_TYPE_TOP_P       = 3,
-    COMMON_SAMPLER_TYPE_MIN_P       = 4,
-  //COMMON_SAMPLER_TYPE_TFS_Z       = 5,
-    COMMON_SAMPLER_TYPE_TYPICAL_P   = 6,
-    COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
-    COMMON_SAMPLER_TYPE_XTC         = 8,
-    COMMON_SAMPLER_TYPE_INFILL      = 9,
-    COMMON_SAMPLER_TYPE_PENALTIES   = 10,
-    COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
+enum gpt_sampler_type {
+    GPT_SAMPLER_TYPE_NONE        = 0,
+    GPT_SAMPLER_TYPE_TOP_K       = 1,
+    GPT_SAMPLER_TYPE_TOP_P       = 2,
+    GPT_SAMPLER_TYPE_MIN_P       = 3,
+    GPT_SAMPLER_TYPE_TFS_Z       = 4,
+    GPT_SAMPLER_TYPE_TYPICAL_P   = 5,
+    GPT_SAMPLER_TYPE_TEMPERATURE = 6,
 };

 // dimensionality reduction methods, used by cvector-generator
@@ -104,76 +98,42 @@ enum dimre_method {
    DIMRE_METHOD_MEAN,
 };

-enum common_conversation_mode {
-    COMMON_CONVERSATION_MODE_DISABLED = 0,
-    COMMON_CONVERSATION_MODE_ENABLED  = 1,
-    COMMON_CONVERSATION_MODE_AUTO     = 2,
-};
-
-enum common_grammar_trigger_type {
-    COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN,
-    COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
-    COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
-    COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
-};
-
-struct common_grammar_trigger {
-    common_grammar_trigger_type type;
-    std::string value;
-    llama_token token = LLAMA_TOKEN_NULL;
-};
-
-// sampling parameters
-struct common_params_sampling {
+// sampler parameters
+struct gpt_sampler_params {
    uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler

-    int32_t n_prev             = 64;    // number of previous tokens to remember
-    int32_t n_probs            = 0;     // if greater than 0, output the probabilities of top n_probs tokens.
-    int32_t min_keep           = 0;     // 0 = disabled, otherwise samplers should return at least min_keep tokens
-    int32_t top_k              = 40;    // <= 0 to use vocab size
-    float   top_p              = 0.95f; // 1.0 = disabled
-    float   min_p              = 0.05f; // 0.0 = disabled
-    float   xtc_probability    = 0.00f; // 0.0 = disabled
-    float   xtc_threshold      = 0.10f; // > 0.5 disables XTC
-    float   typ_p              = 1.00f; // typical_p, 1.0 = disabled
-    float   temp               = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
-    float   dynatemp_range     = 0.00f; // 0.0 = disabled
-    float   dynatemp_exponent  = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
-    int32_t penalty_last_n     = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    float   penalty_repeat     = 1.00f; // 1.0 = disabled
-    float   penalty_freq       = 0.00f; // 0.0 = disabled
-    float   penalty_present    = 0.00f; // 0.0 = disabled
-    float   dry_multiplier     = 0.0f;  // 0.0 = disabled;      DRY repetition penalty for tokens extending repetition:
-    float   dry_base           = 1.75f; // 0.0 = disabled;      multiplier * base ^ (length of sequence before token - allowed length)
-    int32_t dry_allowed_length = 2;     // tokens extending repetitions beyond this receive penalty
-    int32_t dry_penalty_last_n = -1;    // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
-    int32_t mirostat           = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
-    float   top_n_sigma        = -1.00f;// -1.0 = disabled
-    float   mirostat_tau       = 5.00f; // target entropy
-    float   mirostat_eta       = 0.10f; // learning rate
-    bool    ignore_eos         = false;
-    bool    no_perf            = false; // disable performance metrics
-    bool    timing_per_token   = false;
+    int32_t n_prev            = 64;    // number of previous tokens to remember
+    int32_t n_probs           = 0;     // if greater than 0, output the probabilities of top n_probs tokens.
+    int32_t min_keep          = 0;     // 0 = disabled, otherwise samplers should return at least min_keep tokens
+    int32_t top_k             = 40;    // <= 0 to use vocab size
+    float   top_p             = 0.95f; // 1.0 = disabled
+    float   min_p             = 0.05f; // 0.0 = disabled
+    float   tfs_z             = 1.00f; // 1.0 = disabled
+    float   typ_p             = 1.00f; // typical_p, 1.0 = disabled
+    float   temp              = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
+    float   dynatemp_range    = 0.00f; // 0.0 = disabled
+    float   dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
+    int32_t penalty_last_n    = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
+    float   penalty_repeat    = 1.00f; // 1.0 = disabled
+    float   penalty_freq      = 0.00f; // 0.0 = disabled
+    float   penalty_present   = 0.00f; // 0.0 = disabled
+    int32_t mirostat          = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    float   mirostat_tau      = 5.00f; // target entropy
+    float   mirostat_eta      = 0.10f; // learning rate
+    bool    penalize_nl       = false; // consider newlines as a repeatable token
+    bool    ignore_eos        = false;
+    bool    no_perf           = false; // disable performance metrics

-    std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"};     // default sequence breakers for DRY
-
-
-    std::vector<enum common_sampler_type> samplers = {
-        COMMON_SAMPLER_TYPE_PENALTIES,
-        COMMON_SAMPLER_TYPE_DRY,
-        COMMON_SAMPLER_TYPE_TOP_N_SIGMA,
-        COMMON_SAMPLER_TYPE_TOP_K,
-        COMMON_SAMPLER_TYPE_TYPICAL_P,
-        COMMON_SAMPLER_TYPE_TOP_P,
-        COMMON_SAMPLER_TYPE_MIN_P,
-        COMMON_SAMPLER_TYPE_XTC,
-        COMMON_SAMPLER_TYPE_TEMPERATURE,
+    std::vector<enum gpt_sampler_type> samplers = {
+        GPT_SAMPLER_TYPE_TOP_K,
+        GPT_SAMPLER_TYPE_TFS_Z,
+        GPT_SAMPLER_TYPE_TYPICAL_P,
+        GPT_SAMPLER_TYPE_TOP_P,
+        GPT_SAMPLER_TYPE_MIN_P,
+        GPT_SAMPLER_TYPE_TEMPERATURE
    };

-    std::string                         grammar; // optional BNF-like grammar to constrain sampling
-    bool                                grammar_lazy = false;
-    std::vector<common_grammar_trigger> grammar_triggers; // optional triggers (for lazy grammars)
-    std::set<llama_token>               preserved_tokens;
+    std::string grammar; // optional BNF-like grammar to constrain sampling

    std::vector<llama_logit_bias> logit_bias; // logit biases to apply

@@ -181,51 +141,21 @@ struct common_params_sampling {
    std::string print() const;
 };

-struct common_params_model {
-    std::string path    = ""; // model local path                                           // NOLINT
-    std::string url     = ""; // model url to download                                      // NOLINT
-    std::string hf_repo = ""; // HF repo                                                    // NOLINT
-    std::string hf_file = ""; // HF file                                                    // NOLINT
-};
-
-struct common_params_speculative {
-    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
-
-    int32_t n_ctx        =     0; // draft context size
-    int32_t n_max        =    16; // maximum number of tokens to draft during speculative decoding
-    int32_t n_min        =     0; // minimum number of draft tokens to use for speculative decoding
-    int32_t n_gpu_layers =    -1; // number of layers to store in VRAM for the draft model (-1 - use default)
-    float   p_split      =  0.1f; // speculative decoding split probability
-    float   p_min        = 0.75f; // minimum speculative decoding probability (greedy)
-
-    struct cpu_params cpuparams;
-    struct cpu_params cpuparams_batch;
-
-    struct common_params_model model;
-};
-
-struct common_params_vocoder {
-    struct common_params_model model;
-
-    std::string speaker_file = ""; // speaker file path                                      // NOLINT
-
-    bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy            // NOLINT
-};
-
-enum common_reasoning_format {
-    COMMON_REASONING_FORMAT_NONE,
-    COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`
-};
-
-struct common_params {
+struct gpt_params {
    int32_t n_predict             =    -1; // new tokens to predict
-    int32_t n_ctx                 =  4096; // context size
+    int32_t n_ctx                 =     0; // context size
    int32_t n_batch               =  2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_ubatch              =   512; // physical batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_keep                =     0; // number of tokens to keep from initial prompt
+    int32_t n_draft               =     5; // number of tokens to draft during speculative decoding
    int32_t n_chunks              =    -1; // max number of chunks to process (-1 = unlimited)
    int32_t n_parallel            =     1; // number of parallel sequences to decode
    int32_t n_sequences           =     1; // number of sequences to decode
+    float   p_split               =  0.1f; // speculative decoding split probability
+    int32_t n_gpu_layers          =    -1; // number of layers to store in VRAM (-1 - use default)
+    int32_t n_gpu_layers_draft    =    -1; // number of layers to store in VRAM for the draft model (-1 - use default)
+    int32_t main_gpu              =     0; // the GPU that is used for scratch and small tensors
+    float   tensor_split[128]     =   {0}; // how split tensors should be distributed across GPUs
    int32_t grp_attn_n            =     1; // group-attention factor
    int32_t grp_attn_w            =   512; // group-attention width
    int32_t n_print               =    -1; // print token count every n tokens (-1 = disabled)
@@ -236,56 +166,51 @@ struct common_params {
    float   yarn_beta_fast        = 32.0f; // YaRN low correction dim
    float   yarn_beta_slow        =  1.0f; // YaRN high correction dim
    int32_t yarn_orig_ctx         =     0; // YaRN original context length
-    float   defrag_thold          =  0.1f; // KV cache defragmentation threshold
-
-    // offload params
-    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
-
-    int32_t n_gpu_layers      = -1;  // number of layers to store in VRAM (-1 - use default)
-    int32_t main_gpu          = 0;   // the GPU that is used for scratch and small tensors
-    float   tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
-
-    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
+    float   defrag_thold          = -1.0f; // KV cache defragmentation threshold

    struct cpu_params cpuparams;
    struct cpu_params cpuparams_batch;
+    struct cpu_params draft_cpuparams;
+    struct cpu_params draft_cpuparams_batch;

    ggml_backend_sched_eval_callback cb_eval = nullptr;
    void * cb_eval_user_data                 = nullptr;

    ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;

+    enum llama_split_mode        split_mode        = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
    enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
    enum llama_pooling_type      pooling_type      = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
    enum llama_attention_type    attention_type    = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings

-    struct common_params_sampling    sampling;
-    struct common_params_speculative speculative;
-    struct common_params_vocoder     vocoder;
+    struct gpt_sampler_params sparams;

-    struct common_params_model model;
-
-    std::string model_alias          = ""; // model alias                                                   // NOLINT
+    std::string model                = ""; // model path                                                    // NOLINT
+    std::string model_draft          = ""; // draft model for speculative decoding                          // NOLINT
+    std::string model_alias          = "unknown"; // model alias                                            // NOLINT
+    std::string model_url            = ""; // model url to download                                         // NOLINT
    std::string hf_token             = ""; // HF token                                                      // NOLINT
+    std::string hf_repo              = ""; // HF repo                                                       // NOLINT
+    std::string hf_file              = ""; // HF file                                                       // NOLINT
    std::string prompt               = "";                                                                  // NOLINT
-    std::string system_prompt        = "";                                                                  // NOLINT
    std::string prompt_file          = ""; // store the external prompt file name                           // NOLINT
    std::string path_prompt_cache    = ""; // path to file for saving/loading prompt eval state             // NOLINT
    std::string input_prefix         = ""; // string to prefix user inputs with                             // NOLINT
    std::string input_suffix         = ""; // string to suffix user inputs with                             // NOLINT
+    std::string logdir               = ""; // directory in which to save YAML log files                     // NOLINT
    std::string lookup_cache_static  = ""; // path of static ngram cache file for lookup decoding           // NOLINT
    std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding          // NOLINT
    std::string logits_file          = ""; // file for saving *all* logits                                  // NOLINT
+    std::string rpc_servers          = ""; // comma separated list of RPC servers                           // NOLINT

    std::vector<std::string> in_files;   // all input files
    std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
    std::vector<llama_model_kv_override> kv_overrides;
-    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;

-    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
-    std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
+    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
+    std::vector<llama_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale

-    std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
+    std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale

    int32_t verbosity                  = 0;
    int32_t control_vector_layer_start = -1; // layer range for control vector
@@ -307,11 +232,11 @@ struct common_params {
    bool   kl_divergence    = false; // compute KL divergence

    bool usage             = false; // print usage
-    bool completion        = false; // print source-able completion script
    bool use_color         = false; // use color to distinguish generations and inputs
    bool special           = false; // enable special token output
    bool interactive       = false; // interactive mode
    bool interactive_first = false; // wait for user input immediately
+    bool conversation      = false; // conversation mode (does not print special tokens and suffix/prefix)
    bool prompt_cache_all  = false; // save user input and generations to prompt cache
    bool prompt_cache_ro   = false; // open the prompt cache read-only and do not update it

@@ -324,6 +249,7 @@ struct common_params {
    bool ctx_shift         = true;  // context shift on inifinite text generation

    bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
+    bool logits_all        = false; // return logits for all tokens in the batch
    bool use_mmap          = true;  // use mmap for faster loads
    bool use_mlock         = false; // use mlock to keep model in memory
    bool verbose_prompt    = false; // print prompt tokens before generation
@@ -332,51 +258,38 @@ struct common_params {
    bool no_kv_offload     = false; // disable KV offloading
    bool warmup            = true;  // warmup run
    bool check_tensors     = false; // validate tensor data
-    bool no_op_offload     = false; // globally disable offload host tensor operations to device

-    bool single_turn       = false; // single turn chat conversation
+    std::string cache_type_k = "f16"; // KV cache data type for the K
+    std::string cache_type_v = "f16"; // KV cache data type for the V

-    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
-    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
-
-    common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
-
-    // multimodal models (see tools/mtmd)
-    struct common_params_model mmproj;
-    bool mmproj_use_gpu = true;     // use GPU for multimodal model
-    bool no_mmproj = false;         // explicitly disable multimodal model
+    // multimodal models (see examples/llava)
+    std::string mmproj = "";        // path to multimodal projector                                         // NOLINT
    std::vector<std::string> image; // path to image file(s)

    // embedding
    bool embedding         = false; // get only sentence embedding
-    int32_t embd_normalize = 2;     // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
+    int32_t embd_normalize = 2;     // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
    std::string embd_out   = "";    // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
-    std::string embd_sep   = "\n";  // separator of embeddings
-    bool reranking         = false; // enable reranking support on server
+    std::string embd_sep   = "\n";  // separator of embeddings

    // server params
    int32_t port           = 8080;         // server listens on this network port
    int32_t timeout_read   = 600;          // http read timeout in seconds
    int32_t timeout_write  = timeout_read; // http write timeout in seconds
-    int32_t n_threads_http = -1;           // number of threads to process HTTP requests (TODO: support threadpool)
-    int32_t n_cache_reuse  = 0;            // min chunk size to reuse from the cache via KV shifting
+    int     n_threads_http = -1;           // number of threads to process HTTP requests (TODO: support threadpool)

    std::string hostname      = "127.0.0.1";
    std::string public_path   = "";                                                                         // NOLINT
    std::string chat_template = "";                                                                         // NOLINT
-    bool use_jinja = false;                                                                                 // NOLINT
+    std::string system_prompt = "";                                                                         // NOLINT
    bool enable_chat_template = true;
-    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;

    std::vector<std::string> api_keys;

    std::string ssl_file_key  = "";                                                                         // NOLINT
    std::string ssl_file_cert = "";                                                                         // NOLINT

-    // "advanced" endpoints are disabled by default for better security
-    bool webui            = true;
-    bool endpoint_slots   = false;
-    bool endpoint_props   = false; // only control POST requests, not GET
+    bool endpoint_slots   = true;
    bool endpoint_metrics = false;

    bool log_json = false;
@@ -404,72 +317,55 @@ struct common_params {
    int32_t i_pos  = -1;  // position of the passkey in the junk text

    // imatrix params
+    std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file
+
    int32_t n_out_freq  = 10; // output the imatrix every n_out_freq iterations
    int32_t n_save_freq =  0; // save the imatrix every n_save_freq iterations
    int32_t i_chunk     =  0; // start processing from this chunk

    bool process_output = false; // collect data for the output tensor
    bool compute_ppl    = true;  // whether to compute perplexity
-    bool parse_special  = false; // whether to parse special tokens during imatrix tokenization

    // cvector-generator params
    int n_pca_batch = 100;
    int n_pca_iterations = 1000;
    dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
-    std::string cvector_positive_file = "tools/cvector-generator/positive.txt";
-    std::string cvector_negative_file = "tools/cvector-generator/negative.txt";
+    std::string cvector_outfile       = "control_vector.gguf";
+    std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
+    std::string cvector_negative_file = "examples/cvector-generator/negative.txt";

    bool spm_infill = false; // suffix/prefix/middle pattern for infill

+    std::string lora_outfile = "ggml-lora-merged-f16.gguf";
+
    // batched-bench params
    bool batched_bench_output_jsonl = false;
-
-    // common params
-    std::string out_file; // output filename for all example programs
 };

 // call once at the start of a program if it uses libcommon
 // initializes the logging system and prints info about the build
-void common_init();
+void gpt_init();

-std::string common_params_get_system_info(const common_params & params);
+std::string gpt_params_get_system_info(const gpt_params & params);

-bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]);
-bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
-void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
+bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);
+bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
+void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr);
 bool set_process_priority(enum ggml_sched_priority prio);

 //
 // String utils
 //

-#ifdef __GNUC__
-#    if defined(__MINGW32__) && !defined(__clang__)
-#        define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
-#    else
-#        define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
-#    endif
-#else
-#    define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
-#endif
-
-LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
-std::string string_format(const char * fmt, ...);
+std::vector<std::string> string_split(std::string input, char separator);

 std::string string_strip(const std::string & str);
 std::string string_get_sortable_timestamp();

-std::string string_join(const std::vector<std::string> & values, const std::string & separator);
-std::vector<std::string> string_split(const std::string & str, const std::string & delimiter);
-std::string string_repeat(const std::string & str, size_t n);
-
 void string_replace_all(std::string & s, const std::string & search, const std::string & replace);

-std::string regex_escape(const std::string & s);
-
 template<class T>
 static std::vector<T> string_split(const std::string & str, char delim) {
-    static_assert(!std::is_same<T, std::string>::value, "Please use the specialized version for std::string");
    std::vector<T> values;
    std::istringstream str_stream(str);
    std::string token;
@@ -482,32 +378,6 @@ static std::vector<T> string_split(const std::string & str, char delim) {
    return values;
 }

-template<>
-std::vector<std::string> string_split<std::string>(const std::string & input, char separator)
-{
-    std::vector<std::string> parts;
-    size_t begin_pos = 0;
-    size_t separator_pos = input.find(separator);
-    while (separator_pos != std::string::npos) {
-        std::string part = input.substr(begin_pos, separator_pos - begin_pos);
-        parts.emplace_back(part);
-        begin_pos = separator_pos + 1;
-        separator_pos = input.find(separator, begin_pos);
-    }
-    parts.emplace_back(input.substr(begin_pos, separator_pos - begin_pos));
-    return parts;
-}
-
-static bool string_starts_with(const std::string & str,
-                               const std::string & prefix) {  // While we wait for C++20's std::string::starts_with...
-    return str.rfind(prefix, 0) == 0;
-}
-
-static bool string_ends_with(const std::string & str,
-                               const std::string & suffix) {  // While we wait for C++20's std::string::ends_with...
-    return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
-}
-
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 void string_process_escapes(std::string & input);

@@ -530,122 +400,130 @@ std::string fs_get_cache_file(const std::string & filename);
 // Model utils
 //

-// note: defines object's lifetime
-struct common_init_result {
-    llama_model_ptr   model;
-    llama_context_ptr context;
-
-    std::vector<llama_adapter_lora_ptr> lora;
+struct llama_init_result {
+    struct llama_model   * model   = nullptr;
+    struct llama_context * context = nullptr;
+    std::vector<llama_lora_adapter_container> lora_adapters;
 };

-struct common_init_result     common_init_from_params(common_params & params);
+struct llama_init_result    llama_init_from_gpt_params(gpt_params & params);

-struct llama_model_params     common_model_params_to_llama  (      common_params & params);
-struct llama_context_params   common_context_params_to_llama(const common_params & params);
+struct llama_model_params     llama_model_params_from_gpt_params    (const gpt_params & params);
+struct llama_context_params   llama_context_params_from_gpt_params  (const gpt_params & params);
 struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);

+struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
+struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
+
 // clear LoRA adapters from context, then apply new list of adapters
-void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
+void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters);

-std::string                   get_model_endpoint();
-
-//
 // Batch utils
-//

-void common_batch_clear(struct llama_batch & batch);
+void llama_batch_clear(struct llama_batch & batch);

-void common_batch_add(
+void llama_batch_add(
                 struct llama_batch & batch,
                        llama_token   id,
                          llama_pos   pos,
    const std::vector<llama_seq_id> & seq_ids,
                               bool   logits);

-//
-// Token utils
-//
-
-// longest common prefix
-size_t common_lcp(const llama_tokens & a, const llama_tokens & b);
-
-// longet common subsequence
-size_t common_lcs(const llama_tokens & a, const llama_tokens & b);
-
 //
 // Vocab utils
 //

 // tokenizes a string into a vector of tokens
 // should work similar to Python's `tokenizer.encode`
-std::vector<llama_token> common_tokenize(
+std::vector<llama_token> llama_tokenize(
  const struct llama_context * ctx,
           const std::string & text,
                        bool   add_special,
                        bool   parse_special = false);

-std::vector<llama_token> common_tokenize(
-    const struct llama_vocab * vocab,
+std::vector<llama_token> llama_tokenize(
+    const struct llama_model * model,
           const std::string & text,
                        bool   add_special,
                        bool   parse_special = false);

 // tokenizes a token into a piece, optionally renders special/control tokens
 // should work similar to Python's `tokenizer.id_to_piece`
-std::string common_token_to_piece(
+std::string llama_token_to_piece(
        const struct llama_context * ctx,
                       llama_token   token,
                       bool          special = true);

-std::string common_token_to_piece(
-          const struct llama_vocab * vocab,
-                       llama_token   token,
-                       bool          special = true);
-
 // detokenizes a vector of tokens into a string
 // should work similar to Python's `tokenizer.decode`
 // optionally renders special/control tokens
-std::string common_detokenize(
-            const struct llama_context * ctx,
+std::string llama_detokenize(
+                         llama_context * ctx,
        const std::vector<llama_token> & tokens,
                                  bool   special = true);

-std::string common_detokenize(
-              const struct llama_vocab * vocab,
-        const std::vector<llama_token> & tokens,
-                                  bool   special = true);
+//
+// Chat template utils
+//
+
+// same as llama_chat_message, but uses std::string
+struct llama_chat_msg {
+    std::string role;
+    std::string content;
+};
+
+// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
+bool llama_chat_verify_template(const std::string & tmpl);
+
+// CPP wrapper for llama_chat_apply_template
+// If the built-in template is not supported, we default to chatml
+// If the custom "tmpl" is not supported, we throw an error
+std::string llama_chat_apply_template(const struct llama_model * model,
+        const std::string & tmpl,
+        const std::vector<llama_chat_msg> & chat,
+        bool add_ass);
+
+// Format single message, while taking into account the position of that message in chat history
+std::string llama_chat_format_single(const struct llama_model * model,
+        const std::string & tmpl,
+        const std::vector<llama_chat_msg> & past_msg,
+        const llama_chat_msg & new_msg,
+        bool add_ass);
+
+// Returns an example of formatted chat
+std::string llama_chat_format_example(const struct llama_model * model,
+        const std::string & tmpl);

 //
 // KV cache utils
 //

 // Dump the KV cache view with the number of sequences per cell.
-void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
+void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);

 // Dump the KV cache view showing individual sequences in each cell (long output).
-void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
+void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);

 //
 // Embedding utils
 //

-// TODO: repace embd_norm with an enum
-void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);
+void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);

-float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
+float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);

 //
 // Control vector utils
 //

-struct common_control_vector_data {
+struct llama_control_vector_data {
    int n_embd;

    // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
    std::vector<float> data;
 };

-struct common_control_vector_load_info {
+struct llama_control_vector_load_info {
    float strength;

    std::string fname;
@@ -653,22 +531,24 @@ struct common_control_vector_load_info {

 // Load control vectors, scale each by strength, and add them together.
 // On error, returns {-1, empty}
-common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos);
+llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos);

 //
 // Split utils
 //

-namespace {
-
-const char * const LLM_KV_SPLIT_NO            = "split.no";
-const char * const LLM_KV_SPLIT_COUNT         = "split.count";
-const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
-
-}
+static const char * const LLM_KV_SPLIT_NO            = "split.no";
+static const char * const LLM_KV_SPLIT_COUNT         = "split.count";
+static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";

 //
-// training utils
+// YAML utils
 //

-ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride);
+void yaml_dump_vector_float    (FILE * stream, const char * prop_name, const std::vector<float> & data);
+void yaml_dump_vector_int      (FILE * stream, const char * prop_name, const std::vector<int> & data);
+void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);
+
+void yaml_dump_non_result_info(
+    FILE * stream, const gpt_params & params, const llama_context * lctx,
+    const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
--- a/common/console.cpp
+++ b/common/console.cpp
@@ -94,9 +94,6 @@ namespace console {
                simple_io = true;
            }
        }
-        if (simple_io) {
-            _setmode(_fileno(stdin), _O_U8TEXT);
-        }
 #else
        // POSIX-specific console initialization
        if (!simple_io) {
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -1,6 +1,4 @@
 #include "json-schema-to-grammar.h"
-#include "common.h"
-
 #include <algorithm>
 #include <fstream>
 #include <map>
@@ -13,12 +11,14 @@

 using json = nlohmann::ordered_json;

+template <typename Iterator>
+static std::string join(Iterator begin, Iterator end, const std::string & separator);
+
+static std::string repeat(const std::string & str, size_t n);
+
 static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") {
    auto has_max = max_items != std::numeric_limits<int>::max();

-    if (max_items == 0) {
-        return "";
-    }
    if (min_items == 0 && max_items == 1) {
        return item_rule + "?";
    }
@@ -128,8 +128,8 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
                if (sub_len > 0) {
                    auto from_sub = from.substr(i + 1);
                    auto to_sub = to.substr(i + 1);
-                    auto sub_zeros = string_repeat("0", sub_len);
-                    auto sub_nines = string_repeat("9", sub_len);
+                    auto sub_zeros = repeat("0", sub_len);
+                    auto sub_nines = repeat("9", sub_len);

                    auto to_reached = false;
                    out << "(";
@@ -188,8 +188,8 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
        auto max_digits = max_s.length();

        for (auto digits = min_digits; digits < max_digits; digits++) {
-            uniform_range(min_s, string_repeat("9", digits));
-            min_s = "1" + string_repeat("0", digits);
+            uniform_range(min_s, repeat("9", digits));
+            min_s = "1" + repeat("0", digits);
            out << " | ";
        }
        uniform_range(min_s, max_s);
@@ -267,7 +267,7 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
    throw std::runtime_error("At least one of min_value or max_value must be set");
 }

-const std::string SPACE_RULE = "| \" \" | \"\\n\"{1,2} [ \\t]{0,20}";
+const std::string SPACE_RULE = "| \" \" | \"\\n\" [ \\t]{0,20}";

 struct BuiltinRule {
    std::string content;
@@ -318,6 +318,49 @@ std::unordered_map<char, std::string> GRAMMAR_LITERAL_ESCAPES = {
 std::unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
 std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'^', '$', '.', '[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};

+template <typename Iterator>
+std::string join(Iterator begin, Iterator end, const std::string & separator) {
+    std::ostringstream result;
+    if (begin != end) {
+        result << *begin;
+        for (Iterator it = begin + 1; it != end; ++it) {
+            result << separator << *it;
+        }
+    }
+    return result.str();
+}
+
+static std::vector<std::string> split(const std::string & str, const std::string & delimiter) {
+    std::vector<std::string> tokens;
+    size_t start = 0;
+    size_t end = str.find(delimiter);
+
+    while (end != std::string::npos) {
+        tokens.push_back(str.substr(start, end - start));
+        start = end + delimiter.length();
+        end = str.find(delimiter, start);
+    }
+
+    tokens.push_back(str.substr(start));
+
+    return tokens;
+}
+
+static std::string repeat(const std::string & str, size_t n) {
+    if (n == 0) {
+        return "";
+    }
+
+    std::string result;
+    result.reserve(str.length() * n);
+
+    for (size_t i = 0; i < n; ++i) {
+        result += str;
+    }
+
+    return result;
+}
+
 static std::string replacePattern(const std::string & input, const std::regex & regex, const std::function<std::string(const std::smatch  &)> & replacement) {
    std::smatch match;
    std::string result;
@@ -346,7 +389,6 @@ static std::string format_literal(const std::string & literal) {

 class SchemaConverter {
 private:
-    friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
    std::function<json(const std::string &)> _fetch_json;
    bool _dotall;
    std::map<std::string, std::string> _rules;
@@ -376,7 +418,7 @@ private:
        for (size_t i = 0; i < alt_schemas.size(); i++) {
            rules.push_back(visit(alt_schemas[i], name + (name.empty() ? "alternative-" : "-") + std::to_string(i)));
        }
-        return string_join(rules, " | ");
+        return join(rules.begin(), rules.end(), " | ");
    }

    std::string _visit_pattern(const std::string & pattern, const std::string & name) {
@@ -439,7 +481,7 @@ private:
                for (const auto & item : ret) {
                    results.push_back(to_rule(item));
                }
-                return std::make_pair(string_join(results, " "), false);
+                return std::make_pair(join(results.begin(), results.end(), " "), false);
            };

            while (i < length) {
@@ -497,7 +539,7 @@ private:
                    }
                    curly_brackets += '}';
                    i++;
-                    auto nums = string_split(curly_brackets.substr(1, curly_brackets.length() - 2), ",");
+                    auto nums = split(curly_brackets.substr(1, curly_brackets.length() - 2), ",");
                    int min_times = 0;
                    int max_times = std::numeric_limits<int>::max();
                    try {
@@ -569,7 +611,7 @@ private:
            }
            return join_seq();
        };
-        return _add_rule(name, "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space");
+        return _add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space");
    }

    /*
@@ -812,7 +854,7 @@ public:
                            return;
                        }
                        std::string pointer = ref.substr(ref.find('#') + 1);
-                        std::vector<std::string> tokens = string_split(pointer, "/");
+                        std::vector<std::string> tokens = split(pointer, "/");
                        for (size_t i = 1; i < tokens.size(); ++i) {
                            std::string sel = tokens[i];
                            if (target.is_null() || !target.contains(sel)) {
@@ -863,7 +905,7 @@ public:
            for (const auto & v : schema["enum"]) {
                enum_values.push_back(_generate_constant_rule(v));
            }
-            return _add_rule(rule_name, "(" + string_join(enum_values, " | ") + ") space");
+            return _add_rule(rule_name, "(" + join(enum_values.begin(), enum_values.end(), " | ") + ") space");
        } else if ((schema_type.is_null() || schema_type == "object")
                && (schema.contains("properties") ||
                    (schema.contains("additionalProperties") && schema["additionalProperties"] != true))) {
@@ -977,10 +1019,10 @@ public:

    void check_errors() {
        if (!_errors.empty()) {
-            throw std::runtime_error("JSON schema conversion failed:\n" + string_join(_errors, "\n"));
+            throw std::runtime_error("JSON schema conversion failed:\n" + join(_errors.begin(), _errors.end(), "\n"));
        }
        if (!_warnings.empty()) {
-            fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n", string_join(_warnings, "; ").c_str());
+            fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n", join(_warnings.begin(), _warnings.end(), "; ").c_str());
        }
    }

@@ -993,35 +1035,11 @@ public:
    }
 };

-std::string json_schema_to_grammar(const json & schema, bool force_gbnf) {
-#ifdef LLAMA_USE_LLGUIDANCE
-    if (!force_gbnf) {
-        return "%llguidance {}\nstart: %json " + schema.dump();
-    }
-#else
-    (void)force_gbnf;
-#endif // LLAMA_USE_LLGUIDANCE
-    return build_grammar([&](const common_grammar_builder & callbacks) {
-        auto copy = schema;
-        callbacks.resolve_refs(copy);
-        callbacks.add_schema("", copy);
-    });
-}
-
-std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options) {
-    SchemaConverter converter([&](const std::string &) { return json(); }, options.dotall);
-    common_grammar_builder builder {
-        /* .add_rule = */ [&](const std::string & name, const std::string & rule) {
-            return converter._add_rule(name, rule);
-        },
-        /* .add_schema = */ [&](const std::string & name, const nlohmann::ordered_json & schema) {
-            return converter.visit(schema, name == "root" ? "" : name);
-        },
-        /* .resolve_refs = */ [&](nlohmann::ordered_json & schema) {
-            converter.resolve_refs(schema, "");
-        }
-    };
-    cb(builder);
+std::string json_schema_to_grammar(const json & schema) {
+    SchemaConverter converter([](const std::string &) { return json::object(); }, /* dotall= */ false);
+    auto copy = schema;
+    converter.resolve_refs(copy, "input");
+    converter.visit(copy, "");
    converter.check_errors();
    return converter.format_grammar();
 }
--- a/common/json-schema-to-grammar.h
+++ b/common/json-schema-to-grammar.h
@@ -5,17 +5,4 @@
 #define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"

-std::string json_schema_to_grammar(const nlohmann::ordered_json & schema,
-                                   bool force_gbnf = false);
-
-struct common_grammar_builder {
-    std::function<std::string(const std::string &, const std::string &)> add_rule;
-    std::function<std::string(const std::string &, const nlohmann::ordered_json &)> add_schema;
-    std::function<void(nlohmann::ordered_json &)> resolve_refs;
-};
-
-struct common_grammar_options {
-    bool dotall = false;
-};
-
-std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options = {});
+std::string json_schema_to_grammar(const nlohmann::ordered_json& schema);
--- a/common/llguidance.cpp
+++ b/common/llguidance.cpp
@@ -1,254 +0,0 @@
-#include "sampling.h"
-#include "log.h"
-
-#ifdef LLAMA_USE_LLGUIDANCE
-
-#    include "llguidance.h"
-#    include <cmath>
-
-struct llama_sampler_llg {
-    const llama_vocab * vocab;
-    std::string         grammar_kind;
-    std::string         grammar_data;
-    LlgTokenizer *      tokenizer;
-    LlgMatcher *        grammar;
-};
-
-static LlgMatcher * llama_sampler_llg_new(LlgTokenizer * tokenizer, const char * grammar_kind,
-                                          const char * grammar_data) {
-    LlgConstraintInit cinit;
-    llg_constraint_init_set_defaults(&cinit, tokenizer);
-    const char * log_level = getenv("LLGUIDANCE_LOG_LEVEL");
-    if (log_level && *log_level) {
-        cinit.log_stderr_level = atoi(log_level);
-    }
-    auto c = llg_new_matcher(&cinit, grammar_kind, grammar_data);
-    if (llg_matcher_get_error(c)) {
-        LOG_ERR("llg error: %s\n", llg_matcher_get_error(c));
-        llg_free_matcher(c);
-        return nullptr;
-    }
-
-    return c;
-}
-
-static const char * llama_sampler_llg_name(const llama_sampler * /*smpl*/) {
-    return "llguidance";
-}
-
-static void llama_sampler_llg_accept_impl(llama_sampler * smpl, llama_token token) {
-    auto * ctx = (llama_sampler_llg *) smpl->ctx;
-    if (ctx->grammar) {
-        llg_matcher_consume_token(ctx->grammar, token);
-    }
-}
-
-static void llama_sampler_llg_apply(llama_sampler * smpl, llama_token_data_array * cur_p) {
-    auto * ctx = (llama_sampler_llg *) smpl->ctx;
-    if (ctx->grammar) {
-        const uint32_t * mask = llg_matcher_get_mask(ctx->grammar);
-        if (mask == nullptr) {
-            if (llg_matcher_compute_mask(ctx->grammar) == 0) {
-                mask = llg_matcher_get_mask(ctx->grammar);
-            } else {
-                LOG_ERR("llg error: %s\n", llg_matcher_get_error(ctx->grammar));
-                llg_free_matcher(ctx->grammar);
-                ctx->grammar = nullptr;
-                return;
-            }
-        }
-
-        for (size_t i = 0; i < cur_p->size; ++i) {
-            auto token = cur_p->data[i].id;
-            if ((mask[token / 32] & (1 << (token % 32))) == 0) {
-                cur_p->data[i].logit = -INFINITY;
-            }
-        }
-    }
-}
-
-static void llama_sampler_llg_reset(llama_sampler * smpl) {
-    auto * ctx = (llama_sampler_llg *) smpl->ctx;
-    if (ctx->grammar) {
-        llg_matcher_reset(ctx->grammar);
-    }
-}
-
-static llama_sampler * llama_sampler_llg_clone(const llama_sampler * smpl) {
-    const auto * ctx = (const llama_sampler_llg *) smpl->ctx;
-
-    auto * result = llama_sampler_init_llg(ctx->vocab, nullptr, nullptr);
-
-    // copy the state
-    {
-        auto * result_ctx = (llama_sampler_llg *) result->ctx;
-
-        if (ctx->grammar) {
-            result_ctx->grammar_kind = ctx->grammar_kind;
-            result_ctx->grammar_data = ctx->grammar_data;
-            result_ctx->grammar      = llg_clone_matcher(ctx->grammar);
-            result_ctx->tokenizer    = llg_clone_tokenizer(ctx->tokenizer);
-        }
-    }
-
-    return result;
-}
-
-static void llama_sampler_llg_free(llama_sampler * smpl) {
-    const auto * ctx = (llama_sampler_llg *) smpl->ctx;
-
-    if (ctx->grammar) {
-        llg_free_matcher(ctx->grammar);
-        llg_free_tokenizer(ctx->tokenizer);
-    }
-
-    delete ctx;
-}
-
-static llama_sampler_i llama_sampler_llg_i = {
-    /* .name   = */ llama_sampler_llg_name,
-    /* .accept = */ llama_sampler_llg_accept_impl,
-    /* .apply  = */ llama_sampler_llg_apply,
-    /* .reset  = */ llama_sampler_llg_reset,
-    /* .clone  = */ llama_sampler_llg_clone,
-    /* .free   = */ llama_sampler_llg_free,
-};
-
-static size_t llama_sampler_llg_tokenize_fn(const void * user_data, const uint8_t * bytes, size_t bytes_len,
-                                            uint32_t * output_tokens, size_t output_tokens_len) {
-    const llama_vocab * vocab = (const llama_vocab *) user_data;
-    int                 r     = 0;
-    try {
-        r = llama_tokenize(vocab, (const char *) bytes, bytes_len, (int32_t *) output_tokens, output_tokens_len, false,
-                           true);
-    } catch (const std::exception & e) {
-        GGML_ABORT("llama_tokenize failed: %s\n", e.what());
-    }
-    if (r < 0) {
-        return -r;
-    }
-    return r;
-}
-
-static LlgTokenizer * llama_sampler_llg_new_tokenizer(const llama_vocab * vocab) {
-    // TODO store the tokenizer in the vocab somehow
-    static const llama_vocab * vocab_cache;
-    static LlgTokenizer *      tokenizer_cache;
-
-    if (vocab_cache == vocab) {
-        return llg_clone_tokenizer(tokenizer_cache);
-    }
-
-    auto tok_eos = llama_vocab_eot(vocab);
-    if (tok_eos == LLAMA_TOKEN_NULL) {
-        tok_eos = llama_vocab_eos(vocab);
-    }
-
-    size_t vocab_size = llama_vocab_n_tokens(vocab);
-
-    auto token_lens       = new uint32_t[vocab_size];
-    // we typically have ~7 bytes per token; let's go on the safe side here
-    auto token_bytes_size = vocab_size * 16 + 1024 * 1024;
-    auto token_bytes      = new uint8_t[token_bytes_size];
-
-    size_t offset = 0;
-    for (size_t i = 0; i < vocab_size; i++) {
-        size_t max_token = 1024;
-        if (token_bytes_size - offset < max_token) {
-            GGML_ABORT("token_bytes buffer too small\n");
-        }
-
-        llama_token token = i;
-        auto        dp    = (char *) token_bytes + offset;
-        auto        size  = llama_detokenize(vocab, &token, 1, dp, max_token, false, false);
-        if (size < 0) {
-            GGML_ABORT("llama_detokenize failed\n");
-        }
-        if (size == 0) {
-            size = llama_detokenize(vocab, &token, 1, dp + 1, max_token - 1, false, true);
-            if (size < 0) {
-                GGML_ABORT("llama_detokenize failed\n");
-            }
-            if (size != 0) {
-                *dp = '\xff';  // special token prefix marker
-                size += 1;
-            }
-        }
-
-        token_lens[i] = size;
-        offset += size;
-    }
-
-    LlgTokenizerInit tinit = {
-        /* .vocab_size                         = */ (uint32_t) vocab_size,
-        /* .tok_eos                            = */ (uint32_t) tok_eos,
-        /* .token_lens                         = */ token_lens,
-        /* .token_bytes                        = */ token_bytes,
-        /* .tokenizer_json                     = */ nullptr,
-        /* .tokenize_assumes_string            = */ true,
-        /* .tokenize_fn                        = */ llama_sampler_llg_tokenize_fn,
-        /* .use_approximate_greedy_tokenize_fn = */ false,
-        /* .tokenize_user_data                 = */ vocab,
-        /* .slices                             = */ nullptr,
-    };
-
-    char           error_buffer[1024];
-    LlgTokenizer * tokenizer = llg_new_tokenizer(&tinit, error_buffer, sizeof(error_buffer));
-
-    delete[] token_bytes;
-    delete[] token_lens;
-
-    if (tokenizer == nullptr) {
-        LOG_ERR("llg tokenizer error: %s\n", error_buffer);
-        return tokenizer;
-    }
-
-    if (tokenizer_cache) {
-        llg_free_tokenizer(tokenizer_cache);
-    }
-    vocab_cache     = vocab;
-    tokenizer_cache = tokenizer;
-
-    return llg_clone_tokenizer(tokenizer_cache);
-}
-
-llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab, const char * grammar_kind,
-                                       const char * grammar_data) {
-    auto * ctx = new llama_sampler_llg;
-
-    if (grammar_kind != nullptr && grammar_kind[0] != '\0') {
-        auto tokenizer = llama_sampler_llg_new_tokenizer(vocab);
-        *ctx           = {
-            /* .vocab        = */ vocab,
-            /* .grammar_kind = */ grammar_kind,
-            /* .grammar_data = */ grammar_data,
-            /* .tokenizer    = */ tokenizer,
-            /* .grammar      = */ llama_sampler_llg_new(tokenizer, grammar_kind, grammar_data),
-        };
-        if (ctx->grammar) {
-            GGML_ASSERT(((size_t) llama_vocab_n_tokens(vocab) + 31) / 32 * 4 ==
-                        llg_matcher_get_mask_byte_size(ctx->grammar));
-        }
-    } else {
-        *ctx = {
-            /* .vocab        = */ vocab,
-            /* .grammar_kind = */ {},
-            /* .grammar_data = */ {},
-            /* .tokenizer    = */ nullptr,
-            /* .grammar      = */ nullptr,
-        };
-    }
-
-    return llama_sampler_init(
-        /* .iface = */ &llama_sampler_llg_i,
-        /* .ctx   = */ ctx);
-}
-
-#else
-
-llama_sampler * llama_sampler_init_llg(const llama_vocab *, const char *, const char *) {
-    LOG_WRN("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
-    return nullptr;
-}
-
-#endif  // LLAMA_USE_LLGUIDANCE
--- a/common/log.cpp
+++ b/common/log.cpp
@@ -1,6 +1,5 @@
 #include "log.h"

-#include <chrono>
 #include <condition_variable>
 #include <cstdarg>
 #include <cstdio>
@@ -9,27 +8,37 @@
 #include <thread>
 #include <vector>

-int common_log_verbosity_thold = LOG_DEFAULT_LLAMA;
+int gpt_log_verbosity_thold = LOG_DEFAULT_LLAMA;

-void common_log_set_verbosity_thold(int verbosity) {
-    common_log_verbosity_thold = verbosity;
+void gpt_log_set_verbosity_thold(int verbosity) {
+    gpt_log_verbosity_thold = verbosity;
 }

+#define LOG_COL_DEFAULT "\033[0m"
+#define LOG_COL_BOLD    "\033[1m"
+#define LOG_COL_RED     "\033[31m"
+#define LOG_COL_GREEN   "\033[32m"
+#define LOG_COL_YELLOW  "\033[33m"
+#define LOG_COL_BLUE    "\033[34m"
+#define LOG_COL_MAGENTA "\033[35m"
+#define LOG_COL_CYAN    "\033[36m"
+#define LOG_COL_WHITE   "\033[37m"
+
 static int64_t t_us() {
    return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
 }

 // colors
-enum common_log_col : int {
-    COMMON_LOG_COL_DEFAULT = 0,
-    COMMON_LOG_COL_BOLD,
-    COMMON_LOG_COL_RED,
-    COMMON_LOG_COL_GREEN,
-    COMMON_LOG_COL_YELLOW,
-    COMMON_LOG_COL_BLUE,
-    COMMON_LOG_COL_MAGENTA,
-    COMMON_LOG_COL_CYAN,
-    COMMON_LOG_COL_WHITE,
+enum gpt_log_col : int {
+    GPT_LOG_COL_DEFAULT = 0,
+    GPT_LOG_COL_BOLD,
+    GPT_LOG_COL_RED,
+    GPT_LOG_COL_GREEN,
+    GPT_LOG_COL_YELLOW,
+    GPT_LOG_COL_BLUE,
+    GPT_LOG_COL_MAGENTA,
+    GPT_LOG_COL_CYAN,
+    GPT_LOG_COL_WHITE,
 };

 // disable colors by default
@@ -45,7 +54,7 @@ static std::vector<const char *> g_col = {
    "",
 };

-struct common_log_entry {
+struct gpt_log_entry {
    enum ggml_log_level level;

    bool prefix;
@@ -62,7 +71,7 @@ struct common_log_entry {
        if (!fcur) {
            // stderr displays DBG messages only when their verbosity level is not higher than the threshold
            // these messages will still be logged to a file
-            if (level == GGML_LOG_LEVEL_DEBUG && common_log_verbosity_thold < LOG_DEFAULT_DEBUG) {
+            if (level == GGML_LOG_LEVEL_DEBUG && gpt_log_verbosity_thold < LOG_DEFAULT_DEBUG) {
                return;
            }

@@ -73,23 +82,23 @@ struct common_log_entry {
            }
        }

-        if (level != GGML_LOG_LEVEL_NONE && level != GGML_LOG_LEVEL_CONT && prefix) {
+        if (level != GGML_LOG_LEVEL_NONE && prefix) {
            if (timestamp) {
                // [M.s.ms.us]
                fprintf(fcur, "%s%d.%02d.%03d.%03d%s ",
-                        g_col[COMMON_LOG_COL_BLUE],
+                        g_col[GPT_LOG_COL_BLUE],
                        (int) (timestamp / 1000000 / 60),
                        (int) (timestamp / 1000000 % 60),
                        (int) (timestamp / 1000 % 1000),
                        (int) (timestamp % 1000),
-                        g_col[COMMON_LOG_COL_DEFAULT]);
+                        g_col[GPT_LOG_COL_DEFAULT]);
            }

            switch (level) {
-                case GGML_LOG_LEVEL_INFO:  fprintf(fcur, "%sI %s", g_col[COMMON_LOG_COL_GREEN],   g_col[COMMON_LOG_COL_DEFAULT]); break;
-                case GGML_LOG_LEVEL_WARN:  fprintf(fcur, "%sW %s", g_col[COMMON_LOG_COL_MAGENTA], ""                        ); break;
-                case GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[COMMON_LOG_COL_RED],     ""                        ); break;
-                case GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[COMMON_LOG_COL_YELLOW],  ""                        ); break;
+                case GGML_LOG_LEVEL_INFO:  fprintf(fcur, "%sI %s", g_col[GPT_LOG_COL_GREEN],   g_col[GPT_LOG_COL_DEFAULT]); break;
+                case GGML_LOG_LEVEL_WARN:  fprintf(fcur, "%sW %s", g_col[GPT_LOG_COL_MAGENTA], ""                        ); break;
+                case GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[GPT_LOG_COL_RED],     ""                        ); break;
+                case GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[GPT_LOG_COL_YELLOW],  ""                        ); break;
                default:
                    break;
            }
@@ -98,18 +107,18 @@ struct common_log_entry {
        fprintf(fcur, "%s", msg.data());

        if (level == GGML_LOG_LEVEL_WARN || level == GGML_LOG_LEVEL_ERROR || level == GGML_LOG_LEVEL_DEBUG) {
-            fprintf(fcur, "%s", g_col[COMMON_LOG_COL_DEFAULT]);
+            fprintf(fcur, "%s", g_col[GPT_LOG_COL_DEFAULT]);
        }

        fflush(fcur);
    }
 };

-struct common_log {
+struct gpt_log {
    // default capacity - will be expanded if needed
-    common_log() : common_log(256) {}
+    gpt_log() : gpt_log(256) {}

-    common_log(size_t capacity) {
+    gpt_log(size_t capacity) {
        file = nullptr;
        prefix = false;
        timestamps = false;
@@ -128,7 +137,7 @@ struct common_log {
        resume();
    }

-    ~common_log() {
+    ~gpt_log() {
        pause();
        if (file) {
            fclose(file);
@@ -149,12 +158,12 @@ private:
    int64_t t_start;

    // ring buffer of entries
-    std::vector<common_log_entry> entries;
+    std::vector<gpt_log_entry> entries;
    size_t head;
    size_t tail;

    // worker thread copies into this
-    common_log_entry cur;
+    gpt_log_entry cur;

 public:
    void add(enum ggml_log_level level, const char * fmt, va_list args) {
@@ -197,7 +206,6 @@ public:
                vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args_copy);
            }
 #endif
-            va_end(args_copy);
        }

        entry.level = level;
@@ -211,7 +219,7 @@ public:
        tail = (tail + 1) % entries.size();
        if (tail == head) {
            // expand the buffer
-            std::vector<common_log_entry> new_entries(2*entries.size());
+            std::vector<gpt_log_entry> new_entries(2*entries.size());

            size_t new_tail = 0;

@@ -312,15 +320,15 @@ public:
        pause();

        if (colors) {
-            g_col[COMMON_LOG_COL_DEFAULT] = LOG_COL_DEFAULT;
-            g_col[COMMON_LOG_COL_BOLD]    = LOG_COL_BOLD;
-            g_col[COMMON_LOG_COL_RED]     = LOG_COL_RED;
-            g_col[COMMON_LOG_COL_GREEN]   = LOG_COL_GREEN;
-            g_col[COMMON_LOG_COL_YELLOW]  = LOG_COL_YELLOW;
-            g_col[COMMON_LOG_COL_BLUE]    = LOG_COL_BLUE;
-            g_col[COMMON_LOG_COL_MAGENTA] = LOG_COL_MAGENTA;
-            g_col[COMMON_LOG_COL_CYAN]    = LOG_COL_CYAN;
-            g_col[COMMON_LOG_COL_WHITE]   = LOG_COL_WHITE;
+            g_col[GPT_LOG_COL_DEFAULT] = LOG_COL_DEFAULT;
+            g_col[GPT_LOG_COL_BOLD]    = LOG_COL_BOLD;
+            g_col[GPT_LOG_COL_RED]     = LOG_COL_RED;
+            g_col[GPT_LOG_COL_GREEN]   = LOG_COL_GREEN;
+            g_col[GPT_LOG_COL_YELLOW]  = LOG_COL_YELLOW;
+            g_col[GPT_LOG_COL_BLUE]    = LOG_COL_BLUE;
+            g_col[GPT_LOG_COL_MAGENTA] = LOG_COL_MAGENTA;
+            g_col[GPT_LOG_COL_CYAN]    = LOG_COL_CYAN;
+            g_col[GPT_LOG_COL_WHITE]   = LOG_COL_WHITE;
        } else {
            for (size_t i = 0; i < g_col.size(); i++) {
                g_col[i] = "";
@@ -347,47 +355,47 @@ public:
 // public API
 //

-struct common_log * common_log_init() {
-    return new common_log;
+struct gpt_log * gpt_log_init() {
+    return new gpt_log;
 }

-struct common_log * common_log_main() {
-    static struct common_log log;
+struct gpt_log * gpt_log_main() {
+    static struct gpt_log log;

    return &log;
 }

-void common_log_pause(struct common_log * log) {
+void gpt_log_pause(struct gpt_log * log) {
    log->pause();
 }

-void common_log_resume(struct common_log * log) {
+void gpt_log_resume(struct gpt_log * log) {
    log->resume();
 }

-void common_log_free(struct common_log * log) {
+void gpt_log_free(struct gpt_log * log) {
    delete log;
 }

-void common_log_add(struct common_log * log, enum ggml_log_level level, const char * fmt, ...) {
+void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...) {
    va_list args;
    va_start(args, fmt);
    log->add(level, fmt, args);
    va_end(args);
 }

-void common_log_set_file(struct common_log * log, const char * file) {
+void gpt_log_set_file(struct gpt_log * log, const char * file) {
    log->set_file(file);
 }

-void common_log_set_colors(struct common_log * log, bool colors) {
+void gpt_log_set_colors(struct gpt_log * log, bool colors) {
    log->set_colors(colors);
 }

-void common_log_set_prefix(struct common_log * log, bool prefix) {
+void gpt_log_set_prefix(struct gpt_log * log, bool prefix) {
    log->set_prefix(prefix);
 }

-void common_log_set_timestamps(struct common_log * log, bool timestamps) {
+void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps) {
    log->set_timestamps(timestamps);
 }
--- a/common/log.h
+++ b/common/log.h
@@ -2,20 +2,9 @@

 #include "ggml.h" // for ggml_log_level

-#define LOG_CLR_TO_EOL  "\033[K\r"
-#define LOG_COL_DEFAULT "\033[0m"
-#define LOG_COL_BOLD    "\033[1m"
-#define LOG_COL_RED     "\033[31m"
-#define LOG_COL_GREEN   "\033[32m"
-#define LOG_COL_YELLOW  "\033[33m"
-#define LOG_COL_BLUE    "\033[34m"
-#define LOG_COL_MAGENTA "\033[35m"
-#define LOG_COL_CYAN    "\033[36m"
-#define LOG_COL_WHITE   "\033[37m"
-
 #ifndef __GNUC__
 #    define LOG_ATTRIBUTE_FORMAT(...)
-#elif defined(__MINGW32__) && !defined(__clang__)
+#elif defined(__MINGW32__)
 #    define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
 #else
 #    define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
@@ -25,23 +14,23 @@
 #define LOG_DEFAULT_LLAMA 0

 // needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower
-// set via common_log_set_verbosity()
-extern int common_log_verbosity_thold;
+// set via gpt_log_set_verbosity_thold()
+extern int gpt_log_verbosity_thold;

-void common_log_set_verbosity_thold(int verbosity); // not thread-safe
+void gpt_log_set_verbosity_thold(int verbosity); // not thread-safe

-// the common_log uses an internal worker thread to print/write log messages
+// the gpt_log uses an internal worker thread to print/write log messages
 // when the worker thread is paused, incoming log messages are discarded
-struct common_log;
+struct gpt_log;

-struct common_log * common_log_init();
-struct common_log * common_log_main(); // singleton, automatically destroys itself on exit
-void                common_log_pause (struct common_log * log); // pause  the worker thread, not thread-safe
-void                common_log_resume(struct common_log * log); // resume the worker thread, not thread-safe
-void                common_log_free  (struct common_log * log);
+struct gpt_log * gpt_log_init();
+struct gpt_log * gpt_log_main(); // singleton, automatically destroys itself on exit
+void             gpt_log_pause (struct gpt_log * log); // pause  the worker thread, not thread-safe
+void             gpt_log_resume(struct gpt_log * log); // resume the worker thread, not thread-safe
+void             gpt_log_free  (struct gpt_log * log);

 LOG_ATTRIBUTE_FORMAT(3, 4)
-void common_log_add(struct common_log * log, enum ggml_log_level level, const char * fmt, ...);
+void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...);

 // defaults: file = NULL, colors = false, prefix = false, timestamps = false
 //
@@ -65,10 +54,10 @@ void common_log_add(struct common_log * log, enum ggml_log_level level, const ch
 // D - debug   (stderr, V = LOG_DEFAULT_DEBUG)
 //

-void common_log_set_file      (struct common_log * log, const char * file);       // not thread-safe
-void common_log_set_colors    (struct common_log * log,       bool   colors);     // not thread-safe
-void common_log_set_prefix    (struct common_log * log,       bool   prefix);     // whether to output prefix to each log
-void common_log_set_timestamps(struct common_log * log,       bool   timestamps); // whether to output timestamps in the prefix
+void gpt_log_set_file      (struct gpt_log * log, const char * file);       // not thread-safe
+void gpt_log_set_colors    (struct gpt_log * log,       bool   colors);     // not thread-safe
+void gpt_log_set_prefix    (struct gpt_log * log,       bool   prefix);     // whether to output prefix to each log
+void gpt_log_set_timestamps(struct gpt_log * log,       bool   timestamps); // whether to output timestamps in the prefix

 // helper macros for logging
 // use these to avoid computing log arguments if the verbosity of the log is higher than the threshold
@@ -77,13 +66,13 @@ void common_log_set_timestamps(struct common_log * log,       bool   timestamps)
 //
 //   LOG_DBG("this is a debug message: %d\n", expensive_function());
 //
-// this will avoid calling expensive_function() if LOG_DEFAULT_DEBUG > common_log_verbosity_thold
+// this will avoid calling expensive_function() if LOG_DEFAULT_DEBUG > gpt_log_verbosity_thold
 //

 #define LOG_TMPL(level, verbosity, ...) \
    do { \
-        if ((verbosity) <= common_log_verbosity_thold) { \
-            common_log_add(common_log_main(), (level), __VA_ARGS__); \
+        if ((verbosity) <= gpt_log_verbosity_thold) { \
+            gpt_log_add(gpt_log_main(), (level), __VA_ARGS__); \
        } \
    } while (0)

@@ -94,10 +83,8 @@ void common_log_set_timestamps(struct common_log * log,       bool   timestamps)
 #define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN,  0,                 __VA_ARGS__)
 #define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, 0,                 __VA_ARGS__)
 #define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_DEFAULT_DEBUG, __VA_ARGS__)
-#define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT,  0,                 __VA_ARGS__)

 #define LOG_INFV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_INFO,  verbosity, __VA_ARGS__)
 #define LOG_WRNV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_WARN,  verbosity, __VA_ARGS__)
 #define LOG_ERRV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, verbosity, __VA_ARGS__)
 #define LOG_DBGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, verbosity, __VA_ARGS__)
-#define LOG_CNTV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_CONT,  verbosity, __VA_ARGS__)
--- a/common/minja/chat-template.hpp
+++ b/common/minja/chat-template.hpp
@@ -1,537 +0,0 @@
-/*
-    Copyright 2024 Google LLC
-
-    Use of this source code is governed by an MIT-style
-    license that can be found in the LICENSE file or at
-    https://opensource.org/licenses/MIT.
-*/
-// SPDX-License-Identifier: MIT
-#pragma once
-
-#include "minja.hpp"
-
-#include <chrono>
-#include <cstddef>
-#include <cstdio>
-#include <exception>
-#include <iomanip>
-#include <memory>
-#include <sstream>
-#include <string>
-#include <vector>
-
-#include <json.hpp>
-
-using json = nlohmann::ordered_json;
-
-namespace minja {
-
-struct chat_template_caps {
-    bool supports_tools = false;
-    bool supports_tool_calls = false;
-    bool supports_tool_responses = false;
-    bool supports_system_role = false;
-    bool supports_parallel_tool_calls = false;
-    bool supports_tool_call_id = false;
-    // meta-llama/Llama-3.1-8B-Instruct expects arguments to be an object.
-    // Most other templates (and OpenAI's API) expect the arguments object to be stringified.
-    bool requires_object_arguments = false;
-    // CohereForAI/c4ai-command-r-plus simple variant
-    bool requires_non_null_content = false;
-    // MiniMaxAI/MiniMax-Text-01 special
-    bool requires_typed_content = false;
-};
-
-struct chat_template_inputs {
-    nlohmann::ordered_json messages;
-    nlohmann::ordered_json tools;
-    bool add_generation_prompt = true;
-    nlohmann::ordered_json extra_context;
-    std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
-};
-
-struct chat_template_options {
-    bool apply_polyfills = true;
-    bool use_bos_token = true;
-    bool use_eos_token = true;
-    bool define_strftime_now = true;
-
-    bool polyfill_tools = true;
-    bool polyfill_tool_call_examples = true;
-    bool polyfill_tool_calls = true;
-    bool polyfill_tool_responses = true;
-    bool polyfill_system_role = true;
-    bool polyfill_object_arguments = true;
-    bool polyfill_typed_content = true;
-};
-
-class chat_template {
-
-  private:
-    chat_template_caps caps_;
-    std::string source_;
-    std::string bos_token_;
-    std::string eos_token_;
-    std::shared_ptr<minja::TemplateNode> template_root_;
-    std::string tool_call_example_;
-
-    std::string try_raw_render(
-        const nlohmann::ordered_json & messages,
-        const nlohmann::ordered_json & tools,
-        bool add_generation_prompt,
-        const nlohmann::ordered_json & extra_context = nlohmann::ordered_json()) const
-    {
-        try {
-            chat_template_inputs inputs;
-            inputs.messages = messages;
-            inputs.tools = tools;
-            inputs.add_generation_prompt = add_generation_prompt;
-            inputs.extra_context = extra_context;
-            // Use fixed date for tests
-            inputs.now = std::chrono::system_clock::from_time_t(0);
-
-            chat_template_options opts;
-            opts.apply_polyfills = false;
-
-            auto prompt = apply(inputs, opts);
-            // fprintf(stderr, "try_raw_render: %s\n", prompt.c_str());
-            return prompt;
-        } catch (const std::exception & e) {
-            // fprintf(stderr, "try_raw_render error: %s\n", e.what());
-            return "";
-        }
-    }
-
-  public:
-
-    chat_template(const std::string & source, const std::string & bos_token, const std::string & eos_token)
-        : source_(source), bos_token_(bos_token), eos_token_(eos_token)
-    {
-        template_root_ = minja::Parser::parse(source_, {
-            /* .trim_blocks = */ true,
-            /* .lstrip_blocks = */ true,
-            /* .keep_trailing_newline = */ false,
-        });
-
-        auto contains = [](const std::string & haystack, const std::string & needle) {
-            return haystack.find(needle) != std::string::npos;
-        };
-
-        const std::string user_needle = "<User Needle>";
-        const std::string sys_needle = "<System Needle>";
-        const json dummy_str_user_msg = {{"role", "user"}, {"content", user_needle}};
-        const json dummy_typed_user_msg = {{"role", "user"}, {"content", json::array({{{"type", "text"}, {"text", user_needle}}})}};
-
-        caps_.requires_typed_content =
-            !contains(try_raw_render(json::array({dummy_str_user_msg}), {}, false), user_needle)
-            && contains(try_raw_render(json::array({dummy_typed_user_msg}), {}, false), user_needle);
-
-        const auto dummy_user_msg = caps_.requires_typed_content
-            ? dummy_typed_user_msg
-            : dummy_str_user_msg;
-        const json needle_system_msg = {
-            {"role", "system"},
-            {"content", caps_.requires_typed_content ? json::array({{{"type", "text"}, {"text", sys_needle}}}) : json(sys_needle)},
-        };
-
-        caps_.supports_system_role = contains(try_raw_render({needle_system_msg, dummy_user_msg,}, {}, false), sys_needle);
-
-        auto out = try_raw_render(json::array({
-            dummy_user_msg
-        }), json::array({
-            {
-                {"name", "some_tool"},
-                {"type", "function"},
-                {"function", {
-                    {"name", "some_tool"},
-                    {"description", "Some tool."},
-                    {"parameters", {
-                        {"type", "object"},
-                        {"properties", {
-                            {"arg", {
-                                {"type", "string"},
-                                {"description", "Some argument."},
-                            }},
-                        }},
-                        {"required", json::array({ "arg" })},
-                    }},
-                }},
-            },
-        }), false);
-        caps_.supports_tools = contains(out, "some_tool");
-
-        auto make_tool_calls_msg = [&](const json & tool_calls) {
-            return json {
-                {"role", "assistant"},
-                {"content", nullptr},
-                {"tool_calls", tool_calls},
-            };
-        };
-        auto make_tool_call = [](const std::string & tool_name, const json & arguments) {
-            return json {
-                {"id", "call_1___"},
-                {"type", "function"},
-                {"function", {
-                    {"arguments", arguments},
-                    {"name", tool_name},
-                }},
-            };
-        };
-        const json dummy_args_obj {{"argument_needle", "print('Hello, World!')"}};
-
-        // Note: the arguments are rendered in both cases, but may be double-escaped, which we don't want.
-        out = try_raw_render(json::array({
-            dummy_user_msg,
-            make_tool_calls_msg(json::array({make_tool_call("ipython", dummy_args_obj.dump())})),
-        }), {}, false);
-        auto tool_call_renders_str_arguments = contains(out, "\"argument_needle\":") || contains(out, "'argument_needle':");
-        out = try_raw_render(json::array({
-            dummy_user_msg,
-            make_tool_calls_msg(json::array({make_tool_call("ipython", dummy_args_obj)})),
-        }), {}, false);
-        auto tool_call_renders_obj_arguments = contains(out, "\"argument_needle\":") || contains(out, "'argument_needle':");
-
-        caps_.supports_tool_calls = tool_call_renders_str_arguments || tool_call_renders_obj_arguments;
-        caps_.requires_object_arguments = !tool_call_renders_str_arguments && tool_call_renders_obj_arguments;
-        auto out_empty = try_raw_render(json::array({dummy_user_msg, {{"role", "assistant"}, {"content", ""}}}), {}, false);
-        auto out_null = try_raw_render(json::array({dummy_user_msg, {{"role", "assistant"}, {"content", nullptr}}}), {}, false);
-        caps_.requires_non_null_content = contains(out_empty, user_needle) && !contains(out_null, user_needle);
-
-        if (caps_.supports_tool_calls) {
-            auto dummy_args = caps_.requires_object_arguments ? dummy_args_obj : json(dummy_args_obj.dump());
-            auto tc1 = make_tool_call("test_tool1", dummy_args);
-            auto tc2 = make_tool_call("test_tool2", dummy_args);
-            auto out = try_raw_render(json::array({
-                dummy_user_msg,
-                make_tool_calls_msg(json::array({tc1, tc2})),
-            }), {}, false);
-            caps_.supports_parallel_tool_calls = contains(out, "test_tool1") && contains(out, "test_tool2");
-
-            out = try_raw_render(json::array({
-                dummy_user_msg,
-                make_tool_calls_msg(json::array({tc1})),
-                {
-                    {"role", "tool"},
-                    {"name", "test_tool1"},
-                    {"content", "Some response!"},
-                    {"tool_call_id", "call_911_"},
-                }
-            }), {}, false);
-            caps_.supports_tool_responses = contains(out, "Some response!");
-            caps_.supports_tool_call_id = contains(out, "call_911_");
-        }
-
-        try {
-            if (!caps_.supports_tools) {
-                const json user_msg {
-                    {"role", "user"},
-                    {"content", "Hey"},
-                };
-                const json args {
-                    {"arg1", "some_value"},
-                };
-                const json tool_call_msg {
-                    {"role", "assistant"},
-                    {"content", nullptr},
-                    {"tool_calls", json::array({
-                        {
-                            // TODO: detect if requires numerical id or fixed length == 6 like Nemo
-                            {"id", "call_1___"},
-                            {"type", "function"},
-                            {"function", {
-                                {"name", "tool_name"},
-                                {"arguments", (caps_.requires_object_arguments ? args : json(minja::Value(args).dump(-1, /* to_json= */ true)))},
-                            }},
-                        },
-                    })},
-                };
-                std::string prefix, full;
-                {
-                    chat_template_inputs inputs;
-                    inputs.messages = json::array({user_msg});
-                    inputs.add_generation_prompt = true;
-                    prefix = apply(inputs);
-                }
-                {
-                    chat_template_inputs inputs;
-                    inputs.messages = json::array({user_msg, tool_call_msg});
-                    inputs.add_generation_prompt = false;
-                    full = apply(inputs);
-                }
-                auto eos_pos_last = full.rfind(eos_token_);
-                if (eos_pos_last == prefix.size() - eos_token_.size() ||
-                      (full[full.size() - 1] == '\n' && (eos_pos_last == full.size() - eos_token_.size() - 1))) {
-                    full = full.substr(0, eos_pos_last);
-                }
-                size_t common_prefix_length = 0;
-                for (size_t i = 0; i < prefix.size() && i < full.size(); ++i) {
-                    if (prefix[i] != full[i]) {
-                        break;
-                    }
-                    if (prefix[i] == '<') {
-                        // DeepSeek R1's template (as of 20250209) adds a trailing <think> if add_generation_prompt,
-                        // but it removes thinking tags for past messages.
-                        // The prefix and full strings diverge at <think> vs. <｜tool▁calls▁begin｜>, we avoid consuming the leading <.
-                        continue;
-                    }
-                    common_prefix_length = i + 1;
-                }
-                auto example = full.substr(common_prefix_length);
-                if (example.find("tool_name") == std::string::npos && example.find("some_value") == std::string::npos) {
-                    fprintf(stderr, "Failed to infer a tool call example (possible template bug)\n");
-                } else {
-                    tool_call_example_ = example;
-                }
-            }
-        } catch (const std::exception & e) {
-            fprintf(stderr, "Failed to generate tool call example: %s\n", e.what());
-        }
-    }
-
-    const std::string & source() const { return source_; }
-    const std::string & bos_token() const { return bos_token_; }
-    const std::string & eos_token() const { return eos_token_; }
-    const chat_template_caps & original_caps() const { return caps_; }
-
-    // Deprecated, please use the form with chat_template_inputs and chat_template_options
-    std::string apply(
-        const nlohmann::ordered_json & messages,
-        const nlohmann::ordered_json & tools,
-        bool add_generation_prompt,
-        const nlohmann::ordered_json & extra_context = nlohmann::ordered_json(),
-        bool apply_polyfills = true)
-    {
-        fprintf(stderr, "[%s] Deprecated!\n", __func__);
-        chat_template_inputs inputs;
-        inputs.messages = messages;
-        inputs.tools = tools;
-        inputs.add_generation_prompt = add_generation_prompt;
-        inputs.extra_context = extra_context;
-        inputs.now = std::chrono::system_clock::now();
-
-        chat_template_options opts;
-        opts.apply_polyfills = apply_polyfills;
-
-        return apply(inputs, opts);
-    }
-
-    std::string apply(
-        const chat_template_inputs & inputs,
-        const chat_template_options & opts = chat_template_options()) const
-    {
-        json actual_messages;
-
-        auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
-        auto has_tool_calls = false;
-        auto has_tool_responses = false;
-        auto has_string_content = false;
-        for (const auto & message : inputs.messages) {
-            if (message.contains("tool_calls") && !message["tool_calls"].is_null()) {
-                has_tool_calls = true;
-            }
-            if (message.contains("role") && message["role"] == "tool") {
-                has_tool_responses = true;
-            }
-            if (message.contains("content") && message["content"].is_string()) {
-                has_string_content = true;
-            }
-        }
-
-        auto polyfill_system_role = opts.polyfill_system_role && !caps_.supports_system_role;
-        auto polyfill_tools = opts.polyfill_tools && has_tools && !caps_.supports_tools;
-        auto polyfill_tool_call_example = polyfill_tools && opts.polyfill_tool_call_examples;
-        auto polyfill_tool_calls = opts.polyfill_tool_calls && has_tool_calls && !caps_.supports_tool_calls;
-        auto polyfill_tool_responses = opts.polyfill_tool_responses && has_tool_responses && !caps_.supports_tool_responses;
-        auto polyfill_object_arguments = opts.polyfill_object_arguments && has_tool_calls && caps_.requires_object_arguments;
-        auto polyfill_typed_content = opts.polyfill_typed_content && has_string_content && caps_.requires_typed_content;
-
-        auto needs_polyfills = opts.apply_polyfills && (false
-            || polyfill_system_role
-            || polyfill_tools
-            || polyfill_tool_calls
-            || polyfill_tool_responses
-            || polyfill_object_arguments
-            || polyfill_typed_content
-        );
-
-        if (needs_polyfills) {
-            actual_messages = json::array();
-
-            auto add_message = [&](const json & msg) {
-                if (polyfill_typed_content && msg.contains("content") && !msg.at("content").is_null() && msg.at("content").is_string()) {
-                    actual_messages.push_back({
-                        {"role", msg.at("role")},
-                        {"content", {{
-                            {"type", "text"},
-                            {"text", msg.at("content")},
-                        }}},
-                    });
-                } else {
-                    actual_messages.push_back(msg);
-                }
-            };
-
-            std::string pending_system;
-            auto flush_sys = [&]() {
-                if (!pending_system.empty()) {
-                    add_message({
-                        {"role", "user"},
-                        {"content", pending_system},
-                    });
-                    pending_system.clear();
-                }
-            };
-
-            json adjusted_messages;
-            if (polyfill_tools) {
-                adjusted_messages = add_system(inputs.messages,
-                    "You can call any of the following tools to satisfy the user's requests: " + minja::Value(inputs.tools).dump(2, /* to_json= */ true) +
-                    (!polyfill_tool_call_example || tool_call_example_.empty() ? "" : "\n\nExample tool call syntax:\n\n" + tool_call_example_ + "\n\n"));
-            } else {
-                adjusted_messages = inputs.messages;
-            }
-
-            for (const auto & message_ : adjusted_messages) {
-                auto message = message_;
-                if (!message.contains("role") || !message.contains("content")) {
-                    throw std::runtime_error("message must have 'role' and 'content' fields: " + message.dump());
-                }
-                std::string role = message.at("role");
-
-                if (message.contains("tool_calls")) {
-                    if (polyfill_object_arguments || polyfill_tool_calls) {
-                        for (auto & tool_call : message.at("tool_calls")) {
-                            if (tool_call["type"] == "function") {
-                                auto & function = tool_call.at("function");
-                                auto & arguments = function.at("arguments");
-                                if (arguments.is_string()) {
-                                    try {
-                                        arguments = json::parse(arguments.get<std::string>());
-                                    } catch (const std::exception & ecvt) {
-                                        fprintf(stderr, "Failed to parse arguments: %s\n", ecvt.what());
-                                    }
-                                }
-                            }
-                        }
-                    }
-                    if (polyfill_tool_calls) {
-                        auto content = message.at("content");
-                        auto tool_calls = json::array();
-                        for (const auto & tool_call : message.at("tool_calls")) {
-                            if (tool_call.at("type") != "function") {
-                                continue;
-                            }
-                            const auto & function = tool_call.at("function");
-                            auto tc = json {
-                                {"name", function.at("name")},
-                                {"arguments", function.at("arguments")},
-                            };
-                            if (tool_call.contains("id")) {
-                                tc["id"] = tool_call["id"];
-                            }
-                            tool_calls.push_back(tc);
-                        }
-                        auto obj = json {
-                            {"tool_calls", tool_calls},
-                        };
-                        if (!content.is_null() && !content.empty()) {
-                            obj["content"] = content;
-                        }
-                        message["content"] = obj.dump(2);
-                        message.erase("tool_calls");
-                    }
-                }
-                if (polyfill_tool_responses && role == "tool") {
-                    message["role"] = "user";
-                    auto obj = json {
-                        {"tool_response", json::object()},
-                    };
-                    if (message.contains("name")) {
-                        obj["tool_response"]["tool"] = message.at("name");
-                    }
-                    obj["tool_response"]["content"] = message.at("content");
-                    if (message.contains("tool_call_id")) {
-                        obj["tool_response"]["tool_call_id"] = message.at("tool_call_id");
-                    }
-                    message["content"] = obj.dump(2);
-                    message.erase("name");
-                }
-
-                if (!message["content"].is_null() && polyfill_system_role) {
-                    std::string content = message.at("content");
-                    if (role == "system") {
-                        if (!pending_system.empty()) pending_system += "\n";
-                        pending_system += content;
-                        continue;
-                    } else {
-                        if (role == "user") {
-                            if (!pending_system.empty()) {
-                                message["content"] = pending_system + (content.empty() ? "" : "\n" + content);
-                                pending_system.clear();
-                            }
-                        } else {
-                            flush_sys();
-                        }
-                    }
-                }
-                add_message(message);
-            }
-            flush_sys();
-        } else {
-            actual_messages = inputs.messages;
-        }
-
-        auto context = minja::Context::make(json({
-            {"messages", actual_messages},
-            {"add_generation_prompt", inputs.add_generation_prompt},
-        }));
-        context->set("bos_token", opts.use_bos_token ? bos_token_ : "");
-        context->set("eos_token", opts.use_eos_token ? eos_token_ : "");
-        if (opts.define_strftime_now) {
-            auto now = inputs.now;
-            context->set("strftime_now", Value::callable([now](const std::shared_ptr<minja::Context> &, minja::ArgumentsValue & args) {
-                args.expectArgs("strftime_now", {1, 1}, {0, 0});
-                auto format = args.args[0].get<std::string>();
-
-                auto time = std::chrono::system_clock::to_time_t(now);
-                auto local_time = *std::localtime(&time);
-                std::ostringstream ss;
-                ss << std::put_time(&local_time, format.c_str());
-                return ss.str();
-            }));
-        }
-        if (!inputs.tools.is_null()) {
-            context->set("tools", minja::Value(inputs.tools));
-        }
-        if (!inputs.extra_context.is_null()) {
-            for (auto & kv : inputs.extra_context.items()) {
-                context->set(kv.key(), minja::Value(kv.value()));
-            }
-        }
-
-        auto ret = template_root_->render(context);
-        // fprintf(stderr, "actual_messages: %s\n", actual_messages.dump(2).c_str());
-        // fprintf(stderr, "apply: %s\n\n", ret.c_str());
-        return ret;
-    }
-
-    static nlohmann::ordered_json add_system(const nlohmann::ordered_json & messages, const std::string & system_prompt) {
-        json messages_with_system = messages;
-
-        if (!messages_with_system.empty() && messages_with_system[0].at("role") == "system") {
-            std::string existing_system = messages_with_system.at(0).at("content");
-            messages_with_system[0] = json {
-                {"role", "system"},
-                {"content", existing_system + "\n\n" + system_prompt},
-            };
-        } else {
-            messages_with_system.insert(messages_with_system.begin(), json {
-                {"role", "system"},
-                {"content", system_prompt},
-            });
-        }
-        return messages_with_system;
-    }
-};
-
-}  // namespace minja
--- a/common/minja/minja.hpp
+++ b/common/minja/minja.hpp
--- a/common/ngram-cache.cpp
+++ b/common/ngram-cache.cpp
@@ -7,9 +7,8 @@
 #include <cstdio>
 #include <fstream>
 #include <thread>
-#include <algorithm>

-void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
+void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
                              std::vector<llama_token> & inp, int nnew, bool print_progress) {
    const int64_t t_start_ms = ggml_time_ms();
    const int64_t inp_size = inp.size();
@@ -21,16 +20,16 @@ void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min,
        const int64_t i_start = std::max(inp_size - nnew, ngram_size);
        for (int64_t i = i_start; i < inp_size; ++i) {
            const int64_t ngram_start = i - ngram_size;
-            common_ngram ngram(&inp[ngram_start], ngram_size);
+            llama_ngram ngram(&inp[ngram_start], ngram_size);
            const llama_token token = inp[i];

-            common_ngram_cache::iterator part_it = ngram_cache.find(ngram);
+            llama_ngram_cache::iterator part_it = ngram_cache.find(ngram);
            if (part_it == ngram_cache.end()) {
-                common_ngram_cache_part part;
+                llama_ngram_cache_part part;
                part.emplace(token, 1);
                ngram_cache.emplace(ngram, part);
            } else {
-                common_ngram_cache_part::iterator token_count_it = part_it->second.find(token);
+                llama_ngram_cache_part::iterator token_count_it = part_it->second.find(token);
                if (token_count_it == part_it->second.end()) {
                    part_it->second.emplace(token, 1);
                } else {
@@ -63,16 +62,16 @@ constexpr int draft_min_sample_size_strict[LLAMA_NGRAM_MAX] = { 4,  3,  2,  2};
 constexpr int     draft_min_percent_strict[LLAMA_NGRAM_MAX] = {75, 66, 66, 66};

 // Helper function that tries to draft a token from only the static ngram cache:
-static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram ngram_static) {
-    common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
+static llama_token try_draft(llama_ngram_cache & nc_static, const llama_ngram ngram_static) {
+    llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
    if (part_static_it == nc_static.end()) {
-        return LLAMA_TOKEN_NULL;
+        return -1;
    }
-    const common_ngram_cache_part part_static = part_static_it->second;
+    const llama_ngram_cache_part part_static = part_static_it->second;

    int max_count_static  = 0;
    int sum_count_static  = 0;
-    llama_token max_token = LLAMA_TOKEN_NULL;
+    llama_token max_token = -1;

    for (std::pair<llama_token, int> token_count_static : part_static) {
        const llama_token token = token_count_static.first;
@@ -86,39 +85,39 @@ static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram
    }

    if (sum_count_static < draft_min_sample_size_lax[LLAMA_NGRAM_STATIC-1]) {
-        return LLAMA_TOKEN_NULL;
+        return -1;
    }
    if (100*max_count_static < draft_min_percent_lax[LLAMA_NGRAM_STATIC-1]*sum_count_static) {
-        return LLAMA_TOKEN_NULL;
+        return -1;
    }
    return max_token;
 }

 // Try to draft a token from primary cache (context/dynamic), validate with static cache:
 static llama_token try_draft(
-    common_ngram_cache & nc_primary, const std::vector<common_ngram> & ngrams_primary, common_ngram_cache_part & part_static,
+    llama_ngram_cache & nc_primary, const std::vector<llama_ngram> & ngrams_primary, llama_ngram_cache_part & part_static,
    const int * min_sample_size, const int * min_percent) {

-    llama_token drafted_token = LLAMA_TOKEN_NULL;
+    llama_token drafted_token = -1;

-    for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == LLAMA_TOKEN_NULL; --i) {
-        const common_ngram ngram_primary = ngrams_primary[i];
+    for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == -1; --i) {
+        const llama_ngram ngram_primary = ngrams_primary[i];

-        common_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary);
+        llama_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary);
        if (part_primary_it == nc_primary.end()) {
            continue;
        }
-        const common_ngram_cache_part part_primary = part_primary_it->second;
+        const llama_ngram_cache_part part_primary = part_primary_it->second;

        int max_count_primary = 0;
        int max_count_static  = 0;
        int sum_count_primary = 0;
-        llama_token max_token = LLAMA_TOKEN_NULL;
+        llama_token max_token = -1;

        for (std::pair<llama_token, int> token_count_primary : part_primary) {
            const llama_token token = token_count_primary.first;

-            common_ngram_cache_part::iterator token_count_static_it = part_static.find(token);
+            llama_ngram_cache_part::iterator token_count_static_it = part_static.find(token);

            const int32_t count_primary = token_count_primary.second;
            const int32_t count_static  = token_count_static_it != part_static.end() ? 100*token_count_static_it->second : 1;
@@ -143,9 +142,9 @@ static llama_token try_draft(
    return drafted_token;
 }

-void common_ngram_cache_draft(
+void llama_ngram_cache_draft(
    std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
-    common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static
+    llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static
 ) {
    GGML_ASSERT(draft.size() == 1);
    const int inp_size = inp.size();
@@ -155,40 +154,40 @@ void common_ngram_cache_draft(
    }

    while ((int) draft.size()-1 < n_draft) {
-        llama_token drafted_token = LLAMA_TOKEN_NULL;
+        llama_token drafted_token = -1;

        const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1;
-        common_ngram ngram_static;
+        llama_ngram ngram_static;
        for (int j = ngram_start_static; j < ngram_start_static + LLAMA_NGRAM_STATIC; ++j) {
            ngram_static.tokens[j-ngram_start_static] = get_token(inp, draft, j);
        }
-        common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
-        common_ngram_cache_part part_static;
+        llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
+        llama_ngram_cache_part part_static;
        if (part_static_it != nc_static.end()) {
            part_static = part_static_it->second;
        }

        // cd = context + dynamic
-        std::vector<common_ngram> ngrams_cd;
+        std::vector<llama_ngram> ngrams_cd;
        for (int ngram_size_cd = ngram_min; ngram_size_cd <= ngram_max; ++ngram_size_cd) {
            const int ngram_start_cd = inp_size-ngram_size_cd + draft.size()-1;
-            common_ngram ngram_cd;
+            llama_ngram ngram_cd;
            for (int j = ngram_start_cd; j < ngram_start_cd + ngram_size_cd; ++j) {
                ngram_cd.tokens[j-ngram_start_cd] = get_token(inp, draft, j);
            }
            ngrams_cd.push_back(ngram_cd);
        }
-        if (drafted_token == LLAMA_TOKEN_NULL) {
+        if (drafted_token == -1) {
            drafted_token = try_draft(nc_context, ngrams_cd, part_static, draft_min_sample_size_lax, draft_min_percent_lax);
        }
-        if (drafted_token == LLAMA_TOKEN_NULL) {
+        if (drafted_token == -1) {
            drafted_token = try_draft(nc_dynamic, ngrams_cd, part_static, draft_min_sample_size_strict, draft_min_percent_strict);
        }
-        if (drafted_token == LLAMA_TOKEN_NULL) {
+        if (drafted_token == -1) {
            drafted_token = try_draft(nc_static, ngram_static);
        }

-        if (drafted_token == LLAMA_TOKEN_NULL) {
+        if (drafted_token == -1) {
            break;
        }

@@ -197,16 +196,16 @@ void common_ngram_cache_draft(
    }
 }

-void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & filename) {
+void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename) {
    std::ofstream file_out(filename, std::ios::binary);
-    for (std::pair<common_ngram, common_ngram_cache_part> item : ngram_cache) {
-        const common_ngram      ngram        = item.first;
-        common_ngram_cache_part token_counts = item.second;
+    for (std::pair<llama_ngram, llama_ngram_cache_part> item : ngram_cache) {
+        const llama_ngram      ngram        = item.first;
+        llama_ngram_cache_part token_counts = item.second;
        GGML_ASSERT(!token_counts.empty());
        const int32_t ntokens = token_counts.size();
        GGML_ASSERT(ntokens > 0);

-        file_out.write(reinterpret_cast<const char *>(&ngram),   sizeof(common_ngram));
+        file_out.write(reinterpret_cast<const char *>(&ngram),   sizeof(llama_ngram));
        file_out.write(reinterpret_cast<const char *>(&ntokens), sizeof(int32_t));
        for (std::pair<llama_token, int32_t> item2 : token_counts) {
            const llama_token token = item2.first;
@@ -220,14 +219,14 @@ void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & fil

 }

-common_ngram_cache common_ngram_cache_load(std::string & filename) {
+llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
    std::ifstream hashmap_file(filename, std::ios::binary);
    if (!hashmap_file) {
        throw std::ifstream::failure("Unable to open file " + filename);
    }
-    common_ngram_cache ngram_cache;
+    llama_ngram_cache ngram_cache;

-    common_ngram ngram;
+    llama_ngram ngram;
    int32_t     ntokens;
    llama_token token;
    int32_t     count;
@@ -236,11 +235,11 @@ common_ngram_cache common_ngram_cache_load(std::string & filename) {
    char * ntokensc = reinterpret_cast<char*>(&ntokens);
    char * tokenc   = reinterpret_cast<char*>(&token);
    char * countc   = reinterpret_cast<char*>(&count);
-    while(hashmap_file.read(ngramc, sizeof(common_ngram))) {
+    while(hashmap_file.read(ngramc, sizeof(llama_ngram))) {
        GGML_ASSERT(!hashmap_file.eof());
        GGML_ASSERT(hashmap_file.read(ntokensc, sizeof(int32_t)));
        GGML_ASSERT(ntokens > 0);
-        common_ngram_cache_part token_counts;
+        llama_ngram_cache_part token_counts;

        for (int i = 0; i < ntokens; ++i) {
            GGML_ASSERT(!hashmap_file.eof());
@@ -258,12 +257,12 @@ common_ngram_cache common_ngram_cache_load(std::string & filename) {
    return ngram_cache;
 }

-void common_ngram_cache_merge(common_ngram_cache & ngram_cache_target, common_ngram_cache & ngram_cache_add) {
-    for (std::pair<common_ngram, common_ngram_cache_part> ngram_part : ngram_cache_add) {
-        const common_ngram      ngram = ngram_part.first;
-        common_ngram_cache_part  part = ngram_part.second;
+void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add) {
+    for (std::pair<llama_ngram, llama_ngram_cache_part> ngram_part : ngram_cache_add) {
+        const llama_ngram      ngram = ngram_part.first;
+        llama_ngram_cache_part  part = ngram_part.second;

-        common_ngram_cache::iterator part_merged_it = ngram_cache_target.find(ngram);
+        llama_ngram_cache::iterator part_merged_it = ngram_cache_target.find(ngram);
        if (part_merged_it == ngram_cache_target.end()) {
            ngram_cache_target.emplace(ngram, part);
            continue;
@@ -274,7 +273,7 @@ void common_ngram_cache_merge(common_ngram_cache & ngram_cache_target, common_ng
            const int32_t     count = token_count.second;
            GGML_ASSERT(count > 0);

-            common_ngram_cache_part::iterator token_count_merged_it = part_merged_it->second.find(token);
+            llama_ngram_cache_part::iterator token_count_merged_it = part_merged_it->second.find(token);
            if (token_count_merged_it == part_merged_it->second.end()) {
                part_merged_it->second.emplace(token, count);
                continue;
--- a/common/ngram-cache.h
+++ b/common/ngram-cache.h
@@ -12,22 +12,22 @@

 // Data structures to map n-grams to empirical token probabilities:

-struct common_ngram {
+struct llama_ngram {
    llama_token tokens[LLAMA_NGRAM_MAX];

-    common_ngram() {
+    llama_ngram() {
        for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
-            tokens[i] = LLAMA_TOKEN_NULL;
+            tokens[i] = -1;
        }
    }

-    common_ngram(const llama_token * input, const int ngram_size) {
+    llama_ngram(const llama_token * input, const int ngram_size) {
        for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
-            tokens[i] = i < ngram_size ? input[i] : LLAMA_TOKEN_NULL;
+            tokens[i] = i < ngram_size ? input[i] : -1;
        }
    }

-    bool operator==(const common_ngram & other) const {
+    bool operator==(const llama_ngram & other) const {
        for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
            if (tokens[i] != other.tokens[i]) {
                return false;
@@ -37,28 +37,28 @@ struct common_ngram {
    }
 };

-struct common_token_hash_function {
+struct llama_token_hash_function {
    size_t operator()(const llama_token token) const {
        // see https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
        return token * 11400714819323198485llu;
    }
 };

-struct common_ngram_hash_function {
-    size_t operator()(const common_ngram & ngram) const {
-        size_t hash = common_token_hash_function{}(ngram.tokens[0]);
+struct llama_ngram_hash_function {
+    size_t operator()(const llama_ngram & ngram) const {
+        size_t hash = llama_token_hash_function{}(ngram.tokens[0]);
        for (int i = 1; i < LLAMA_NGRAM_MAX; ++i) {
-            hash ^= common_token_hash_function{}(ngram.tokens[i]);
+            hash ^= llama_token_hash_function{}(ngram.tokens[i]);
        }
        return hash;
    }
 };

 // token -> number of times token has been seen
-typedef std::unordered_map<llama_token, int32_t> common_ngram_cache_part;
+typedef std::unordered_map<llama_token, int32_t> llama_ngram_cache_part;

 // n-gram -> empirical distribution of following tokens
-typedef std::unordered_map<common_ngram, common_ngram_cache_part, common_ngram_hash_function> common_ngram_cache;
+typedef std::unordered_map<llama_ngram, llama_ngram_cache_part, llama_ngram_hash_function> llama_ngram_cache;


 // Update an ngram cache with tokens.
@@ -70,8 +70,8 @@ typedef std::unordered_map<common_ngram, common_ngram_cache_part, common_ngram_h
 //
 // In order to get correct results inp_data can ONLY BE APPENDED TO.
 // Changes in the middle need a complete rebuild.
-void common_ngram_cache_update(
-    common_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector<llama_token> & inp_data, int nnew, bool print_progress);
+void llama_ngram_cache_update(
+    llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector<llama_token> & inp_data, int nnew, bool print_progress);

 // Try to draft tokens from ngram caches.
 // inp:                the tokens generated so far.
@@ -81,21 +81,21 @@ void common_ngram_cache_update(
 // nc_context:         ngram cache based on current context.
 // nc_dynamic:         ngram cache based on previous user generations.
 // nc_static:          ngram cache generated from a large text corpus, used for validation.
-void common_ngram_cache_draft(
+void llama_ngram_cache_draft(
    std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
-    common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static);
+    llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static);

 // Save an ngram cache to a file.
 // ngram_cache: the ngram cache to save.
 // filename:    the path under which to save the ngram cache.
-void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & filename);
+void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename);

-// Load an ngram cache saved with common_ngram_cache_save.
+// Load an ngram cache saved with llama_ngram_cache_save.
 // filename: the path from which to load the ngram cache.
 // returns:  an ngram cache containing the information saved to filename.
-common_ngram_cache common_ngram_cache_load(std::string & filename);
+llama_ngram_cache llama_ngram_cache_load(std::string & filename);

 // Merge two ngram caches.
 // ngram_cache_target: the ngram cache to which to add the information from ngram_cache_add.
 // ngram_cache_add:    the ngram cache to add to ngram_cache_target.
-void common_ngram_cache_merge(common_ngram_cache & ngram_cache_target, common_ngram_cache & ngram_cache_add);
+void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add);
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -1,11 +1,9 @@
 #include "sampling.h"

 #include "common.h"
-#include "log.h"

 #include <cmath>
 #include <unordered_map>
-#include <algorithm>

 // the ring buffer works similarly to std::deque, but with a fixed capacity
 // TODO: deduplicate with llama-impl.h
@@ -100,8 +98,8 @@ struct ring_buffer {
    std::vector<T> data;
 };

-struct common_sampler {
-    common_params_sampling params;
+struct gpt_sampler {
+    gpt_sampler_params params;

    struct llama_sampler * grmr;
    struct llama_sampler * chain;
@@ -115,10 +113,7 @@ struct common_sampler {
    void set_logits(struct llama_context * ctx, int idx) {
        const auto * logits = llama_get_logits_ith(ctx, idx);

-        const llama_model * model = llama_get_model(ctx);
-        const llama_vocab * vocab = llama_model_get_vocab(model);
-
-        const int n_vocab = llama_vocab_n_tokens(vocab);
+        const int n_vocab = llama_n_vocab(llama_get_model(ctx));

        cur.resize(n_vocab);

@@ -130,93 +125,28 @@ struct common_sampler {
    }
 };

-std::string common_params_sampling::print() const {
+std::string gpt_sampler_params::print() const {
    char result[1024];

    snprintf(result, sizeof(result),
            "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
-            "\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n"
-            "\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, top_n_sigma = %.3f, temp = %.3f\n"
+            "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
            "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
            penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
-            dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n,
-            top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, top_n_sigma, temp,
+            top_k, tfs_z, top_p, min_p, typ_p, temp,
            mirostat, mirostat_eta, mirostat_tau);

    return std::string(result);
 }

-struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params) {
-    const llama_vocab * vocab = llama_model_get_vocab(model);
-
+struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) {
    llama_sampler_chain_params lparams = llama_sampler_chain_default_params();

    lparams.no_perf = params.no_perf;

-    struct llama_sampler * grmr;
-    if (params.grammar.compare(0, 11, "%llguidance") == 0) {
-#ifdef LLAMA_USE_LLGUIDANCE
-        grmr = llama_sampler_init_llg(vocab, "lark", params.grammar.c_str());
-#else
-        GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
-#endif // LLAMA_USE_LLGUIDANCE
-    } else {
-        std::vector<std::string> patterns_at_start;
-        std::vector<std::string> patterns_anywhere;
-        std::vector<llama_token> trigger_tokens;
-        for (const auto & trigger : params.grammar_triggers) {
-            switch (trigger.type) {
-                case COMMON_GRAMMAR_TRIGGER_TYPE_WORD:
-                {
-                    const auto & word = trigger.value;
-                    patterns_anywhere.push_back(regex_escape(word));
-                    break;
-                }
-                case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN:
-                case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START:
-                {
-                    const auto & pattern = trigger.value;
-                    (trigger.type == COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START ? patterns_at_start : patterns_anywhere).push_back(pattern);
-                    break;
-                }
-                case COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN:
-                {
-                    const auto token = trigger.token;
-                    trigger_tokens.push_back(token);
-                    break;
-                }
-                default:
-                    GGML_ASSERT(false && "unknown trigger type");
-            }
-        }
-
-        std::vector<std::string> trigger_patterns;
-        if (!patterns_at_start.empty()) {
-            trigger_patterns.push_back("^(" + string_join(patterns_at_start, "|") + ")[\\s\\S]*");
-        }
-        if (!patterns_anywhere.empty()) {
-            trigger_patterns.push_back("^[\\s\\S]*?(" + string_join(patterns_anywhere, "|") + ")[\\s\\S]*");
-        }
-
-        std::vector<const char *> trigger_patterns_c;
-        trigger_patterns_c.reserve(trigger_patterns.size());
-        for (const auto & regex : trigger_patterns) {
-            trigger_patterns_c.push_back(regex.c_str());
-        }
-
-        grmr = params.grammar_lazy
-             ? llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
-                                                        trigger_patterns_c.data(), trigger_patterns_c.size(),
-                                                        trigger_tokens.data(), trigger_tokens.size())
-             :      llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
-        if (!grmr) {
-            return nullptr;
-        }
-    }
-
-    auto * result = new common_sampler {
+    auto * result = new gpt_sampler {
        /* .params = */ params,
-        /* .grmr   = */ grmr,
+        /* .grmr   = */ llama_sampler_init_grammar(model, params.grammar.c_str(), "root"),
        /* .chain  = */ llama_sampler_chain_init(lparams),
        /* .prev   = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
        /* .cur    = */ {},
@@ -225,70 +155,68 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co

    llama_sampler_chain_add(result->chain,
            llama_sampler_init_logit_bias(
-                llama_vocab_n_tokens(vocab),
+                llama_n_vocab(model),
                params.logit_bias.size(),
                params.logit_bias.data()));

-    if (params.mirostat == 0) {
-        for (const auto & cnstr : params.samplers) {
-            switch (cnstr) {
-                case COMMON_SAMPLER_TYPE_DRY:
-                    {
-                        std::vector<const char *> c_breakers;
-                        c_breakers.reserve(params.dry_sequence_breakers.size());
-                        for (const auto & str : params.dry_sequence_breakers) {
-                            c_breakers.push_back(str.c_str());
-                        }
+    llama_sampler_chain_add(result->chain,
+            llama_sampler_init_penalties(
+                llama_n_vocab  (model),
+                llama_token_eos(model),
+                llama_token_nl (model),
+                params.penalty_last_n,
+                params.penalty_repeat,
+                params.penalty_freq,
+                params.penalty_present,
+                params.penalize_nl,
+                params.ignore_eos));

-                        llama_sampler_chain_add(result->chain, llama_sampler_init_dry      (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
-                    }
-                    break;
-                case COMMON_SAMPLER_TYPE_TOP_K:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_k       (params.top_k));
-                    break;
-                case COMMON_SAMPLER_TYPE_TOP_P:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_p       (params.top_p, params.min_keep));
-                    break;
-                case COMMON_SAMPLER_TYPE_TOP_N_SIGMA:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
-                    break;
-                case COMMON_SAMPLER_TYPE_MIN_P:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_min_p       (params.min_p, params.min_keep));
-                    break;
-                case COMMON_SAMPLER_TYPE_XTC:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_xtc         (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
-                    break;
-                case COMMON_SAMPLER_TYPE_TYPICAL_P:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_typical     (params.typ_p, params.min_keep));
-                    break;
-                case COMMON_SAMPLER_TYPE_TEMPERATURE:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext    (params.temp, params.dynatemp_range, params.dynatemp_exponent));
-                    break;
-                case COMMON_SAMPLER_TYPE_INFILL:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_infill      (vocab));
-                    break;
-                case COMMON_SAMPLER_TYPE_PENALTIES:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_penalties   (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
-                    break;
-                default:
-                    GGML_ASSERT(false && "unknown sampler type");
+    if (params.temp > 0.0f) {
+        if (params.mirostat == 0) {
+            for (const auto & cnstr : params.samplers) {
+                switch (cnstr) {
+                    case GPT_SAMPLER_TYPE_TOP_K:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_k    (params.top_k));
+                        break;
+                    case GPT_SAMPLER_TYPE_TOP_P:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_p    (params.top_p, params.min_keep));
+                        break;
+                    case GPT_SAMPLER_TYPE_MIN_P:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_min_p    (params.min_p, params.min_keep));
+                        break;
+                    case GPT_SAMPLER_TYPE_TFS_Z:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep));
+                        break;
+                    case GPT_SAMPLER_TYPE_TYPICAL_P:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_typical  (params.typ_p, params.min_keep));
+                        break;
+                    case GPT_SAMPLER_TYPE_TEMPERATURE:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
+                        break;
+                    default:
+                        GGML_ASSERT(false && "unknown sampler type");
+                }
            }
+            llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
+            llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
+        } else if (params.mirostat == 1) {
+            llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
+            llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
+        } else if (params.mirostat == 2) {
+            llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
+            llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
+        } else {
+            GGML_ASSERT(false && "unknown mirostat version");
        }
-        llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
-    } else if (params.mirostat == 1) {
-        llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
-        llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
-    } else if (params.mirostat == 2) {
-        llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
-        llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
    } else {
-        GGML_ASSERT(false && "unknown mirostat version");
+        llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
+        llama_sampler_chain_add(result->chain, llama_sampler_init_greedy());
    }

    return result;
 }

-void common_sampler_free(struct common_sampler * gsmpl) {
+void gpt_sampler_free(struct gpt_sampler * gsmpl) {
    if (gsmpl) {
        llama_sampler_free(gsmpl->grmr);

@@ -298,7 +226,7 @@ void common_sampler_free(struct common_sampler * gsmpl) {
    }
 }

-void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
+void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool accept_grammar) {
    if (accept_grammar) {
        llama_sampler_accept(gsmpl->grmr, token);
    }
@@ -308,14 +236,14 @@ void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, boo
    gsmpl->prev.push_back(token);
 }

-void common_sampler_reset(struct common_sampler * gsmpl) {
+void gpt_sampler_reset(struct gpt_sampler * gsmpl) {
    llama_sampler_reset(gsmpl->grmr);

    llama_sampler_reset(gsmpl->chain);
 }

-struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
-    return new common_sampler {
+struct gpt_sampler * gpt_sampler_clone(gpt_sampler * gsmpl) {
+    return new gpt_sampler {
        /* .params = */ gsmpl->params,
        /* .grmr   = */ llama_sampler_clone(gsmpl->grmr),
        /* .chain  = */ llama_sampler_clone(gsmpl->chain),
@@ -325,7 +253,7 @@ struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
    };
 }

-void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl) {
+void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl) {
    // TODO: measure grammar performance

    if (gsmpl) {
@@ -336,7 +264,7 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
    }
 }

-llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
+llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
    gsmpl->set_logits(ctx, idx);

    auto & grmr  = gsmpl->grmr;
@@ -382,60 +310,21 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
    return cur_p.data[cur_p.selected].id;
 }

-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first) {
-    GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1");
-
-    std::vector<llama_token> result;
-    result.reserve(idxs.size());
-
-    size_t i = 0;
-    for (; i < draft.size(); i++) {
-        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
-
-        common_sampler_accept(gsmpl, id, true);
-
-        result.push_back(id);
-
-        if (draft[i] != id) {
-            break;
-        }
-    }
-
-    if (i == draft.size()) {
-        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
-
-        common_sampler_accept(gsmpl, id, true);
-
-        result.push_back(id);
-    }
-
-    return result;
-}
-
-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) {
-    std::vector<int> idxs(draft.size() + 1);
-    for (size_t i = 0; i < idxs.size(); ++i) {
-        idxs[i] = i;
-    }
-
-    return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft, grammar_first);
-}
-
-uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
+uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl) {
    return llama_sampler_get_seed(gsmpl->chain);
 }

 // helpers

-llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl) {
+llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl) {
    return &gsmpl->cur_p;
 }

-llama_token common_sampler_last(const struct common_sampler * gsmpl) {
+llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl) {
    return gsmpl->prev.rat(0);
 }

-std::string common_sampler_print(const struct common_sampler * gsmpl) {
+std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) {
    std::string result = "logits ";

    for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
@@ -446,7 +335,7 @@ std::string common_sampler_print(const struct common_sampler * gsmpl) {
    return result;
 }

-std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx_main, int n) {
+std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main, int n) {
    n = std::min(n, (int) gsmpl->prev.size());

    if (n <= 0) {
@@ -461,118 +350,99 @@ std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx_

        GGML_ASSERT(id != LLAMA_TOKEN_NULL && "null token in the sampling history - should not happen");

-        result += common_token_to_piece(ctx_main, id);
+        result += llama_token_to_piece(ctx_main, id);
    }

    return result;
 }

-char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
+char gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr) {
    switch (cnstr) {
-        case COMMON_SAMPLER_TYPE_DRY:         return 'd';
-        case COMMON_SAMPLER_TYPE_TOP_K:       return 'k';
-        case COMMON_SAMPLER_TYPE_TYPICAL_P:   return 'y';
-        case COMMON_SAMPLER_TYPE_TOP_P:       return 'p';
-        case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return 's';
-        case COMMON_SAMPLER_TYPE_MIN_P:       return 'm';
-        case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
-        case COMMON_SAMPLER_TYPE_XTC:         return 'x';
-        case COMMON_SAMPLER_TYPE_INFILL:      return 'i';
-        case COMMON_SAMPLER_TYPE_PENALTIES:   return 'e';
+        case GPT_SAMPLER_TYPE_TOP_K:       return 'k';
+        case GPT_SAMPLER_TYPE_TFS_Z:       return 'f';
+        case GPT_SAMPLER_TYPE_TYPICAL_P:   return 'y';
+        case GPT_SAMPLER_TYPE_TOP_P:       return 'p';
+        case GPT_SAMPLER_TYPE_MIN_P:       return 'm';
+        case GPT_SAMPLER_TYPE_TEMPERATURE: return 't';
        default : return '?';
    }
 }

-std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
+std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr) {
    switch (cnstr) {
-        case COMMON_SAMPLER_TYPE_DRY:         return "dry";
-        case COMMON_SAMPLER_TYPE_TOP_K:       return "top_k";
-        case COMMON_SAMPLER_TYPE_TYPICAL_P:   return "typ_p";
-        case COMMON_SAMPLER_TYPE_TOP_P:       return "top_p";
-        case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return "top_n_sigma";
-        case COMMON_SAMPLER_TYPE_MIN_P:       return "min_p";
-        case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
-        case COMMON_SAMPLER_TYPE_XTC:         return "xtc";
-        case COMMON_SAMPLER_TYPE_INFILL:      return "infill";
-        case COMMON_SAMPLER_TYPE_PENALTIES:   return "penalties";
+        case GPT_SAMPLER_TYPE_TOP_K:       return "top_k";
+        case GPT_SAMPLER_TYPE_TFS_Z:       return "tfs_z";
+        case GPT_SAMPLER_TYPE_TYPICAL_P:   return "typ_p";
+        case GPT_SAMPLER_TYPE_TOP_P:       return "top_p";
+        case GPT_SAMPLER_TYPE_MIN_P:       return "min_p";
+        case GPT_SAMPLER_TYPE_TEMPERATURE: return "temperature";
        default : return "";
    }
 }

-std::vector<common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
-    std::unordered_map<std::string, common_sampler_type> sampler_canonical_name_map {
-        { "dry",         COMMON_SAMPLER_TYPE_DRY },
-        { "top_k",       COMMON_SAMPLER_TYPE_TOP_K },
-        { "top_p",       COMMON_SAMPLER_TYPE_TOP_P },
-        { "top_n_sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
-        { "typ_p",       COMMON_SAMPLER_TYPE_TYPICAL_P },
-        { "min_p",       COMMON_SAMPLER_TYPE_MIN_P },
-        { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
-        { "xtc",         COMMON_SAMPLER_TYPE_XTC },
-        { "infill",      COMMON_SAMPLER_TYPE_INFILL },
-        { "penalties",   COMMON_SAMPLER_TYPE_PENALTIES },
+std::vector<gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
+    std::unordered_map<std::string, gpt_sampler_type> sampler_canonical_name_map {
+        { "top_k",       GPT_SAMPLER_TYPE_TOP_K },
+        { "top_p",       GPT_SAMPLER_TYPE_TOP_P },
+        { "typ_p",       GPT_SAMPLER_TYPE_TYPICAL_P },
+        { "min_p",       GPT_SAMPLER_TYPE_MIN_P },
+        { "tfs_z",       GPT_SAMPLER_TYPE_TFS_Z },
+        { "temperature", GPT_SAMPLER_TYPE_TEMPERATURE },
    };

    // since samplers names are written multiple ways
    // make it ready for both system names and input names
-    std::unordered_map<std::string, common_sampler_type> sampler_alt_name_map {
-        { "top-k",       COMMON_SAMPLER_TYPE_TOP_K },
-        { "top-p",       COMMON_SAMPLER_TYPE_TOP_P },
-        { "top-n-sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
-        { "nucleus",     COMMON_SAMPLER_TYPE_TOP_P },
-        { "typical-p",   COMMON_SAMPLER_TYPE_TYPICAL_P },
-        { "typical",     COMMON_SAMPLER_TYPE_TYPICAL_P },
-        { "typ-p",       COMMON_SAMPLER_TYPE_TYPICAL_P },
-        { "typ",         COMMON_SAMPLER_TYPE_TYPICAL_P },
-        { "min-p",       COMMON_SAMPLER_TYPE_MIN_P },
-        { "temp",        COMMON_SAMPLER_TYPE_TEMPERATURE },
+    std::unordered_map<std::string, gpt_sampler_type> sampler_alt_name_map {
+        { "top-k",       GPT_SAMPLER_TYPE_TOP_K },
+        { "top-p",       GPT_SAMPLER_TYPE_TOP_P },
+        { "nucleus",     GPT_SAMPLER_TYPE_TOP_P },
+        { "typical-p",   GPT_SAMPLER_TYPE_TYPICAL_P },
+        { "typical",     GPT_SAMPLER_TYPE_TYPICAL_P },
+        { "typ-p",       GPT_SAMPLER_TYPE_TYPICAL_P },
+        { "typ",         GPT_SAMPLER_TYPE_TYPICAL_P },
+        { "min-p",       GPT_SAMPLER_TYPE_MIN_P },
+        { "tfs-z",       GPT_SAMPLER_TYPE_TFS_Z },
+        { "tfs",         GPT_SAMPLER_TYPE_TFS_Z },
+        { "temp",        GPT_SAMPLER_TYPE_TEMPERATURE },
    };

-    std::vector<common_sampler_type> samplers;
+    std::vector<gpt_sampler_type> samplers;
    samplers.reserve(names.size());

    for (const auto & name : names) {
        auto sampler = sampler_canonical_name_map.find(name);
        if (sampler != sampler_canonical_name_map.end()) {
            samplers.push_back(sampler->second);
-            continue;
-        }
-        if (allow_alt_names) {
-            sampler = sampler_alt_name_map.find(name);
-            if (sampler != sampler_alt_name_map.end()) {
-                samplers.push_back(sampler->second);
-                continue;
+        } else {
+            if (allow_alt_names) {
+                sampler = sampler_alt_name_map.find(name);
+                if (sampler != sampler_alt_name_map.end()) {
+                    samplers.push_back(sampler->second);
+                }
            }
        }
-        LOG_WRN("%s: unable to match sampler by name '%s'\n", __func__, name.c_str());
    }

    return samplers;
 }

-std::vector<common_sampler_type> common_sampler_types_from_chars(const std::string & chars) {
-    std::unordered_map<char, common_sampler_type> sampler_name_map = {
-        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_DRY),         COMMON_SAMPLER_TYPE_DRY },
-        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_K),       COMMON_SAMPLER_TYPE_TOP_K },
-        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P),   COMMON_SAMPLER_TYPE_TYPICAL_P },
-        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P),       COMMON_SAMPLER_TYPE_TOP_P },
-        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_N_SIGMA), COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
-        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P),       COMMON_SAMPLER_TYPE_MIN_P },
-        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
-        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC),         COMMON_SAMPLER_TYPE_XTC },
-        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL),      COMMON_SAMPLER_TYPE_INFILL },
-        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_PENALTIES),   COMMON_SAMPLER_TYPE_PENALTIES },
+std::vector<gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars) {
+    std::unordered_map<char, gpt_sampler_type> sampler_name_map = {
+        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_K),       GPT_SAMPLER_TYPE_TOP_K },
+        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TFS_Z),       GPT_SAMPLER_TYPE_TFS_Z },
+        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TYPICAL_P),   GPT_SAMPLER_TYPE_TYPICAL_P },
+        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_P),       GPT_SAMPLER_TYPE_TOP_P },
+        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_MIN_P),       GPT_SAMPLER_TYPE_MIN_P },
+        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TEMPERATURE), GPT_SAMPLER_TYPE_TEMPERATURE }
    };

-    std::vector<common_sampler_type> samplers;
+    std::vector<gpt_sampler_type> samplers;
    samplers.reserve(chars.size());

    for (const auto & c : chars) {
        const auto sampler = sampler_name_map.find(c);
        if (sampler != sampler_name_map.end()) {
            samplers.push_back(sampler->second);
-        } else {
-            LOG_WRN("%s: unable to match sampler by char '%c'\n", __func__, c);
        }
    }

--- a/common/sampling.h
+++ b/common/sampling.h
@@ -7,7 +7,7 @@
 #include <string>
 #include <vector>

-// common_sampler extends llama_sampler with additional functionality:
+// gpt_sampler extends llama_sampler with additional functionality:
 //
 //  - grammar support
 //  - custom sampler logic based on the parameters
@@ -23,30 +23,30 @@
 // token in order to verify if it fits the grammar. And only if the token doesn't fit the grammar, the
 // grammar constraints are applied to the full vocabulary and the token is resampled.
 //
-// The common_sampler also maintains a container with the last accepted tokens. In the future, this can
+// The gpt_sampler also maintains a container with the last accepted tokens. In the future, this can
 // be moved into the core llama library.
 //
-// For convenience, the common_sampler also maintains a container with the current candidate tokens.
+// For convenience, the gpt_sampler also maintains a container with the current candidate tokens.
 // This can be used to access the probabilities of the rest of the non-sampled tokens.
 //
 // TODO: measure grammar performance
 //

-struct common_sampler;
+struct gpt_sampler;

 // llama_sampler API overloads

-struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params);
+struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params);

-void common_sampler_free(struct common_sampler * gsmpl);
+void gpt_sampler_free(struct gpt_sampler * gsmpl);

 // if accept_grammar is true, the token is accepted both by the sampling chain and the grammar
-void                    common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar);
-void                    common_sampler_reset (struct common_sampler * gsmpl);
-struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
+void                 gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool accept_grammar);
+void                 gpt_sampler_reset (struct gpt_sampler * gsmpl);
+struct gpt_sampler * gpt_sampler_clone (struct gpt_sampler * gsmpl);

 // arguments can be nullptr to skip printing
-void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl);
+void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl);

 // extended sampling implementation:
 //
@@ -58,50 +58,26 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
 // if grammar_first is true, the grammar is applied before the samplers (slower)
 // useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
 //
-llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
+llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);

-// generalized version of common_sampler_sample
-//
-// will cross-reference the sampled tokens with a batch of draft tokens and accept those that match
-// if the sampler disagrees at some point, we stop and return the accepted tokens up to now
-//
-//      common_sampler_sample_n(gsmpl, ctx, { idx }, {});
-//
-// is equivalent to
-//
-//      common_sampler_sample(gsmpl, ctx, idx);
-//      common_sampler_accept(gsmpl, token, true);
-//
-// requires: idxs.size() == draft.size() + 1
-//
-// returns at least 1 token, up to idxs.size()
-//
-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first = false);
-
-// assume idxs == [ 0, 1, 2, ..., draft.size() ]
-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false);
-
-uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
+uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl);

 // helpers

 // access the internal list of current candidate tokens
-llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl);
+llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl);

 // get the last accepted token
-llama_token common_sampler_last(const struct common_sampler * gsmpl);
+llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl);

 // print the sampler chain into a string
-std::string common_sampler_print(const struct common_sampler * gsmpl);
+std::string gpt_sampler_print(const struct gpt_sampler * gsmpl);

 // get a string representation of the last accepted tokens
-std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx, int n);
+std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx, int n);

-char        common_sampler_type_to_chr(enum common_sampler_type cnstr);
-std::string common_sampler_type_to_str(enum common_sampler_type cnstr);
+char        gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr);
+std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr);

-std::vector<enum common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
-std::vector<enum common_sampler_type> common_sampler_types_from_chars(const std::string & chars);
-
-llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab,
-                const char * grammar_kind, const char * grammar_data);
+std::vector<enum gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
+std::vector<enum gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars);
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Georgi Gerganov	6e873e561a	llama : make llm_tokenizer more private ggml-ci	2024-09-20 11:41:51 +03:00
zhenweijin	d949c5844d	refactor tokenizer	2024-09-20 15:02:44 +08:00