async direct io per tensor test

address review comments
Direct I/O and Transparent HugePages
2026-02-12 14:03:20 +02:00 · 2024-05-22 01:08:52 +02:00 · 2024-05-21 20:05:26 +02:00 · 2024-05-21 01:35:23 +02:00
860 changed files with 162839 additions and 136585 deletions
--- a/.devops/cloud-v-pipeline
+++ b/.devops/cloud-v-pipeline
@@ -15,7 +15,7 @@ node('x86_runner1'){            // Running on x86 runner containing latest vecto
    stage('Running llama.cpp'){
        sh'''#!/bin/bash
            module load gnu-bin2/0.1            # loading latest versions of vector qemu and vector gcc
-            qemu-riscv64 -L /softwares/gnu-bin2/sysroot  -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./llama-cli -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt            # Running llama.cpp on vector qemu-riscv64
+            qemu-riscv64 -L /softwares/gnu-bin2/sysroot  -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./main -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt            # Running llama.cpp on vector qemu-riscv64
            cat llama_log.txt                   # Printing results
        '''
    }
--- a/.devops/full-cuda.Dockerfile
+++ b/.devops/full-cuda.Dockerfile
@@ -1,16 +1,18 @@
 ARG UBUNTU_VERSION=22.04
+
 # This needs to generally match the container host's environment.
-ARG CUDA_VERSION=12.6.0
+ARG CUDA_VERSION=11.7.1
+
 # Target the CUDA build image
 ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

-FROM ${BASE_CUDA_DEV_CONTAINER} AS build
+FROM ${BASE_CUDA_DEV_CONTAINER} as build

-# CUDA architecture to build for (defaults to all supported archs)
-ARG CUDA_DOCKER_ARCH=default
+# Unless otherwise specified, we make a fat build.
+ARG CUDA_DOCKER_ARCH=all

 RUN apt-get update && \
-    apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
+    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev

 COPY requirements.txt   requirements.txt
 COPY requirements       requirements
@@ -22,12 +24,13 @@ WORKDIR /app

 COPY . .

-# Use the default CUDA archs if not specified
-RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
-        export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
-    fi && \
-    cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
-    cmake --build build --config Release -j$(nproc) && \
-    cp build/bin/* .
+# Set nvcc architecture
+ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
+# Enable CUDA
+ENV LLAMA_CUDA=1
+# Enable cURL
+ENV LLAMA_CURL=1
+
+RUN make

 ENTRYPOINT ["/app/.devops/tools.sh"]
--- a/.devops/full-rocm.Dockerfile
+++ b/.devops/full-rocm.Dockerfile
@@ -6,7 +6,7 @@ ARG ROCM_VERSION=5.6
 # Target the CUDA build image
 ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete

-FROM ${BASE_ROCM_DEV_CONTAINER} AS build
+FROM ${BASE_ROCM_DEV_CONTAINER} as build

 # Unless otherwise specified, we make a fat build.
 # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
@@ -36,7 +36,7 @@ COPY . .
 # Set nvcc architecture
 ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
 # Enable ROCm
-ENV GGML_HIPBLAS=1
+ENV LLAMA_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++

@@ -45,6 +45,6 @@ ENV LLAMA_CURL=1
 RUN apt-get update && \
    apt-get install -y libcurl4-openssl-dev

-RUN make -j$(nproc)
+RUN make

 ENTRYPOINT ["/app/.devops/tools.sh"]
--- a/.devops/full.Dockerfile
+++ b/.devops/full.Dockerfile
@@ -1,9 +1,9 @@
 ARG UBUNTU_VERSION=22.04

-FROM ubuntu:$UBUNTU_VERSION AS build
+FROM ubuntu:$UBUNTU_VERSION as build

 RUN apt-get update && \
-    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
+    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev

 COPY requirements.txt   requirements.txt
 COPY requirements       requirements
@@ -18,7 +18,7 @@ COPY . .
 ENV LLAMA_CURL=1


-RUN make -j$(nproc)
+RUN make

 ENV LC_ALL=C.utf8

--- a/.devops/llama-cli-cann.Dockerfile
+++ b/.devops/llama-cli-cann.Dockerfile
@@ -1,44 +0,0 @@
-ARG ASCEND_VERSION=8.0.rc2.alpha003-910b-openeuler22.03-py3.8
-
-FROM cosdt/cann:$ASCEND_VERSION AS build
-
-WORKDIR /app
-
-COPY . .
-
-RUN yum install -y gcc g++ cmake make
-ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
-ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
-ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
-ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH}
-ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH}
-ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME}
-ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
-ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
-ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME}
-
-# find libascend_hal.so, because the drive hasn`t been mounted.
-ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
-
-RUN echo "Building with static libs" && \
-    source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
-    cmake -B build -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF  && \
-    cmake --build build --config Release --target llama-cli
-
-# TODO: use image with NNRT
-FROM cosdt/cann:$ASCEND_VERSION AS runtime
-COPY --from=build /app/build/bin/llama-cli /llama-cli
-
-ENV LC_ALL=C.utf8
-
-ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
-ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
-ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
-ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH}
-ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH}
-ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME}
-ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
-ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
-ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME}
-
-ENTRYPOINT ["/llama-cli" ]
--- a/.devops/llama-cli-cuda.Dockerfile
+++ b/.devops/llama-cli-cuda.Dockerfile
@@ -1,37 +0,0 @@
-ARG UBUNTU_VERSION=22.04
-# This needs to generally match the container host's environment.
-ARG CUDA_VERSION=12.6.0
-# Target the CUDA build image
-ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
-# Target the CUDA runtime image
-ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
-
-FROM ${BASE_CUDA_DEV_CONTAINER} AS build
-
-# CUDA architecture to build for (defaults to all supported archs)
-ARG CUDA_DOCKER_ARCH=default
-
-RUN apt-get update && \
-    apt-get install -y build-essential git cmake
-
-WORKDIR /app
-
-COPY . .
-
-# Use the default CUDA archs if not specified
-RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
-        export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
-    fi && \
-    cmake -B build -DGGML_CUDA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
-    cmake --build build --config Release --target llama-cli -j$(nproc)
-
-FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
-
-RUN apt-get update && \
-    apt-get install -y libgomp1
-
-COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
-COPY --from=build /app/build/src/libllama.so /libllama.so
-COPY --from=build /app/build/bin/llama-cli /llama-cli
-
-ENTRYPOINT [ "/llama-cli" ]
--- a/.devops/llama-cli-intel.Dockerfile
+++ b/.devops/llama-cli-intel.Dockerfile
@@ -1,28 +0,0 @@
-ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
-
-FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
-
-ARG GGML_SYCL_F16=OFF
-RUN apt-get update && \
-    apt-get install -y git
-
-WORKDIR /app
-
-COPY . .
-
-RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
-        echo "GGML_SYCL_F16 is set" && \
-        export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
-    fi && \
-    echo "Building with static libs" && \
-    cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \
-    ${OPT_SYCL_F16} -DBUILD_SHARED_LIBS=OFF && \
-    cmake --build build --config Release --target llama-cli
-
-FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
-
-COPY --from=build /app/build/bin/llama-cli /llama-cli
-
-ENV LC_ALL=C.utf8
-
-ENTRYPOINT [ "/llama-cli" ]
--- a/.devops/llama-cli.Dockerfile
+++ b/.devops/llama-cli.Dockerfile
@@ -1,23 +0,0 @@
-ARG UBUNTU_VERSION=22.04
-
-FROM ubuntu:$UBUNTU_VERSION AS build
-
-RUN apt-get update && \
-    apt-get install -y build-essential git
-
-WORKDIR /app
-
-COPY . .
-
-RUN make -j$(nproc) llama-cli
-
-FROM ubuntu:$UBUNTU_VERSION AS runtime
-
-RUN apt-get update && \
-    apt-get install -y libgomp1
-
-COPY --from=build /app/llama-cli /llama-cli
-
-ENV LC_ALL=C.utf8
-
-ENTRYPOINT [ "/llama-cli" ]
--- a/.devops/llama-cpp-clblast.srpm.spec
+++ b/.devops/llama-cpp-clblast.srpm.spec
@@ -0,0 +1,84 @@
+# SRPM for building from source and packaging an RPM for RPM-based distros.
+# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages
+# Built and maintained by John Boero - boeroboy@gmail.com
+# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
+
+# Notes for llama.cpp:
+# 1. Tags are currently based on hash - which will not sort asciibetically.
+#    We need to declare standard versioning if people want to sort latest releases.
+# 2. Builds for CUDA/OpenCL support are separate, with different depenedencies.
+# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
+#    Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
+# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
+#    It is up to the user to install the correct vendor-specific support.
+
+Name:           llama.cpp-clblast
+Version:        %( date "+%%Y%%m%%d" )
+Release:        1%{?dist}
+Summary:        OpenCL Inference of LLaMA model in C/C++
+License:        MIT
+Source0:        https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
+BuildRequires:  coreutils make gcc-c++ git mesa-libOpenCL-devel clblast-devel
+Requires:       clblast
+URL:            https://github.com/ggerganov/llama.cpp
+
+%define debug_package %{nil}
+%define source_date_epoch_from_changelog 0
+
+%description
+CPU inference for Meta's Lllama2 models using default options.
+
+%prep
+%setup -n llama.cpp-master
+
+%build
+make -j LLAMA_CLBLAST=1
+
+%install
+mkdir -p %{buildroot}%{_bindir}/
+cp -p main %{buildroot}%{_bindir}/llamaclblast
+cp -p server %{buildroot}%{_bindir}/llamaclblastserver
+cp -p simple %{buildroot}%{_bindir}/llamaclblastsimple
+
+mkdir -p %{buildroot}/usr/lib/systemd/system
+%{__cat} <<EOF  > %{buildroot}/usr/lib/systemd/system/llamaclblast.service
+[Unit]
+Description=Llama.cpp server, CPU only (no GPU support in this build).
+After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
+
+[Service]
+Type=simple
+EnvironmentFile=/etc/sysconfig/llama
+ExecStart=/usr/bin/llamaclblastserver $LLAMA_ARGS
+ExecReload=/bin/kill -s HUP $MAINPID
+Restart=never
+
+[Install]
+WantedBy=default.target
+EOF
+
+mkdir -p %{buildroot}/etc/sysconfig
+%{__cat} <<EOF  > %{buildroot}/etc/sysconfig/llama
+LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
+EOF
+
+%clean
+rm -rf %{buildroot}
+rm -rf %{_builddir}/*
+
+%files
+%{_bindir}/llamaclblast
+%{_bindir}/llamaclblastserver
+%{_bindir}/llamaclblastsimple
+/usr/lib/systemd/system/llamaclblast.service
+%config /etc/sysconfig/llama
+
+
+%pre
+
+%post
+
+%preun
+%postun
+
+%changelog
--- a/.devops/llama-cpp-cuda.srpm.spec
+++ b/.devops/llama-cpp-cuda.srpm.spec
@@ -32,13 +32,13 @@ CPU inference for Meta's Lllama2 models using default options.
 %setup -n llama.cpp-master

 %build
-make -j GGML_CUDA=1
+make -j LLAMA_CUDA=1

 %install
 mkdir -p %{buildroot}%{_bindir}/
-cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli
-cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server
-cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple
+cp -p main %{buildroot}%{_bindir}/llamacppcuda
+cp -p server %{buildroot}%{_bindir}/llamacppcudaserver
+cp -p simple %{buildroot}%{_bindir}/llamacppcudasimple

 mkdir -p %{buildroot}/usr/lib/systemd/system
 %{__cat} <<EOF  > %{buildroot}/usr/lib/systemd/system/llamacuda.service
@@ -49,7 +49,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t
 [Service]
 Type=simple
 EnvironmentFile=/etc/sysconfig/llama
-ExecStart=/usr/bin/llama-cuda-server $LLAMA_ARGS
+ExecStart=/usr/bin/llamacppcudaserver $LLAMA_ARGS
 ExecReload=/bin/kill -s HUP $MAINPID
 Restart=never

@@ -67,9 +67,9 @@ rm -rf %{buildroot}
 rm -rf %{_builddir}/*

 %files
-%{_bindir}/llama-cuda-cli
-%{_bindir}/llama-cuda-server
-%{_bindir}/llama-cuda-simple
+%{_bindir}/llamacppcuda
+%{_bindir}/llamacppcudaserver
+%{_bindir}/llamacppcudasimple
 /usr/lib/systemd/system/llamacuda.service
 %config /etc/sysconfig/llama

--- a/.devops/llama-cpp.srpm.spec
+++ b/.devops/llama-cpp.srpm.spec
@@ -38,9 +38,9 @@ make -j

 %install
 mkdir -p %{buildroot}%{_bindir}/
-cp -p llama-cli %{buildroot}%{_bindir}/llama-cli
-cp -p llama-server %{buildroot}%{_bindir}/llama-server
-cp -p llama-simple %{buildroot}%{_bindir}/llama-simple
+cp -p main %{buildroot}%{_bindir}/llama
+cp -p server %{buildroot}%{_bindir}/llamaserver
+cp -p simple %{buildroot}%{_bindir}/llamasimple

 mkdir -p %{buildroot}/usr/lib/systemd/system
 %{__cat} <<EOF  > %{buildroot}/usr/lib/systemd/system/llama.service
@@ -51,7 +51,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t
 [Service]
 Type=simple
 EnvironmentFile=/etc/sysconfig/llama
-ExecStart=/usr/bin/llama-server $LLAMA_ARGS
+ExecStart=/usr/bin/llamaserver $LLAMA_ARGS
 ExecReload=/bin/kill -s HUP $MAINPID
 Restart=never

@@ -69,9 +69,9 @@ rm -rf %{buildroot}
 rm -rf %{_builddir}/*

 %files
-%{_bindir}/llama-cli
-%{_bindir}/llama-server
-%{_bindir}/llama-simple
+%{_bindir}/llama
+%{_bindir}/llamaserver
+%{_bindir}/llamasimple
 /usr/lib/systemd/system/llama.service
 %config /etc/sysconfig/llama

--- a/.devops/llama-server-cuda.Dockerfile
+++ b/.devops/llama-server-cuda.Dockerfile
@@ -1,42 +0,0 @@
-ARG UBUNTU_VERSION=22.04
-# This needs to generally match the container host's environment.
-ARG CUDA_VERSION=12.6.0
-# Target the CUDA build image
-ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
-# Target the CUDA runtime image
-ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
-
-FROM ${BASE_CUDA_DEV_CONTAINER} AS build
-
-# CUDA architecture to build for (defaults to all supported archs)
-ARG CUDA_DOCKER_ARCH=default
-
-RUN apt-get update && \
-    apt-get install -y build-essential git cmake libcurl4-openssl-dev
-
-WORKDIR /app
-
-COPY . .
-
-# Use the default CUDA archs if not specified
-RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
-        export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
-    fi && \
-    cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
-    cmake --build build --config Release --target llama-server -j$(nproc)
-
-FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
-
-RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev libgomp1 curl
-
-COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
-COPY --from=build /app/build/src/libllama.so /libllama.so
-COPY --from=build /app/build/bin/llama-server /llama-server
-
-# Must be set to 0.0.0.0 so it can listen to requests from host machine
-ENV LLAMA_ARG_HOST=0.0.0.0
-
-HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
-
-ENTRYPOINT [ "/llama-server" ]
--- a/.devops/llama-server-intel.Dockerfile
+++ b/.devops/llama-server-intel.Dockerfile
@@ -1,34 +0,0 @@
-ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
-
-FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
-
-ARG GGML_SYCL_F16=OFF
-RUN apt-get update && \
-    apt-get install -y git libcurl4-openssl-dev
-
-WORKDIR /app
-
-COPY . .
-
-RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
-        echo "GGML_SYCL_F16 is set" && \
-        export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
-    fi && \
-    echo "Building with dynamic libs" && \
-    cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
-    cmake --build build --config Release --target llama-server
-
-FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
-
-RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev curl
-
-COPY --from=build /app/build/bin/llama-server /llama-server
-
-ENV LC_ALL=C.utf8
-# Must be set to 0.0.0.0 so it can listen to requests from host machine
-ENV LLAMA_ARG_HOST=0.0.0.0
-
-HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
-
-ENTRYPOINT [ "/llama-server" ]
--- a/.devops/llama-server-vulkan.Dockerfile
+++ b/.devops/llama-server-vulkan.Dockerfile
@@ -1,31 +0,0 @@
-ARG UBUNTU_VERSION=jammy
-
-FROM ubuntu:$UBUNTU_VERSION AS build
-
-# Install build tools
-RUN apt update && apt install -y git build-essential cmake wget
-
-# Install Vulkan SDK and cURL
-RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
-    wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
-    apt update -y && \
-    apt-get install -y vulkan-sdk libcurl4-openssl-dev curl
-
-# Build it
-WORKDIR /app
-COPY . .
-RUN cmake -B build -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
-    cmake --build build --config Release --target llama-server
-
-# Clean up
-WORKDIR /
-RUN cp /app/build/bin/llama-server /llama-server && \
-    rm -rf /app
-
-ENV LC_ALL=C.utf8
-# Must be set to 0.0.0.0 so it can listen to requests from host machine
-ENV LLAMA_ARG_HOST=0.0.0.0
-
-HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
-
-ENTRYPOINT [ "/llama-server" ]
--- a/.devops/llama-server.Dockerfile
+++ b/.devops/llama-server.Dockerfile
@@ -1,29 +0,0 @@
-ARG UBUNTU_VERSION=22.04
-
-FROM ubuntu:$UBUNTU_VERSION AS build
-
-RUN apt-get update && \
-    apt-get install -y build-essential git libcurl4-openssl-dev
-
-WORKDIR /app
-
-COPY . .
-
-ENV LLAMA_CURL=1
-
-RUN make -j$(nproc) llama-server
-
-FROM ubuntu:$UBUNTU_VERSION AS runtime
-
-RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev libgomp1 curl
-
-COPY --from=build /app/llama-server /llama-server
-
-ENV LC_ALL=C.utf8
-# Must be set to 0.0.0.0 so it can listen to requests from host machine
-ENV LLAMA_ARG_HOST=0.0.0.0
-
-HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
-
-ENTRYPOINT [ "/llama-server" ]
--- a/.devops/main-cuda.Dockerfile
+++ b/.devops/main-cuda.Dockerfile
@@ -0,0 +1,32 @@
+ARG UBUNTU_VERSION=22.04
+# This needs to generally match the container host's environment.
+ARG CUDA_VERSION=11.7.1
+# Target the CUDA build image
+ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+# Target the CUDA runtime image
+ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_CUDA_DEV_CONTAINER} as build
+
+# Unless otherwise specified, we make a fat build.
+ARG CUDA_DOCKER_ARCH=all
+
+RUN apt-get update && \
+    apt-get install -y build-essential git
+
+WORKDIR /app
+
+COPY . .
+
+# Set nvcc architecture
+ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
+# Enable CUDA
+ENV LLAMA_CUDA=1
+
+RUN make
+
+FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
+
+COPY --from=build /app/main /main
+
+ENTRYPOINT [ "/main" ]
--- a/.devops/main-intel.Dockerfile
+++ b/.devops/main-intel.Dockerfile
@@ -0,0 +1,26 @@
+ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
+
+FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
+
+ARG LLAMA_SYCL_F16=OFF
+RUN apt-get update && \
+    apt-get install -y git
+
+WORKDIR /app
+
+COPY . .
+
+RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
+        echo "LLAMA_SYCL_F16 is set" && \
+        export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
+    fi && \
+    cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
+    cmake --build build --config Release --target main
+
+FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
+
+COPY --from=build /app/build/bin/main /main
+
+ENV LC_ALL=C.utf8
+
+ENTRYPOINT [ "/main" ]
--- a/.devops/llama-cli-rocm.Dockerfile
+++ b/.devops/llama-cli-rocm.Dockerfile
@@ -6,7 +6,7 @@ ARG ROCM_VERSION=5.6
 # Target the CUDA build image
 ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete

-FROM ${BASE_ROCM_DEV_CONTAINER} AS build
+FROM ${BASE_ROCM_DEV_CONTAINER} as build

 # Unless otherwise specified, we make a fat build.
 # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
@@ -36,10 +36,10 @@ COPY . .
 # Set nvcc architecture
 ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
 # Enable ROCm
-ENV GGML_HIPBLAS=1
+ENV LLAMA_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++

-RUN make -j$(nproc) llama-cli
+RUN make

-ENTRYPOINT [ "/app/llama-cli" ]
+ENTRYPOINT [ "/app/main" ]
--- a/.devops/llama-cli-vulkan.Dockerfile
+++ b/.devops/llama-cli-vulkan.Dockerfile
@@ -1,9 +1,9 @@
 ARG UBUNTU_VERSION=jammy

-FROM ubuntu:$UBUNTU_VERSION AS build
+FROM ubuntu:$UBUNTU_VERSION as build

 # Install build tools
-RUN apt update && apt install -y git build-essential cmake wget libgomp1
+RUN apt update && apt install -y git build-essential cmake wget

 # Install Vulkan SDK
 RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
@@ -14,14 +14,14 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key
 # Build it
 WORKDIR /app
 COPY . .
-RUN cmake -B build -DGGML_VULKAN=1 && \
-    cmake --build build --config Release --target llama-cli
+RUN cmake -B build -DLLAMA_VULKAN=1 && \
+    cmake --build build --config Release --target main

 # Clean up
 WORKDIR /
-RUN cp /app/build/bin/llama-cli /llama-cli && \
+RUN cp /app/build/bin/main /main && \
    rm -rf /app

 ENV LC_ALL=C.utf8

-ENTRYPOINT [ "/llama-cli" ]
+ENTRYPOINT [ "/main" ]
--- a/.devops/main.Dockerfile
+++ b/.devops/main.Dockerfile
@@ -0,0 +1,20 @@
+ARG UBUNTU_VERSION=22.04
+
+FROM ubuntu:$UBUNTU_VERSION as build
+
+RUN apt-get update && \
+    apt-get install -y build-essential git
+
+WORKDIR /app
+
+COPY . .
+
+RUN make
+
+FROM ubuntu:$UBUNTU_VERSION as runtime
+
+COPY --from=build /app/main /main
+
+ENV LC_ALL=C.utf8
+
+ENTRYPOINT [ "/main" ]
--- a/.devops/nix/apps.nix
+++ b/.devops/nix/apps.nix
@@ -6,10 +6,11 @@
        let
          inherit (config.packages) default;
          binaries = [
-            "llama-cli"
+            "llama"
            "llama-embedding"
            "llama-server"
-            "llama-quantize"
+            "quantize"
+            "train-text-from-scratch"
          ];
          mkApp = name: {
            type = "app";
--- a/.devops/nix/devshells.nix
+++ b/.devops/nix/devshells.nix
@@ -1,52 +1,13 @@
-{ inputs, ... }:
-
 {
  perSystem =
-    {
-      config,
-      lib,
-      system,
-      ...
-    }:
+    { config, lib, ... }:
    {
      devShells =
-        let
-          pkgs = import inputs.nixpkgs { inherit system; };
-          stdenv = pkgs.stdenv;
-          scripts = config.packages.python-scripts;
-        in
-        lib.pipe (config.packages) [
-          (lib.concatMapAttrs (
-            name: package: {
-              ${name} = pkgs.mkShell {
-                name = "${name}";
-                inputsFrom = [ package ];
-                shellHook = ''
-                  echo "Entering ${name} devShell"
-                '';
-              };
-              "${name}-extra" =
-                if (name == "python-scripts") then
-                  null
-                else
-                  pkgs.mkShell {
-                    name = "${name}-extra";
-                    inputsFrom = [
-                      package
-                      scripts
-                    ];
-                    # Extra packages that *may* be used by some scripts
-                    packages = [
-                        pkgs.python3Packages.tiktoken
-                    ];
-                    shellHook = ''
-                      echo "Entering ${name} devShell"
-                      addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib stdenv.cc.cc}/lib"
-                    '';
-                  };
-            }
-          ))
-          (lib.filterAttrs (name: value: value != null))
-        ];
+        lib.concatMapAttrs
+          (name: package: {
+            ${name} = package.passthru.shell;
+            ${name + "-extra"} = package.passthru.shell-extra;
+          })
+          config.packages;
    };
 }
--- a/.devops/nix/nixpkgs-instances.nix
+++ b/.devops/nix/nixpkgs-instances.nix
@@ -26,14 +26,16 @@
          config.cudaSupport = true;
          config.allowUnfreePredicate =
            p:
-            builtins.all (
-              license:
-              license.free
-              || builtins.elem license.shortName [
-                "CUDA EULA"
-                "cuDNN EULA"
-              ]
-            ) (p.meta.licenses or [ p.meta.license ]);
+            builtins.all
+              (
+                license:
+                license.free
+                || builtins.elem license.shortName [
+                  "CUDA EULA"
+                  "cuDNN EULA"
+                ]
+              )
+              (p.meta.licenses or [ p.meta.license ]);
        };
        # Ensure dependencies use ROCm consistently
        pkgsRocm = import inputs.nixpkgs {
--- a/.devops/nix/package-gguf-py.nix
+++ b/.devops/nix/package-gguf-py.nix
@@ -1,36 +0,0 @@
-{
-  lib,
-  llamaVersion,
-  numpy,
-  tqdm,
-  sentencepiece,
-  pyyaml,
-  poetry-core,
-  buildPythonPackage,
-  pytestCheckHook,
-}:
-
-buildPythonPackage {
-  pname = "gguf";
-  version = llamaVersion;
-  pyproject = true;
-  nativeBuildInputs = [ poetry-core ];
-  propagatedBuildInputs = [
-    numpy
-    tqdm
-    sentencepiece
-    pyyaml
-  ];
-  src = lib.cleanSource ../../gguf-py;
-  pythonImportsCheck = [
-    "numpy"
-    "gguf"
-  ];
-  nativeCheckInputs = [ pytestCheckHook ];
-  doCheck = true;
-  meta = with lib; {
-    description = "Python package for writing binary files in the GGUF format";
-    license = licenses.mit;
-    maintainers = [ maintainers.ditsuke ];
-  };
-}
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@@ -3,35 +3,33 @@
  glibc,
  config,
  stdenv,
+  mkShell,
  runCommand,
  cmake,
  ninja,
  pkg-config,
  git,
+  python3,
  mpi,
  blas,
  cudaPackages,
-  autoAddDriverRunpath,
  darwin,
  rocmPackages,
  vulkan-headers,
  vulkan-loader,
-  curl,
-  shaderc,
-  useBlas ?
-    builtins.all (x: !x) [
-      useCuda
-      useMetalKit
-      useRocm
-      useVulkan
-    ]
-    && blas.meta.available,
+  clblast,
+  useBlas ? builtins.all (x: !x) [
+    useCuda
+    useMetalKit
+    useOpenCL
+    useRocm
+    useVulkan
+  ] && blas.meta.available,
  useCuda ? config.cudaSupport,
-  useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin,
-  # Increases the runtime closure size by ~700M
-  useMpi ? false,
+  useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL,
+  useMpi ? false, # Increases the runtime closure size by ~700M
+  useOpenCL ? false,
  useRocm ? config.rocmSupport,
-  enableCurl ? true,
  useVulkan ? false,
  llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake

@@ -39,8 +37,8 @@
  # otherwise we get libstdc++ errors downstream.
  effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv,
  enableStatic ? effectiveStdenv.hostPlatform.isStatic,
-  precompileMetalShaders ? false,
-}:
+  precompileMetalShaders ? false
+}@inputs:

 let
  inherit (lib)
@@ -48,6 +46,7 @@ let
    cmakeFeature
    optionals
    strings
+    versionOlder
    ;

  stdenv = throw "Use effectiveStdenv instead";
@@ -57,17 +56,45 @@ let
    ++ lib.optionals useCuda [ "CUDA" ]
    ++ lib.optionals useMetalKit [ "MetalKit" ]
    ++ lib.optionals useMpi [ "MPI" ]
+    ++ lib.optionals useOpenCL [ "OpenCL" ]
    ++ lib.optionals useRocm [ "ROCm" ]
    ++ lib.optionals useVulkan [ "Vulkan" ];

  pnameSuffix =
    strings.optionalString (suffices != [ ])
      "-${strings.concatMapStringsSep "-" strings.toLower suffices}";
-  descriptionSuffix = strings.optionalString (
-    suffices != [ ]
-  ) ", accelerated with ${strings.concatStringsSep ", " suffices}";
+  descriptionSuffix =
+    strings.optionalString (suffices != [ ])
+      ", accelerated with ${strings.concatStringsSep ", " suffices}";

-  xcrunHost = runCommand "xcrunHost" { } ''
+  executableSuffix = effectiveStdenv.hostPlatform.extensions.executable;
+
+  # TODO: package the Python in this repository in a Nix-like way.
+  # It'd be nice to migrate to buildPythonPackage, as well as ensure this repo
+  # is PEP 517-compatible, and ensure the correct .dist-info is generated.
+  # https://peps.python.org/pep-0517/
+  #
+  # TODO: Package up each Python script or service appropriately, by making
+  # them into "entrypoints"
+  llama-python = python3.withPackages (
+    ps: [
+      ps.numpy
+      ps.sentencepiece
+    ]
+  );
+
+  # TODO(Green-Sky): find a better way to opt-into the heavy ml python runtime
+  llama-python-extra = python3.withPackages (
+    ps: [
+      ps.numpy
+      ps.sentencepiece
+      ps.tiktoken
+      ps.torchWithoutCuda
+      ps.transformers
+    ]
+  );
+
+  xcrunHost = runCommand "xcrunHost" {} ''
    mkdir -p $out/bin
    ln -s /usr/bin/xcrun $out/bin
  '';
@@ -84,9 +111,16 @@ let
    ++ optionals useMetalKit [ MetalKit ];

  cudaBuildInputs = with cudaPackages; [
-    cuda_cudart
-    cuda_cccl # <nv/target>
-    libcublas
+    cuda_cccl.dev # <nv/target>
+
+    # A temporary hack for reducing the closure size, remove once cudaPackages
+    # have stopped using lndir: https://github.com/NixOS/nixpkgs/issues/271792
+    cuda_cudart.dev
+    cuda_cudart.lib
+    cuda_cudart.static
+    libcublas.dev
+    libcublas.lib
+    libcublas.static
  ];

  rocmBuildInputs = with rocmPackages; [
@@ -98,149 +132,187 @@ let
  vulkanBuildInputs = [
    vulkan-headers
    vulkan-loader
-    shaderc
  ];
 in

-effectiveStdenv.mkDerivation (finalAttrs: {
-  pname = "llama-cpp${pnameSuffix}";
-  version = llamaVersion;
+effectiveStdenv.mkDerivation (
+  finalAttrs: {
+    pname = "llama-cpp${pnameSuffix}";
+    version = llamaVersion;

-  # Note: none of the files discarded here are visible in the sandbox or
-  # affect the output hash. This also means they can be modified without
-  # triggering a rebuild.
-  src = lib.cleanSourceWith {
-    filter =
-      name: type:
-      let
-        noneOf = builtins.all (x: !x);
-        baseName = baseNameOf name;
-      in
-      noneOf [
-        (lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths
-        (lib.hasSuffix ".md" name) # Ignore *.md changes whe computing outPaths
-        (lib.hasPrefix "." baseName) # Skip hidden files and directories
-        (baseName == "flake.lock")
+    # Note: none of the files discarded here are visible in the sandbox or
+    # affect the output hash. This also means they can be modified without
+    # triggering a rebuild.
+    src = lib.cleanSourceWith {
+      filter =
+        name: type:
+        let
+          noneOf = builtins.all (x: !x);
+          baseName = baseNameOf name;
+        in
+        noneOf [
+          (lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths
+          (lib.hasSuffix ".md" name) # Ignore *.md changes whe computing outPaths
+          (lib.hasPrefix "." baseName) # Skip hidden files and directories
+          (baseName == "flake.lock")
+        ];
+      src = lib.cleanSource ../../.;
+    };
+
+    postPatch = ''
+      substituteInPlace ./ggml-metal.m \
+        --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
+      substituteInPlace ./ggml-metal.m \
+        --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
+    '';
+
+    # With PR#6015 https://github.com/ggerganov/llama.cpp/pull/6015,
+    # `default.metallib` may be compiled with Metal compiler from XCode
+    # and we need to escape sandbox on MacOS to access Metal compiler.
+    # `xcrun` is used find the path of the Metal compiler, which is varible
+    # and not on $PATH
+    # see https://github.com/ggerganov/llama.cpp/pull/6118 for discussion
+    __noChroot = effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders;
+
+    nativeBuildInputs =
+      [
+        cmake
+        ninja
+        pkg-config
+        git
+      ]
+      ++ optionals useCuda [
+        cudaPackages.cuda_nvcc
+
+        # TODO: Replace with autoAddDriverRunpath
+        # once https://github.com/NixOS/nixpkgs/pull/275241 has been merged
+        cudaPackages.autoAddOpenGLRunpathHook
+      ]
+      ++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [
+        glibc.static
+      ] ++ optionals (effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders) [
+        xcrunHost
      ];
-    src = lib.cleanSource ../../.;
-  };

-  postPatch = ''
-    substituteInPlace ./ggml/src/ggml-metal.m \
-      --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
-    substituteInPlace ./ggml/src/ggml-metal.m \
-      --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
-  '';
+    buildInputs =
+      optionals effectiveStdenv.isDarwin darwinBuildInputs
+      ++ optionals useCuda cudaBuildInputs
+      ++ optionals useMpi [ mpi ]
+      ++ optionals useOpenCL [ clblast ]
+      ++ optionals useRocm rocmBuildInputs
+      ++ optionals useBlas [ blas ]
+      ++ optionals useVulkan vulkanBuildInputs;

-  # With PR#6015 https://github.com/ggerganov/llama.cpp/pull/6015,
-  # `default.metallib` may be compiled with Metal compiler from XCode
-  # and we need to escape sandbox on MacOS to access Metal compiler.
-  # `xcrun` is used find the path of the Metal compiler, which is varible
-  # and not on $PATH
-  # see https://github.com/ggerganov/llama.cpp/pull/6118 for discussion
-  __noChroot = effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders;
-
-  nativeBuildInputs =
-    [
-      cmake
-      ninja
-      pkg-config
-      git
-    ]
-    ++ optionals useCuda [
-      cudaPackages.cuda_nvcc
-
-      autoAddDriverRunpath
-    ]
-    ++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [ glibc.static ]
-    ++ optionals (effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders) [ xcrunHost ];
-
-  buildInputs =
-    optionals effectiveStdenv.isDarwin darwinBuildInputs
-    ++ optionals useCuda cudaBuildInputs
-    ++ optionals useMpi [ mpi ]
-    ++ optionals useRocm rocmBuildInputs
-    ++ optionals useBlas [ blas ]
-    ++ optionals useVulkan vulkanBuildInputs
-    ++ optionals enableCurl [ curl ];
-
-  cmakeFlags =
-    [
-      (cmakeBool "LLAMA_BUILD_SERVER" true)
-      (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
-      (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
-      (cmakeBool "LLAMA_CURL" enableCurl)
-      (cmakeBool "GGML_NATIVE" false)
-      (cmakeBool "GGML_BLAS" useBlas)
-      (cmakeBool "GGML_CUDA" useCuda)
-      (cmakeBool "GGML_HIPBLAS" useRocm)
-      (cmakeBool "GGML_METAL" useMetalKit)
-      (cmakeBool "GGML_VULKAN" useVulkan)
-      (cmakeBool "GGML_STATIC" enableStatic)
-    ]
-    ++ optionals useCuda [
-      (
-        with cudaPackages.flags;
-        cmakeFeature "CMAKE_CUDA_ARCHITECTURES" (
-          builtins.concatStringsSep ";" (map dropDot cudaCapabilities)
+    cmakeFlags =
+      [
+        (cmakeBool "LLAMA_NATIVE" false)
+        (cmakeBool "LLAMA_BUILD_SERVER" true)
+        (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
+        (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
+        (cmakeBool "LLAMA_BLAS" useBlas)
+        (cmakeBool "LLAMA_CLBLAST" useOpenCL)
+        (cmakeBool "LLAMA_CUDA" useCuda)
+        (cmakeBool "LLAMA_HIPBLAS" useRocm)
+        (cmakeBool "LLAMA_METAL" useMetalKit)
+        (cmakeBool "LLAMA_VULKAN" useVulkan)
+        (cmakeBool "LLAMA_STATIC" enableStatic)
+      ]
+      ++ optionals useCuda [
+        (
+          with cudaPackages.flags;
+          cmakeFeature "CMAKE_CUDA_ARCHITECTURES" (
+            builtins.concatStringsSep ";" (map dropDot cudaCapabilities)
+          )
        )
-      )
-    ]
-    ++ optionals useRocm [
-      (cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
-      (cmakeFeature "CMAKE_HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets))
-    ]
-    ++ optionals useMetalKit [
-      (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
-      (cmakeBool "GGML_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
-    ];
+      ]
+      ++ optionals useRocm [
+        (cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
+        (cmakeFeature "CMAKE_HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets))
+      ]
+      ++ optionals useMetalKit [
+        (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
+        (cmakeBool "LLAMA_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
+      ];

-  # Environment variables needed for ROCm
-  env = optionals useRocm {
-    ROCM_PATH = "${rocmPackages.clr}";
-    HIP_DEVICE_LIB_PATH = "${rocmPackages.rocm-device-libs}/amdgcn/bitcode";
-  };
+    # Environment variables needed for ROCm
+    env = optionals useRocm {
+      ROCM_PATH = "${rocmPackages.clr}";
+      HIP_DEVICE_LIB_PATH = "${rocmPackages.rocm-device-libs}/amdgcn/bitcode";
+    };

-  # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
-  # if they haven't been added yet.
-  postInstall = ''
-    mkdir -p $out/include
-    cp $src/include/llama.h $out/include/
-  '';
+    # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
+    # if they haven't been added yet.
+    postInstall = ''
+      mv $out/bin/main${executableSuffix} $out/bin/llama${executableSuffix}
+      mv $out/bin/server${executableSuffix} $out/bin/llama-server${executableSuffix}
+      mkdir -p $out/include
+      cp $src/llama.h $out/include/
+    '';

-  meta = {
-    # Configurations we don't want even the CI to evaluate. Results in the
-    # "unsupported platform" messages. This is mostly a no-op, because
-    # cudaPackages would've refused to evaluate anyway.
-    badPlatforms = optionals useCuda lib.platforms.darwin;
+    # Define the shells here, but don't add in the inputsFrom to avoid recursion.
+    passthru = {
+      inherit
+        useBlas
+        useCuda
+        useMetalKit
+        useMpi
+        useOpenCL
+        useRocm
+        useVulkan
+        ;

-    # Configurations that are known to result in build failures. Can be
-    # overridden by importing Nixpkgs with `allowBroken = true`.
-    broken = (useMetalKit && !effectiveStdenv.isDarwin);
+      shell = mkShell {
+        name = "shell-${finalAttrs.finalPackage.name}";
+        description = "contains numpy and sentencepiece";
+        buildInputs = [ llama-python ];
+        inputsFrom = [ finalAttrs.finalPackage ];
+        shellHook = ''
+          addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib effectiveStdenv.cc.cc}/lib"
+        '';
+      };

-    description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
-    homepage = "https://github.com/ggerganov/llama.cpp/";
-    license = lib.licenses.mit;
+      shell-extra = mkShell {
+        name = "shell-extra-${finalAttrs.finalPackage.name}";
+        description = "contains numpy, sentencepiece, torchWithoutCuda, and transformers";
+        buildInputs = [ llama-python-extra ];
+        inputsFrom = [ finalAttrs.finalPackage ];
+      };
+    };

-    # Accommodates `nix run` and `lib.getExe`
-    mainProgram = "llama-cli";
+    meta = {
+      # Configurations we don't want even the CI to evaluate. Results in the
+      # "unsupported platform" messages. This is mostly a no-op, because
+      # cudaPackages would've refused to evaluate anyway.
+      badPlatforms = optionals (useCuda || useOpenCL) lib.platforms.darwin;

-    # These people might respond, on the best effort basis, if you ping them
-    # in case of Nix-specific regressions or for reviewing Nix-specific PRs.
-    # Consider adding yourself to this list if you want to ensure this flake
-    # stays maintained and you're willing to invest your time. Do not add
-    # other people without their consent. Consider removing people after
-    # they've been unreachable for long periods of time.
+      # Configurations that are known to result in build failures. Can be
+      # overridden by importing Nixpkgs with `allowBroken = true`.
+      broken = (useMetalKit && !effectiveStdenv.isDarwin);

-    # Note that lib.maintainers is defined in Nixpkgs, but you may just add
-    # an attrset following the same format as in
-    # https://github.com/NixOS/nixpkgs/blob/f36a80e54da29775c78d7eff0e628c2b4e34d1d7/maintainers/maintainer-list.nix
-    maintainers = with lib.maintainers; [
-      philiptaron
-      SomeoneSerge
-    ];
+      description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
+      homepage = "https://github.com/ggerganov/llama.cpp/";
+      license = lib.licenses.mit;

-    # Extend `badPlatforms` instead
-    platforms = lib.platforms.all;
-  };
-})
+      # Accommodates `nix run` and `lib.getExe`
+      mainProgram = "llama";
+
+      # These people might respond, on the best effort basis, if you ping them
+      # in case of Nix-specific regressions or for reviewing Nix-specific PRs.
+      # Consider adding yourself to this list if you want to ensure this flake
+      # stays maintained and you're willing to invest your time. Do not add
+      # other people without their consent. Consider removing people after
+      # they've been unreachable for long periods of time.
+
+      # Note that lib.maintainers is defined in Nixpkgs, but you may just add
+      # an attrset following the same format as in
+      # https://github.com/NixOS/nixpkgs/blob/f36a80e54da29775c78d7eff0e628c2b4e34d1d7/maintainers/maintainer-list.nix
+      maintainers = with lib.maintainers; [
+        philiptaron
+        SomeoneSerge
+      ];
+
+      # Extend `badPlatforms` instead
+      platforms = lib.platforms.all;
+    };
+  }
+)
--- a/.devops/nix/python-scripts.nix
+++ b/.devops/nix/python-scripts.nix
@@ -1,66 +0,0 @@
-{
-  lib,
-  stdenv,
-  buildPythonPackage,
-  poetry-core,
-  mkShell,
-  python3Packages,
-  gguf-py,
-}@inputs:
-
-let
-  llama-python-deps = with python3Packages; [
-    numpy
-    sentencepiece
-    transformers
-    protobuf
-    torchWithoutCuda
-    gguf-py
-    tqdm
-
-    # for scripts/compare-llama-bench.py
-    gitpython
-    tabulate
-
-    # for examples/pydantic-models-to-grammar-examples.py
-    docstring-parser
-    pydantic
-
-  ];
-
-  llama-python-test-deps = with python3Packages; [
-    # Server bench
-    matplotlib
-
-    # server tests
-    openai
-    behave
-    prometheus-client
-  ];
-in
-
-buildPythonPackage ({
-  pname = "llama-scripts";
-  version = "0.0.0";
-  pyproject = true;
-
-  # NOTE: The files filtered out here are not visible in the build sandbox, neither
-  # do they affect the output hash. They can be modified without triggering a rebuild.
-  src = lib.cleanSourceWith {
-    filter =
-      name: type:
-      let
-        any = builtins.any (x: x);
-        baseName = builtins.baseNameOf name;
-      in
-      any [
-        (lib.hasSuffix ".py" name)
-        (baseName == "README.md")
-        (baseName == "pyproject.toml")
-      ];
-    src = lib.cleanSource ../../.;
-  };
-  nativeBuildInputs = [ poetry-core ];
-  nativeCheckInputs = llama-python-test-deps;
-  dependencies = llama-python-deps;
-})
--- a/.devops/nix/scope.nix
+++ b/.devops/nix/scope.nix
@@ -1,41 +1,19 @@
 {
  lib,
  newScope,
-  python3,
  llamaVersion ? "0.0.0",
 }:

-let
-  pythonPackages = python3.pkgs;
-  buildPythonPackage = pythonPackages.buildPythonPackage;
-  numpy = pythonPackages.numpy;
-  tqdm = pythonPackages.tqdm;
-  sentencepiece = pythonPackages.sentencepiece;
-  pyyaml = pythonPackages.pyyaml;
-  poetry-core = pythonPackages.poetry-core;
-  pytestCheckHook = pythonPackages.pytestCheckHook;
-in
-
 # We're using `makeScope` instead of just writing out an attrset
 # because it allows users to apply overlays later using `overrideScope'`.
 # Cf. https://noogle.dev/f/lib/makeScope

-lib.makeScope newScope (self: {
-  inherit llamaVersion;
-  gguf-py = self.callPackage ./package-gguf-py.nix {
-    inherit
-      buildPythonPackage
-      numpy
-      tqdm
-      sentencepiece
-      poetry-core
-      pyyaml
-      pytestCheckHook
-      ;
-  };
-  python-scripts = self.callPackage ./python-scripts.nix { inherit buildPythonPackage poetry-core; };
-  llama-cpp = self.callPackage ./package.nix { };
-  docker = self.callPackage ./docker.nix { };
-  docker-min = self.callPackage ./docker.nix { interactive = false; };
-  sif = self.callPackage ./sif.nix { };
-})
+lib.makeScope newScope (
+  self: {
+    inherit llamaVersion;
+    llama-cpp = self.callPackage ./package.nix { };
+    docker = self.callPackage ./docker.nix { };
+    docker-min = self.callPackage ./docker.nix { interactive = false; };
+    sif = self.callPackage ./sif.nix { };
+  }
+)
--- a/.devops/server-cuda.Dockerfile
+++ b/.devops/server-cuda.Dockerfile
@@ -0,0 +1,37 @@
+ARG UBUNTU_VERSION=22.04
+# This needs to generally match the container host's environment.
+ARG CUDA_VERSION=11.7.1
+# Target the CUDA build image
+ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+# Target the CUDA runtime image
+ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_CUDA_DEV_CONTAINER} as build
+
+# Unless otherwise specified, we make a fat build.
+ARG CUDA_DOCKER_ARCH=all
+
+RUN apt-get update && \
+    apt-get install -y build-essential git libcurl4-openssl-dev
+
+WORKDIR /app
+
+COPY . .
+
+# Set nvcc architecture
+ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
+# Enable CUDA
+ENV LLAMA_CUDA=1
+# Enable cURL
+ENV LLAMA_CURL=1
+
+RUN make
+
+FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
+
+RUN apt-get update && \
+    apt-get install -y libcurl4-openssl-dev
+
+COPY --from=build /app/server /server
+
+ENTRYPOINT [ "/server" ]
--- a/.devops/server-intel.Dockerfile
+++ b/.devops/server-intel.Dockerfile
@@ -0,0 +1,29 @@
+ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
+
+FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
+
+ARG LLAMA_SYCL_F16=OFF
+RUN apt-get update && \
+    apt-get install -y git libcurl4-openssl-dev
+
+WORKDIR /app
+
+COPY . .
+
+RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
+        echo "LLAMA_SYCL_F16 is set" && \
+        export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
+    fi && \
+    cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
+    cmake --build build --config Release --target server
+
+FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
+
+RUN apt-get update && \
+    apt-get install -y libcurl4-openssl-dev
+
+COPY --from=build /app/build/bin/server /server
+
+ENV LC_ALL=C.utf8
+
+ENTRYPOINT [ "/server" ]
--- a/.devops/llama-server-rocm.Dockerfile
+++ b/.devops/llama-server-rocm.Dockerfile
@@ -6,7 +6,7 @@ ARG ROCM_VERSION=5.6
 # Target the CUDA build image
 ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete

-FROM ${BASE_ROCM_DEV_CONTAINER} AS build
+FROM ${BASE_ROCM_DEV_CONTAINER} as build

 # Unless otherwise specified, we make a fat build.
 # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
@@ -36,19 +36,15 @@ COPY . .
 # Set nvcc architecture
 ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
 # Enable ROCm
-ENV GGML_HIPBLAS=1
+ENV LLAMA_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++
-# Must be set to 0.0.0.0 so it can listen to requests from host machine
-ENV LLAMA_ARG_HOST=0.0.0.0

 # Enable cURL
 ENV LLAMA_CURL=1
 RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev curl
+    apt-get install -y libcurl4-openssl-dev

-RUN make -j$(nproc) llama-server
+RUN make

-HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
-
-ENTRYPOINT [ "/app/llama-server" ]
+ENTRYPOINT [ "/app/server" ]
--- a/.devops/server-vulkan.Dockerfile
+++ b/.devops/server-vulkan.Dockerfile
@@ -0,0 +1,31 @@
+ARG UBUNTU_VERSION=jammy
+
+FROM ubuntu:$UBUNTU_VERSION as build
+
+# Install build tools
+RUN apt update && apt install -y git build-essential cmake wget
+
+# Install Vulkan SDK
+RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
+    wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
+    apt update -y && \
+    apt-get install -y vulkan-sdk
+
+# Install cURL
+RUN apt-get update && \
+    apt-get install -y libcurl4-openssl-dev
+
+# Build it
+WORKDIR /app
+COPY . .
+RUN cmake -B build -DLLAMA_VULKAN=1 -DLLAMA_CURL=1 && \
+    cmake --build build --config Release --target server
+
+# Clean up
+WORKDIR /
+RUN cp /app/build/bin/server /server && \
+    rm -rf /app
+
+ENV LC_ALL=C.utf8
+
+ENTRYPOINT [ "/server" ]
--- a/.devops/server.Dockerfile
+++ b/.devops/server.Dockerfile
@@ -0,0 +1,25 @@
+ARG UBUNTU_VERSION=22.04
+
+FROM ubuntu:$UBUNTU_VERSION as build
+
+RUN apt-get update && \
+    apt-get install -y build-essential git libcurl4-openssl-dev
+
+WORKDIR /app
+
+COPY . .
+
+ENV LLAMA_CURL=1
+
+RUN make
+
+FROM ubuntu:$UBUNTU_VERSION as runtime
+
+RUN apt-get update && \
+    apt-get install -y libcurl4-openssl-dev
+
+COPY --from=build /app/server /server
+
+ENV LC_ALL=C.utf8
+
+ENTRYPOINT [ "/server" ]
--- a/.devops/tools.sh
+++ b/.devops/tools.sh
@@ -8,11 +8,13 @@ arg1="$1"
 shift

 if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
-    python3 ./convert_hf_to_gguf.py "$@"
+    python3 ./convert.py "$@"
 elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
-    ./llama-quantize "$@"
+    ./quantize "$@"
 elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
-    ./llama-cli "$@"
+    ./main "$@"
+elif [[ "$arg1" == '--finetune' || "$arg1" == '-f' ]]; then
+    ./finetune "$@"
 elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
    echo "Converting PTH to GGML..."
    for i in `ls $1/$2/ggml-model-f16.bin*`; do
@@ -20,11 +22,11 @@ elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
            echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
        else
            echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
-            ./llama-quantize "$i" "${i/f16/q4_0}" q4_0
+            ./quantize "$i" "${i/f16/q4_0}" q4_0
        fi
    done
 elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
-    ./llama-server "$@"
+    ./server "$@"
 else
    echo "Unknown command: $arg1"
    echo "Available commands: "
@@ -34,6 +36,8 @@ else
    echo "              ex: --outtype f16 \"/models/7B/\" "
    echo "  --quantize (-q): Optimize with quantization process ggml"
    echo "              ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
+    echo "  --finetune (-f): Run finetune command to create a lora finetune of the model"
+    echo "              See documentation for finetune for command-line parameters"
    echo "  --all-in-one (-a): Execute --convert & --quantize"
    echo "              ex: \"/models/\" 7B"
    echo "  --server (-s): Run a model on the server"
--- a/.dockerignore
+++ b/.dockerignore
@@ -12,8 +12,8 @@ build*/

 models/*

-/llama-cli
-/llama-quantize
+/main
+/quantize

 arm_neon.h
 compile_commands.json
--- a/.ecrc
+++ b/.ecrc
@@ -1,5 +1,5 @@
 {
-  "Exclude": ["^\\.gitmodules$", "stb_image\\.h"],
+  "Exclude": ["^\\.gitmodules$"],
  "Disable": {
    "IndentSize": true
  }
--- a/.editorconfig
+++ b/.editorconfig
@@ -26,7 +26,3 @@ indent_size = 2

 [examples/llama.swiftui/llama.swiftui.xcodeproj/*]
 indent_style = tab
-
-[examples/cvector-generator/*.txt]
-trim_trailing_whitespace = unset
-insert_final_newline = unset
--- a/.github/ISSUE_TEMPLATE/01-bug-low.yml
+++ b/.github/ISSUE_TEMPLATE/01-bug-low.yml
@@ -1,50 +0,0 @@
-name: Low Severity Bugs
-description: Used to report low severity bugs in llama.cpp (e.g. cosmetic issues, non critical UI glitches)
-title: "Bug: "
-labels: ["bug-unconfirmed", "low severity"]
-body:
-  - type: markdown
-    attributes:
-      value: |
-        Thanks for taking the time to fill out this bug report!
-        Please include information about your system, the steps to reproduce the bug,
-        and the version of llama.cpp that you are using.
-        If possible, please provide a minimal code example that reproduces the bug.
-  - type: textarea
-    id: what-happened
-    attributes:
-      label: What happened?
-      description: Also tell us, what did you expect to happen?
-      placeholder: Tell us what you see!
-    validations:
-      required: true
-  - type: textarea
-    id: version
-    attributes:
-      label: Name and Version
-      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
-      placeholder: |
-        $./llama-cli --version
-        version: 2999 (42b4109e)
-        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
-    validations:
-      required: true
-  - type: dropdown
-    id: operating-system
-    attributes:
-      label: What operating system are you seeing the problem on?
-      multiple: true
-      options:
-        - Linux
-        - Mac
-        - Windows
-        - BSD
-        - Other? (Please let us know in description)
-    validations:
-      required: false
-  - type: textarea
-    id: logs
-    attributes:
-      label: Relevant log output
-      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
-      render: shell
--- a/.github/ISSUE_TEMPLATE/02-bug-medium.yml
+++ b/.github/ISSUE_TEMPLATE/02-bug-medium.yml
@@ -1,50 +0,0 @@
-name: Medium Severity Bug
-description: Used to report medium severity bugs in llama.cpp (e.g. Malfunctioning Features but generally still useable)
-title: "Bug: "
-labels: ["bug-unconfirmed", "medium severity"]
-body:
-  - type: markdown
-    attributes:
-      value: |
-        Thanks for taking the time to fill out this bug report!
-        Please include information about your system, the steps to reproduce the bug,
-        and the version of llama.cpp that you are using.
-        If possible, please provide a minimal code example that reproduces the bug.
-  - type: textarea
-    id: what-happened
-    attributes:
-      label: What happened?
-      description: Also tell us, what did you expect to happen?
-      placeholder: Tell us what you see!
-    validations:
-      required: true
-  - type: textarea
-    id: version
-    attributes:
-      label: Name and Version
-      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
-      placeholder: |
-        $./llama-cli --version
-        version: 2999 (42b4109e)
-        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
-    validations:
-      required: true
-  - type: dropdown
-    id: operating-system
-    attributes:
-      label: What operating system are you seeing the problem on?
-      multiple: true
-      options:
-        - Linux
-        - Mac
-        - Windows
-        - BSD
-        - Other? (Please let us know in description)
-    validations:
-      required: false
-  - type: textarea
-    id: logs
-    attributes:
-      label: Relevant log output
-      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
-      render: shell
--- a/.github/ISSUE_TEMPLATE/03-bug-high.yml
+++ b/.github/ISSUE_TEMPLATE/03-bug-high.yml
@@ -1,50 +0,0 @@
-name: High Severity Bug
-description: Used to report high severity bugs in llama.cpp (e.g. Malfunctioning features hindering important common workflow)
-title: "Bug: "
-labels: ["bug-unconfirmed", "high severity"]
-body:
-  - type: markdown
-    attributes:
-      value: |
-        Thanks for taking the time to fill out this bug report!
-        Please include information about your system, the steps to reproduce the bug,
-        and the version of llama.cpp that you are using.
-        If possible, please provide a minimal code example that reproduces the bug.
-  - type: textarea
-    id: what-happened
-    attributes:
-      label: What happened?
-      description: Also tell us, what did you expect to happen?
-      placeholder: Tell us what you see!
-    validations:
-      required: true
-  - type: textarea
-    id: version
-    attributes:
-      label: Name and Version
-      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
-      placeholder: |
-        $./llama-cli --version
-        version: 2999 (42b4109e)
-        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
-    validations:
-      required: true
-  - type: dropdown
-    id: operating-system
-    attributes:
-      label: What operating system are you seeing the problem on?
-      multiple: true
-      options:
-        - Linux
-        - Mac
-        - Windows
-        - BSD
-        - Other? (Please let us know in description)
-    validations:
-      required: false
-  - type: textarea
-    id: logs
-    attributes:
-      label: Relevant log output
-      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
-      render: shell
--- a/.github/ISSUE_TEMPLATE/04-bug-critical.yml
+++ b/.github/ISSUE_TEMPLATE/04-bug-critical.yml
@@ -1,50 +0,0 @@
-name: Critical Severity Bug
-description: Used to report critical severity bugs in llama.cpp (e.g. Crashing, Corrupted, Dataloss)
-title: "Bug: "
-labels: ["bug-unconfirmed", "critical severity"]
-body:
-  - type: markdown
-    attributes:
-      value: |
-        Thanks for taking the time to fill out this bug report!
-        Please include information about your system, the steps to reproduce the bug,
-        and the version of llama.cpp that you are using.
-        If possible, please provide a minimal code example that reproduces the bug.
-  - type: textarea
-    id: what-happened
-    attributes:
-      label: What happened?
-      description: Also tell us, what did you expect to happen?
-      placeholder: Tell us what you see!
-    validations:
-      required: true
-  - type: textarea
-    id: version
-    attributes:
-      label: Name and Version
-      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
-      placeholder: |
-        $./llama-cli --version
-        version: 2999 (42b4109e)
-        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
-    validations:
-      required: true
-  - type: dropdown
-    id: operating-system
-    attributes:
-      label: What operating system are you seeing the problem on?
-      multiple: true
-      options:
-        - Linux
-        - Mac
-        - Windows
-        - BSD
-        - Other? (Please let us know in description)
-    validations:
-      required: false
-  - type: textarea
-    id: logs
-    attributes:
-      label: Relevant log output
-      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
-      render: shell
--- a/.github/ISSUE_TEMPLATE/05-enhancement.yml
+++ b/.github/ISSUE_TEMPLATE/05-enhancement.yml
@@ -1,51 +0,0 @@
-name: Enhancement
-description: Used to request enhancements for llama.cpp
-title: "Feature Request: "
-labels: ["enhancement"]
-body:
-  - type: markdown
-    attributes:
-      value: |
-        [Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggerganov/llama.cpp/discussions/categories/ideas)
-
-  - type: checkboxes
-    id: prerequisites
-    attributes:
-      label: Prerequisites
-      description: Please confirm the following before submitting your enhancement request.
-      options:
-        - label: I am running the latest code. Mention the version if possible as well.
-          required: true
-        - label: I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
-          required: true
-        - label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed).
-          required: true
-        - label: I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new and useful enhancement to share.
-          required: true
-
-  - type: textarea
-    id: feature-description
-    attributes:
-      label: Feature Description
-      description: Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
-      placeholder: Detailed description of the enhancement
-    validations:
-      required: true
-
-  - type: textarea
-    id: motivation
-    attributes:
-      label: Motivation
-      description: Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
-      placeholder: Explanation of why this feature is needed and its benefits
-    validations:
-      required: true
-
-  - type: textarea
-    id: possible-implementation
-    attributes:
-      label: Possible Implementation
-      description: If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.
-      placeholder: Detailed description of potential implementation
-    validations:
-      required: false
--- a/.github/ISSUE_TEMPLATE/06-research.yml
+++ b/.github/ISSUE_TEMPLATE/06-research.yml
@@ -1,52 +0,0 @@
-name: Research
-description: Track new technical research area
-title: "Research: "
-labels: ["research 🔬"]
-body:
-  - type: markdown
-    attributes:
-      value: |
-        Don't forget to check for any [duplicate research issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)
-
-  - type: checkboxes
-    id: research-stage
-    attributes:
-      label: Research Stage
-      description: Track general state of this research ticket
-      options:
-        - label: Background Research (Let's try to avoid reinventing the wheel)
-        - label: Hypothesis Formed (How do you think this will work and it's effect?)
-        - label: Strategy / Implementation Forming
-        - label: Analysis of results
-        - label: Debrief / Documentation (So people in the future can learn from us)
-
-  - type: textarea
-    id: background
-    attributes:
-      label: Previous existing literature and research
-      description: Whats the current state of the art and whats the motivation for this research?
-
-  - type: textarea
-    id: hypothesis
-    attributes:
-      label: Hypothesis
-      description: How do you think this will work and it's effect?
-
-  - type: textarea
-    id: implementation
-    attributes:
-      label: Implementation
-      description: Got an approach? e.g. a PR ready to go?
-
-  - type: textarea
-    id: analysis
-    attributes:
-      label: Analysis
-      description: How does the proposed implementation behave?
-
-  - type: textarea
-    id: logs
-    attributes:
-      label: Relevant log output
-      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
-      render: shell
--- a/.github/ISSUE_TEMPLATE/07-refactor.yml
+++ b/.github/ISSUE_TEMPLATE/07-refactor.yml
@@ -1,28 +0,0 @@
-name: Refactor (Maintainers)
-description: Used to track refactoring opportunities
-title: "Refactor: "
-labels: ["refactor"]
-body:
-  - type: markdown
-    attributes:
-      value: |
-        Don't forget to [check for existing refactor issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered.
-        Also you may want to check [Pull request refactor label as well](https://github.com/ggerganov/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too.
-
-  - type: textarea
-    id: background-description
-    attributes:
-      label: Background Description
-      description: Please provide a detailed written description of the pain points you are trying to solve.
-      placeholder: Detailed description behind your motivation to request refactor
-    validations:
-      required: true
-
-  - type: textarea
-    id: possible-approaches
-    attributes:
-      label: Possible Refactor Approaches
-      description: If you have some idea of possible approaches to solve this problem. You may want to make it a todo list.
-      placeholder: Your idea of possible refactoring opportunity/approaches
-    validations:
-      required: false
--- a/.github/ISSUE_TEMPLATE/bug.md
+++ b/.github/ISSUE_TEMPLATE/bug.md
@@ -0,0 +1,11 @@
+---
+name: Bug template
+about: Used to report bugs in llama.cpp
+labels: ["bug-unconfirmed"]
+assignees: ''
+
+---
+
+Please include information about your system, the steps to reproduce the bug, and the version of llama.cpp that you are using. If possible, please provide a minimal code example that reproduces the bug.
+
+If the bug concerns the server, please try to reproduce it first using the [server test scenario framework](https://github.com/ggerganov/llama.cpp/tree/master/examples/server/tests).
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -1,11 +0,0 @@
-blank_issues_enabled: true
-contact_links:
-  - name: Got an idea?
-    url: https://github.com/ggerganov/llama.cpp/discussions/categories/ideas
-    about: Pop it there. It may then become an enhancement ticket.
-  - name: Got a question?
-    url: https://github.com/ggerganov/llama.cpp/discussions/categories/q-a
-    about: Ask a question there!
-  - name: Want to contribute?
-    url: https://github.com/ggerganov/llama.cpp/wiki/contribute
-    about: Head to the contribution guide page of the wiki for areas you can help with
--- a/.github/ISSUE_TEMPLATE/enhancement.md
+++ b/.github/ISSUE_TEMPLATE/enhancement.md
@@ -0,0 +1,28 @@
+---
+name: Enhancement template
+about: Used to request enhancements for llama.cpp
+labels: ["enhancement"]
+assignees: ''
+
+---
+
+# Prerequisites
+
+Please answer the following questions for yourself before submitting an issue.
+
+- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now.
+- [ ] I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
+- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed).
+- [ ] I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new bug or useful enhancement to share.
+
+# Feature Description
+
+Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
+
+# Motivation
+
+Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
+
+# Possible Implementation
+
+If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -1,34 +1,20 @@
 # https://github.com/actions/labeler
-Kompute:
-    - changed-files:
-        - any-glob-to-any-file:
-            - ggml/include/ggml-kompute.h
-            - ggml/src/ggml-kompute.cpp
-            - README-kompute.md
-Apple Metal:
-    - changed-files:
-        - any-glob-to-any-file:
-            - ggml/include/ggml-metal.h
-            - ggml/src/ggml-metal.cpp
-            - README-metal.md
+
 SYCL:
    - changed-files:
        - any-glob-to-any-file:
-            - ggml/include/ggml-sycl.h
-            - ggml/src/ggml-sycl.cpp
-            - ggml/src/ggml-sycl/**
-            - docs/backend/SYCL.md
-            - examples/sycl/**
+            - ggml-sycl.h
+            - ggml-sycl.cpp
+            - README-sycl.md
 Nvidia GPU:
    - changed-files:
        - any-glob-to-any-file:
-            - ggml/include/ggml-cuda.h
-            - ggml/src/ggml-cuda/**
+            - ggml-cuda/**
 Vulkan:
    - changed-files:
        - any-glob-to-any-file:
-            - ggml/ggml_vk_generate_shaders.py
-            - ggml/src/ggml-vulkan*
+            - ggml_vk_generate_shaders.py
+            - ggml-vulkan*
 documentation:
    - changed-files:
        - any-glob-to-any-file:
@@ -44,6 +30,7 @@ build:
            - cmake/**
            - CMakeLists.txt
            - CMakePresets.json
+            - codecov.yml
 examples:
    - changed-files:
        - any-glob-to-any-file: examples/**
@@ -75,10 +62,8 @@ server:
 ggml:
    - changed-files:
        - any-glob-to-any-file:
-            - ggml/include/ggml*.h
-            - ggml/src/ggml*.c
-            - ggml/src/ggml*.cpp
-            - ggml/src/ggml*.h
+            - ggml-*.c
+            - ggml-*.h
            - ggml-cuda/**
 nix:
    - changed-files:
@@ -86,6 +71,3 @@ nix:
            - "**/*.nix"
            - .github/workflows/nix-*.yml
            - .devops/nix/nixpkgs-instances.nix
-embedding:
-    - changed-files:
-        - any-glob-to-any-file: examples/embedding/
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -1,7 +0,0 @@
-
-
- [x] I have read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md)
- Self-reported review complexity:
-  - [ ] Low
-  - [ ] Medium
-  - [ ] High
--- a/.github/workflows/bench.yml.disabled
+++ b/.github/workflows/bench.yml.disabled
@@ -1,6 +1,3 @@
-# TODO: there have been some issues with the workflow, so disabling for now
-#       https://github.com/ggerganov/llama.cpp/issues/7893
-#
 # Benchmark
 name: Benchmark

@@ -112,7 +109,7 @@ jobs:
        run: |
          set -eux
          cmake -B build \
-              -DGGML_NATIVE=OFF \
+              -DLLAMA_NATIVE=OFF \
              -DLLAMA_BUILD_SERVER=ON \
              -DLLAMA_CURL=ON \
              -DLLAMA_CUBLAS=ON \
@@ -122,7 +119,7 @@ jobs:
              -DLLAMA_FATAL_WARNINGS=OFF \
              -DLLAMA_ALL_WARNINGS=OFF \
              -DCMAKE_BUILD_TYPE=Release;
-          cmake --build build --config Release -j $(nproc) --target llama-server
+          cmake --build build --config Release -j $(nproc) --target server

      - name: Download the dataset
        id: download_dataset
@@ -132,8 +129,6 @@ jobs:

      - name: Server bench
        id: server_bench
-        env:
-            HEAD_REF: ${{ github.head_ref || github.ref_name }}
        run: |
          set -eux

@@ -142,7 +137,7 @@ jobs:
          python bench.py \
              --runner-label ${{ env.RUNNER_LABEL }} \
              --name ${{ github.job }} \
-              --branch $HEAD_REF \
+              --branch ${{ github.head_ref || github.ref_name }} \
              --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \
              --scenario script.js \
              --duration ${{ github.event.inputs.duration || env.DURATION }} \
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -10,10 +10,10 @@ on:
  push:
    branches:
      - master
-    paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
+    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
  pull_request:
    types: [opened, synchronize, reopened]
-    paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
+    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']

 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@@ -23,9 +23,6 @@ env:
  BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
  GGML_NLOOP: 3
  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1

 jobs:
  macOS-latest-cmake-arm64:
@@ -50,7 +47,7 @@ jobs:
          sysctl -a
          mkdir build
          cd build
-          cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF ..
+          cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON ..
          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

      - name: Test
@@ -87,7 +84,7 @@ jobs:
          name: llama-bin-macos-arm64.zip

  macOS-latest-cmake-x64:
-    runs-on: macos-12
+    runs-on: macos-latest

    steps:
      - name: Clone
@@ -106,10 +103,12 @@ jobs:
        id: cmake_build
        run: |
          sysctl -a
+          mkdir build
+          cd build
          # Metal is disabled due to intermittent failures with Github runners not having a GPU:
          # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
-          cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF
-          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
+          cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF -DLLAMA_CURL=ON ..
+          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

      - name: Test
        id: cmake_test
@@ -225,7 +224,7 @@ jobs:
        run: |
          mkdir build
          cd build
-          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF
+          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON
          cmake --build . --config Release -j $(nproc)

      - name: Test
@@ -242,8 +241,8 @@ jobs:
          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
          echo "Fetch llama2c model"
          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
-          ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
-          ./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
+          ./bin/convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
+          ./bin/main -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256

      - name: Determine tag name
        id: tag
@@ -295,22 +294,12 @@ jobs:

      - name: Build
        id: cmake_build
-        if: ${{ matrix.sanitizer != 'THREAD' }}
        run: |
          mkdir build
          cd build
          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
          cmake --build . --config ${{ matrix.build_type }} -j $(nproc)

-      - name: Build (no OpenMP)
-        id: cmake_build_no_openmp
-        if: ${{ matrix.sanitizer == 'THREAD' }}
-        run: |
-          mkdir build
-          cd build
-          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DGGML_OPENMP=OFF
-          cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
-
      - name: Test
        id: cmake_test
        run: |
@@ -338,7 +327,7 @@ jobs:
        run: |
          mkdir build
          cd build
-          cmake -DGGML_RPC=ON ..
+          cmake -DLLAMA_RPC=ON ..
          cmake --build . --config Release -j $(nproc)

      - name: Test
@@ -358,17 +347,15 @@ jobs:
      - name: Dependencies
        id: depends
        run: |
-          wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
-          sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
-          sudo apt-get update -y
-          sudo apt-get install -y build-essential vulkan-sdk
+          sudo apt-get update
+          sudo apt-get install build-essential libvulkan-dev

      - name: Build
        id: cmake_build
        run: |
          mkdir build
          cd build
-          cmake -DGGML_VULKAN=ON ..
+          cmake -DLLAMA_VULKAN=ON ..
          cmake --build . --config Release -j $(nproc)

  ubuntu-22-cmake-hip:
@@ -378,7 +365,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3

      - name: Dependencies
        id: depends
@@ -389,13 +376,13 @@ jobs:
      - name: Build with native CMake HIP support
        id: cmake_build
        run: |
-          cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DGGML_HIPBLAS=ON
+          cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DLLAMA_HIPBLAS=ON
          cmake --build build --config Release -j $(nproc)

      - name: Build with legacy HIP support
        id: cmake_build_legacy_hip
        run: |
-          cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIPBLAS=ON
+          cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DLLAMA_HIPBLAS=ON
          cmake --build build2 --config Release -j $(nproc)

  ubuntu-22-cmake-sycl:
@@ -404,7 +391,7 @@ jobs:
    continue-on-error: true

    steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v2

      - name: add oneAPI to apt
        shell: bash
@@ -436,7 +423,7 @@ jobs:
          source /opt/intel/oneapi/setvars.sh
          mkdir build
          cd build
-          cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
+          cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
          cmake --build . --config Release -j $(nproc)

  ubuntu-22-cmake-sycl-fp16:
@@ -445,7 +432,7 @@ jobs:
    continue-on-error: true

    steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v2

      - name: add oneAPI to apt
        shell: bash
@@ -477,10 +464,10 @@ jobs:
          source /opt/intel/oneapi/setvars.sh
          mkdir build
          cd build
-          cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON ..
+          cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON ..
          cmake --build . --config Release -j $(nproc)

-  # TODO: build with GGML_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
+  # TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
  #       how to debug it.
  #       ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
  macOS-latest-make:
@@ -502,15 +489,15 @@ jobs:
        env:
            LLAMA_FATAL_WARNINGS: 1
        run: |
-          GGML_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu)
+          LLAMA_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu)

      - name: Test
        id: make_test
        run: |
-          GGML_NO_METAL=1 make tests -j $(sysctl -n hw.logicalcpu)
-          GGML_NO_METAL=1 make test  -j $(sysctl -n hw.logicalcpu)
+          LLAMA_NO_METAL=1 make tests -j $(sysctl -n hw.logicalcpu)
+          LLAMA_NO_METAL=1 make test  -j $(sysctl -n hw.logicalcpu)

-  # TODO: build with GGML_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know
+  # TODO: build with LLAMA_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know
  #       how to debug it.
  #       ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584
  #       would be great if we fix these
@@ -534,7 +521,7 @@ jobs:
          sysctl -a
          mkdir build
          cd build
-          cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF ..
+          cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF ..
          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

      - name: Test
@@ -549,7 +536,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v1

      - name: Dependencies
        id: depends
@@ -564,14 +551,13 @@ jobs:
          mkdir build
          cd build
          cmake -G Xcode .. \
-            -DGGML_METAL_EMBED_LIBRARY=ON \
+            -DLLAMA_METAL_EMBED_LIBRARY=ON \
            -DLLAMA_BUILD_EXAMPLES=OFF \
            -DLLAMA_BUILD_TESTS=OFF \
            -DLLAMA_BUILD_SERVER=OFF \
            -DCMAKE_SYSTEM_NAME=iOS \
-            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
-            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
-          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
+            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0
+          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

  macOS-latest-cmake-tvos:
    runs-on: macos-latest
@@ -579,7 +565,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v1

      - name: Dependencies
        id: depends
@@ -594,14 +580,13 @@ jobs:
          mkdir build
          cd build
          cmake -G Xcode .. \
-            -DGGML_METAL_EMBED_LIBRARY=ON \
+            -DLLAMA_METAL_EMBED_LIBRARY=ON \
            -DLLAMA_BUILD_EXAMPLES=OFF \
            -DLLAMA_BUILD_TESTS=OFF \
            -DLLAMA_BUILD_SERVER=OFF \
            -DCMAKE_SYSTEM_NAME=tvOS \
-            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
-            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
-          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
+            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0
+          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

  macOS-latest-swift:
    runs-on: macos-latest
@@ -613,7 +598,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v1

      - name: Dependencies
        id: depends
@@ -669,7 +654,7 @@ jobs:
      - name: Build using make w/ OpenBLAS
        shell: msys2 {0}
        run: |
-            make GGML_OPENBLAS=1 -j $(nproc)
+            make LLAMA_OPENBLAS=1 -j $(nproc)

      - name: Build using CMake
        shell: msys2 {0}
@@ -685,38 +670,44 @@ jobs:
      - name: Build using CMake w/ OpenBLAS
        shell: msys2 {0}
        run: |
-            cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
+            cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
            cmake --build build --config ${{ matrix.build }} -j $(nproc)

  windows-latest-cmake:
-    runs-on: windows-2019
+    runs-on: windows-latest

    env:
      OPENBLAS_VERSION: 0.3.23
+      OPENCL_VERSION: 2023.04.17
+      CLBLAST_VERSION: 1.6.0
      SDE_VERSION: 9.33.0-2024-01-07
      VULKAN_VERSION: 1.3.261.1

    strategy:
      matrix:
        include:
+          - build: 'rpc-x64'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_RPC=ON -DBUILD_SHARED_LIBS=ON'
          - build: 'noavx-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
          - build: 'avx2-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
          - build: 'avx-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
          - build: 'avx512-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
+          - build: 'clblast-x64'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
          - build: 'openblas-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
          - build: 'kompute-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
          - build: 'vulkan-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
          - build: 'llvm-arm64'
-            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
          - build: 'msvc-arm64'
-            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'

    steps:
      - name: Clone
@@ -729,7 +720,28 @@ jobs:
        id: clone_kompute
        if: ${{ matrix.build == 'kompute-x64' }}
        run: |
-          git submodule update --init ggml/src/kompute
+          git submodule update --init kompute
+
+      - name: Download OpenCL SDK
+        id: get_opencl
+        if: ${{ matrix.build == 'clblast-x64' }}
+        run: |
+          curl.exe -o $env:RUNNER_TEMP/opencl.zip -L "https://github.com/KhronosGroup/OpenCL-SDK/releases/download/v${env:OPENCL_VERSION}/OpenCL-SDK-v${env:OPENCL_VERSION}-Win-x64.zip"
+          mkdir $env:RUNNER_TEMP/opencl
+          tar.exe -xvf $env:RUNNER_TEMP/opencl.zip --strip-components=1 -C $env:RUNNER_TEMP/opencl
+
+      - name: Download CLBlast
+        id: get_clblast
+        if: ${{ matrix.build == 'clblast-x64' }}
+        run: |
+          curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z"
+          curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE"
+          7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/clblast.7z
+          rename-item $env:RUNNER_TEMP/CLBlast-${env:CLBLAST_VERSION}-windows-x64 clblast
+          foreach ($f in (gci -Recurse -Path "$env:RUNNER_TEMP/clblast" -Filter '*.cmake')) {
+            $txt = Get-Content -Path $f -Raw
+            $txt.Replace('C:/vcpkg/packages/opencl_x64-windows/', "$($env:RUNNER_TEMP.Replace('\','/'))/opencl/") | Set-Content -Path $f -Encoding UTF8
+          }

      - name: Download OpenBLAS
        id: get_openblas
@@ -764,6 +776,13 @@ jobs:
          cmake -S . -B build ${{ matrix.defines }}
          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}

+      - name: Add clblast.dll
+        id: add_clblast_dll
+        if: ${{ matrix.build == 'clblast-x64' }}
+        run: |
+          cp $env:RUNNER_TEMP/clblast/lib/clblast.dll ./build/bin/Release
+          cp $env:RUNNER_TEMP/CLBlast.LICENSE.txt ./build/bin/Release/CLBlast-${env:CLBLAST_VERSION}.txt
+
      - name: Add libopenblas.dll
        id: add_libopenblas_dll
        if: ${{ matrix.build == 'openblas-x64' }}
@@ -787,7 +806,7 @@ jobs:
      - name: Test
        id: cmake_test
        # not all machines have native AVX-512
-        if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
+        if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'clblast-x64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
        run: |
          cd build
          ctest -L main -C Release --verbose --timeout 900
@@ -802,7 +821,6 @@ jobs:
          7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
          $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
          cd build
-          $env:LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR = 1
          & $sde -future -- ctest -L main -C Release --verbose --timeout 900

      - name: Determine tag name
@@ -833,7 +851,7 @@ jobs:
          name: llama-bin-win-${{ matrix.build }}.zip

  windows-latest-cmake-cuda:
-    runs-on: windows-2019
+    runs-on: windows-latest

    strategy:
      matrix:
@@ -847,9 +865,8 @@ jobs:
        with:
          fetch-depth: 0

-      - name: Install CUDA toolkit
+      - uses: Jimver/cuda-toolkit@v0.2.11
        id: cuda-toolkit
-        uses: Jimver/cuda-toolkit@v0.2.15
        with:
          cuda: ${{ matrix.cuda }}
          method: 'network'
@@ -860,8 +877,7 @@ jobs:
        run: |
          mkdir build
          cd build
-          cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON
-          cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1)) -t ggml
+          cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=ON
          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}

      - name: Determine tag name
@@ -967,20 +983,19 @@ jobs:
          name: llama-bin-win-sycl-x64.zip

  windows-latest-cmake-hip:
-    if: ${{ github.event.inputs.create_release != 'true' }}
    runs-on: windows-latest

    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3

      - name: Install
        id: depends
        run: |
          $ErrorActionPreference = "Stop"
          write-host "Downloading AMD HIP SDK Installer"
-          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
+          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-23.Q4-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
          write-host "Installing AMD HIP SDK"
          Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
          write-host "Completed AMD HIP SDK installation"
@@ -995,72 +1010,8 @@ jobs:
        run: |
          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
          $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
-          cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_RPC=ON
-          cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
-
-  windows-latest-cmake-hip-release:
-    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-    runs-on: windows-latest
-
-    strategy:
-      matrix:
-        gpu_target: [gfx1100, gfx1101, gfx1030]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Install
-        id: depends
-        run: |
-          $ErrorActionPreference = "Stop"
-          write-host "Downloading AMD HIP SDK Installer"
-          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
-          write-host "Installing AMD HIP SDK"
-          Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
-          write-host "Completed AMD HIP SDK installation"
-
-      - name: Verify ROCm
-        id: verify
-        run: |
-          & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
-
-      - name: Build
-        id: cmake_build
-        run: |
-          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
-          $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
-          cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=${{ matrix.gpu_target }} -DGGML_RPC=ON
-          cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
-          md "build\bin\rocblas\library\"
-          cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
-          cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
-          cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
-
-      - name: Determine tag name
-        id: tag
-        shell: bash
-        run: |
-          BUILD_NUMBER="$(git rev-list --count HEAD)"
-          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
-          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
-            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
-          else
-            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
-            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
-          fi
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        run: |
-          7z a llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\*
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
-          name: llama-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
+          cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DLLAMA_HIPBLAS=ON
+          cmake --build build --config Release

  ios-xcode-build:
    runs-on: macos-latest
@@ -1110,7 +1061,7 @@ jobs:
 #        hypervisor: 'qemu'
 #        run: |
 #            sudo pkg update
-#            sudo pkg install -y gmake automake autoconf pkgconf llvm15 openblas
+#            sudo pkg install -y gmake automake autoconf pkgconf llvm15 clinfo clover opencl clblast openblas
 #            gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j `sysctl -n hw.ncpu`

  release:
@@ -1125,7 +1076,6 @@ jobs:
      - macOS-latest-cmake
      - windows-latest-cmake
      - windows-latest-cmake-cuda
-      - windows-latest-cmake-hip-release
      - macOS-latest-cmake-arm64
      - macOS-latest-cmake-x64

--- a/.github/workflows/code-coverage.yml
+++ b/.github/workflows/code-coverage.yml
@@ -0,0 +1,40 @@
+name: Code Coverage
+on: [push, pull_request]
+
+env:
+  GGML_NLOOP: 3
+  GGML_N_THREADS: 1
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  run:
+    runs-on: ubuntu-20.04
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential gcc-8 lcov
+
+      - name: Build
+        run: CC=gcc-8 make -j LLAMA_CODE_COVERAGE=1 tests
+
+      - name: Run tests
+        run: CC=gcc-8 make test
+
+      - name: Generate coverage report
+        run: |
+          make coverage
+          make lcov-report
+
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v3
+        env:
+           CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
+        with:
+          files: lcov-report/coverage.info
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -10,11 +10,10 @@
 name: Publish Docker image

 on:
-  #pull_request:
+  pull_request:
  push:
    branches:
      - master
-    paths: ['.github/workflows/docker.yml', '.devops/*.Dockerfile', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']

 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@@ -23,7 +22,7 @@ concurrency:
 jobs:
  push_to_registry:
    name: Push Docker image to Docker Hub
-    #if: github.event.pull_request.draft == false
+    if: github.event.pull_request.draft == false

    runs-on: ubuntu-latest
    env:
@@ -31,18 +30,20 @@ jobs:
    strategy:
      matrix:
        config:
-          - { tag: "light", dockerfile: ".devops/llama-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          - { tag: "server", dockerfile: ".devops/llama-server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          - { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          - { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" }
-          - { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "server", dockerfile: ".devops/server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          # NOTE(canardletter): The CUDA builds on arm64 are very slow, so I
+          #                     have disabled them for now until the reason why
+          #                     is understood.
+          - { tag: "light-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platforms: "linux/amd64" }
          - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
-          # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
-          #- { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          #- { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          #- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          - { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" }
-          - { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "server-cuda", dockerfile: ".devops/server-cuda.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          - { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          - { tag: "server-rocm", dockerfile: ".devops/server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          - { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
    steps:
      - name: Check out the repo
        uses: actions/checkout@v4
@@ -96,12 +97,21 @@ jobs:
        env:
          GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'

-      - name: Build and push Docker image (tagged + versioned)
+      - name: Build and push Docker image (versioned)
        if: github.event_name == 'push'
-        uses: docker/build-push-action@v6
+        uses: docker/build-push-action@v4
        with:
          context: .
          push: true
          platforms: ${{ matrix.config.platforms }}
-          tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
+          tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
+          file: ${{ matrix.config.dockerfile }}
+
+      - name: Build and push Docker image (tagged)
+        uses: docker/build-push-action@v4
+        with:
+          context: .
+          push: ${{ github.event_name == 'push' }}
+          platforms: ${{ matrix.config.platforms }}
+          tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
          file: ${{ matrix.config.dockerfile }}
--- a/.github/workflows/python-check-requirements.yml
+++ b/.github/workflows/python-check-requirements.yml
@@ -6,13 +6,15 @@ on:
      - '.github/workflows/python-check-requirements.yml'
      - 'scripts/check-requirements.sh'
      - 'convert*.py'
-      - '**/requirements*.txt'
+      - 'requirements.txt'
+      - 'requirements/*.txt'
  pull_request:
    paths:
      - '.github/workflows/python-check-requirements.yml'
      - 'scripts/check-requirements.sh'
      - 'convert*.py'
-      - '**/requirements*.txt'
+      - 'requirements.txt'
+      - 'requirements/*.txt'

 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
--- a/.github/workflows/python-type-check.yml
+++ b/.github/workflows/python-type-check.yml
@@ -1,38 +0,0 @@
-name: Python Type-Check
-
-on:
-  push:
-    paths:
-      - '.github/workflows/python-type-check.yml'
-      - '**.py'
-      - '**/requirements*.txt'
-  pull_request:
-    paths:
-      - '.github/workflows/python-type-check.yml'
-      - '**.py'
-      - '**/requirements*.txt'
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  python-type-check:
-    runs-on: ubuntu-latest
-    name: pyright type-check
-    steps:
-      - name: Check out source repository
-        uses: actions/checkout@v4
-      - name: Set up Python environment
-        uses: actions/setup-python@v5
-        with:
-          python-version: "3.11"
-      - name: Install Python dependencies
-        # TODO: use a venv
-        run: pip install -r requirements/requirements-all.txt
-      - name: Type-check with Pyright
-        uses: jakebailey/pyright-action@v2
-        with:
-          version: 1.1.370
-          level: warning
-          warnings: true
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -16,15 +16,11 @@ on:
    branches:
      - master
    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
-  pull_request:
+  pull_request_target:
    types: [opened, synchronize, reopened]
    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
-
-env:
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
-  LLAMA_LOG_VERBOSITY: 10
+  schedule:
+    -  cron: '2 4 * * *'

 concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
@@ -36,7 +32,7 @@ jobs:

    strategy:
      matrix:
-        sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
+        sanitizer: [ADDRESS, THREAD, UNDEFINED]
        build_type: [RelWithDebInfo]
        include:
          - build_type: Release
@@ -93,30 +89,16 @@ jobs:
            exit 1
          fi

-      - name: Build (no OpenMP)
-        id: cmake_build_no_openmp
-        if: ${{ matrix.sanitizer == 'THREAD' }}
-        run: |
-          cmake -B build \
-              -DGGML_NATIVE=OFF \
-              -DLLAMA_BUILD_SERVER=ON \
-              -DLLAMA_CURL=ON \
-              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-              -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-              -DGGML_OPENMP=OFF ;
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
-
      - name: Build
        id: cmake_build
-        if: ${{ matrix.sanitizer != 'THREAD' }}
        run: |
          cmake -B build \
-              -DGGML_NATIVE=OFF \
+              -DLLAMA_NATIVE=OFF \
              -DLLAMA_BUILD_SERVER=ON \
              -DLLAMA_CURL=ON \
              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
              -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
+          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target server

      - name: Tests
        id: server_integration_tests
@@ -133,7 +115,7 @@ jobs:


  server-windows:
-    runs-on: windows-2019
+    runs-on: windows-latest

    steps:
      - name: Clone
@@ -156,7 +138,7 @@ jobs:
        id: cmake_build
        run: |
          cmake -B build -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include"
-          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server
+          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target server

      - name: Python setup
        id: setup_python
@@ -179,7 +161,6 @@ jobs:
        if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
        run: |
          cd examples/server/tests
-          $env:PYTHONIOENCODING = ":replace"
          behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp

      - name: Slow tests
--- a/.github/workflows/zig-build.yml
+++ b/.github/workflows/zig-build.yml
@@ -0,0 +1,29 @@
+name: Zig CI
+
+on:
+  pull_request:
+  push:
+    branches:
+      - master
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  build:
+    strategy:
+      fail-fast: false
+      matrix:
+        runs-on: [ubuntu-latest, macos-latest, windows-latest]
+    runs-on: ${{ matrix.runs-on }}
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive
+          fetch-depth: 0
+      - uses: goto-bus-stop/setup-zig@v2
+        with:
+          version: 0.11.0
+      - name: Build Summary
+        run: zig build --summary all -freference-trace
--- a/.gitignore
+++ b/.gitignore
@@ -1,135 +1,126 @@
-# Extensions
-
+*.o
 *.a
-*.bat
-*.bin
-*.dll
-*.dot
-*.etag
-*.exe
-*.gcda
-*.gcno
-*.gcov
+*.so
 *.gguf
 *.gguf.json
-*.lastModified
+*.bin
+*.exe
+*.dll
 *.log
-*.metallib
-*.o
-*.so
+*.gcov
+*.gcno
+*.gcda
+*.dot
+*.bat
 *.tmp
-
-# IDE / OS
-
+*.metallib
+*.etag
+*.lastModified
+.DS_Store
+.build/
 .cache/
 .ccls-cache/
 .direnv/
-.DS_Store
 .envrc
-.idea/
 .swiftpm
+.venv
+.clang-tidy
 .vs/
 .vscode/
-nppBackup
+.idea/

-
-# Coverage
-
-gcovr-report/
-lcov-report/
-
-# Build Artifacts
-
-tags
-.build/
-build*
-!build-info.cmake
-!build-info.cpp.in
-!build-info.sh
-!build.zig
-!docs/build.md
-/libllama.so
-/llama-*
-/vulkan-shaders-gen
-android-ndk-*
-arm_neon.h
-cmake-build-*
-CMakeSettings.json
-compile_commands.json
 ggml-metal-embed.metal
-llama-batched-swift
-/rpc-server
+
+lcov-report/
+gcovr-report/
+
+build*
+!build.zig
+cmake-build-*
 out/
 tmp/
-autogen-*.md
-
-# Deprecated
-
-/main
-/server
-
-# CI
-
-!.github/workflows/*.yml
-
-# Models

 models/*
 models-mnt
-!models/.editorconfig
-!models/ggml-vocab-*.gguf*

-# Zig
+/Pipfile
+/baby-llama
+/beam-search
+/benchmark-matmult
+/convert-llama2c-to-ggml
+/embd-input-test
+/embedding
+/eval-callback
+/gguf
+/gguf-llama-simple
+/gguf-split
+/gritlm
+/imatrix
+/infill
+/libllama.so
+/llama-bench
+/llava-cli
+/lookahead
+/lookup
+/lookup-create
+/lookup-merge
+/lookup-stats
+/main
+/metal
+/passkey
+/perplexity
+/q8dot
+/quantize
+/quantize-stats
+/result
+/save-load-state
+/server
+/simple
+/batched
+/batched-bench
+/export-lora
+/finetune
+/retrieval
+/speculative
+/parallel
+/train-text-from-scratch
+/tokenize
+/vdot
+/common/build-info.cpp
+arm_neon.h
+compile_commands.json
+CMakeSettings.json
+
+__pycache__
+dist
+
 zig-out/
 zig-cache/

-# Logs
-
 ppl-*.txt
 qnt-*.txt
 perf-*.txt

-# Examples
-
 examples/jeopardy/results.txt
-examples/server/*.css.hpp
 examples/server/*.html.hpp
 examples/server/*.js.hpp
 examples/server/*.mjs.hpp
-!build_64.sh
-!examples/*.bat
-!examples/*/*.kts
-!examples/*/*/*.kts
-!examples/sycl/*.bat
-!examples/sycl/*.sh

-# Python
-
-/.venv
-__pycache__/
-*/poetry.lock
+poetry.lock
 poetry.toml
-
-# Nix
-/result
+nppBackup

 # Test binaries
-/tests/test-backend-ops
-/tests/test-double-float
-/tests/test-grad0
 /tests/test-grammar-parser
 /tests/test-llama-grammar
+/tests/test-double-float
+/tests/test-grad0
 /tests/test-opt
 /tests/test-quantize-fns
 /tests/test-quantize-perf
-/tests/test-rope
 /tests/test-sampling
 /tests/test-tokenizer-0
-/tests/test-tokenizer-1-bpe
 /tests/test-tokenizer-1-spm
-
-# Scripts
-!/scripts/install-oneapi.bat
-
-# Test models for lora adapters
-/lora-tests
+/tests/test-tokenizer-1-bpe
+/tests/test-rope
+/tests/test-backend-ops
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,3 @@
 [submodule "kompute"]
-	path = ggml/src/kompute
+	path = kompute
 	url = https://github.com/nomic-ai/kompute.git
--- a/129
+++ b/129
@@ -1,9 +1,8 @@
-# date: Wed Jun 26 19:36:34 EEST 2024
+# date: Tue Apr  9 09:17:14 EEST 2024
 # this file is auto-generated by scripts/gen-authors.sh

 0cc4m <picard12@live.de>
 0xspringtime <110655352+0xspringtime@users.noreply.github.com>
-20kdc <asdd2808@gmail.com>
 2f38b454 <dxf@protonmail.com>
 3ooabkhxtn <31479382+3ooabkhxtn@users.noreply.github.com>
 44670 <44670@users.noreply.github.com>
@@ -12,18 +11,14 @@ AT <manyoso@users.noreply.github.com>
 Aarni Koskela <akx@iki.fi>
 Aaron Miller <apage43@ninjawhale.com>
 Aaryaman Vasishta <aaryaman.vasishta@amd.com>
-Abheek Gulati <abheekg@hotmail.com>
 Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
 Abhishek Gopinath K <31348521+overtunned@users.noreply.github.com>
 Adithya Balaji <adithya.b94@gmail.com>
 AdithyanI <adithyan.i4internet@gmail.com>
 Adrian <smith.adriane@gmail.com>
 Adrian Hesketh <a-h@users.noreply.github.com>
-Ahmet Zeer <ahmed.zeer@std.yildiz.edu.tr>
 AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com>
 Aisuko <urakiny@gmail.com>
-Akarshan Biswas <akarshanbiswas@fedoraproject.org>
-Albert Jin <albert.jin@gmail.com>
 Alberto <57916483+albbus-stack@users.noreply.github.com>
 Alex <awhill19@icloud.com>
 Alex Azarov <alex@azarov.by>
@@ -40,24 +35,19 @@ Ali Nehzat <ali.nehzat@thanks.dev>
 Ali Tariq <ali.tariq@10xengineers.ai>
 Alon <alonfaraj@gmail.com>
 AlpinDale <52078762+AlpinDale@users.noreply.github.com>
-Amir <amir_zia@outlook.com>
 AmirAli Mirian <37371367+amiralimi@users.noreply.github.com>
 Ananta Bastola <anantarajbastola@gmail.com>
 Anas Ahouzi <112881240+aahouzi@users.noreply.github.com>
 András Salamon <ott2@users.noreply.github.com>
 Andrei <abetlen@gmail.com>
 Andrew Canis <andrew.canis@gmail.com>
-Andrew Downing <andrew2085@gmail.com>
 Andrew Duffy <a10y@users.noreply.github.com>
 Andrew Godfrey <AndrewGodfrey@users.noreply.github.com>
-Andy Tai <andy-tai@users.noreply.github.com>
 Arik Poznanski <arikpoz@users.noreply.github.com>
 Artem <guinmoon@gmail.com>
-Artem Zinnatullin <ceo@abstractny.gay>
 Artyom Lebedev <vagran.ast@gmail.com>
 Asbjørn Olling <asbjornolling@gmail.com>
 Ásgeir Bjarni Ingvarsson <asgeir@fundinn.org>
-Ashish <1856117+ashishdatta@users.noreply.github.com>
 Ashok Gelal <401055+ashokgelal@users.noreply.github.com>
 Ashraful Islam <ashraful.meche@gmail.com>
 Atsushi Tatsuma <yoshoku@outlook.com>
@@ -67,46 +57,35 @@ BADR <contact@pythops.com>
 Bach Le <bach@bullno1.com>
 Bailey Chittle <39804642+bachittle@users.noreply.github.com>
 BarfingLemurs <128182951+BarfingLemurs@users.noreply.github.com>
-Bartowski <ckealty1182@gmail.com>
 Behnam M <58621210+ibehnam@users.noreply.github.com>
-Ben Ashbaugh <ben.ashbaugh@intel.com>
 Ben Garney <bengarney@users.noreply.github.com>
 Ben Siraphob <bensiraphob@gmail.com>
 Ben Williams <ben@719ben.com>
-Benjamin Findley <39356821+Kartoffelsaft@users.noreply.github.com>
 Benjamin Lecaillon <84293038+blecaillon@users.noreply.github.com>
 Bernat Vadell <hounter.caza@gmail.com>
-Bingan <70050083+binganao@users.noreply.github.com>
 Bodo Graumann <mail@bodograumann.de>
 Bono Lv <lvscar@users.noreply.github.com>
 Borislav Stanimirov <b.stanimirov@abv.bg>
 Branden Butler <bwtbutler@hotmail.com>
 Brian <mofosyne@gmail.com>
 Bruce MacDonald <brucewmacdonald@gmail.com>
-Bryan Honof <bryanhonof@gmail.com>
 CJ Pais <cj@cjpais.com>
 CRD716 <crd716@gmail.com>
-Calvin Laurenson <calvin@laurenson.dev>
 Cameron <csteele@steelecameron.com>
 Cameron Kaiser <classilla@users.noreply.github.com>
-Carolinabanana <140120812+Carolinabanana@users.noreply.github.com>
 Casey Primozic <casey@cprimozic.net>
 Casey Primozic <me@ameo.link>
 CausalLM <148736309+CausalLM@users.noreply.github.com>
 Cebtenzzre <cebtenzzre@gmail.com>
 Chad Brewbaker <crb002@gmail.com>
-Chao Jiang <jc19chaoj@zoho.com>
 Cheng Shao <terrorjack@type.dance>
-Chris Elrod <elrodc@gmail.com>
 Chris Kuehl <ckuehl@ckuehl.me>
 Christian Demsar <christian@github.email.demsar.us>
 Christian Demsar <crasm@git.vczf.us>
 Christian Falch <875252+chrfalch@users.noreply.github.com>
 Christian Kögler <ck3d@gmx.de>
-Christian Zhou-Zheng <59622928+christianazinn@users.noreply.github.com>
 Clark Saben <76020733+csaben@users.noreply.github.com>
 Clint Herron <hanclinto@gmail.com>
-CrispStrobe <154636388+CrispStrobe@users.noreply.github.com>
 Cuong Trinh Manh <nguoithichkhampha@gmail.com>
 DAN™ <dranger003@gmail.com>
 Damian Stewart <d@damianstewart.com>
@@ -116,12 +95,8 @@ Daniel Bevenius <daniel.bevenius@gmail.com>
 Daniel Drake <drake@endlessos.org>
 Daniel Hiltgen <dhiltgen@users.noreply.github.com>
 Daniel Illescas Romero <illescas.daniel@protonmail.com>
-Daniele <57776841+daniandtheweb@users.noreply.github.com>
 DannyDaemonic <DannyDaemonic@gmail.com>
 Dat Quoc Nguyen <2412555+datquocnguyen@users.noreply.github.com>
-Dave <dave-fl@users.noreply.github.com>
-Dave Airlie <airlied@gmail.com>
-Dave Airlie <airlied@redhat.com>
 Dave Della Costa <ddellacosta+github@gmail.com>
 David Friehs <david@friehs.info>
 David Kennedy <dakennedyd@gmail.com>
@@ -129,13 +104,10 @@ David Pflug <david@pflug.email>
 David Renshaw <dwrenshaw@gmail.com>
 David Sommers <12738+databyte@users.noreply.github.com>
 David Yang <davidyang6us@gmail.com>
-Dawid Potocki <github@dawidpotocki.com>
 Dawid Wysocki <62249621+TortillaZHawaii@users.noreply.github.com>
 Dean <Dean.Sinaean@gmail.com>
 Deins <deinsegle@gmail.com>
-Deven Mistry <31466137+deven367@users.noreply.github.com>
 Didzis Gosko <didzis@users.noreply.github.com>
-Djip007 <djip.perois@free.fr>
 Don Mahurin <dmahurin@users.noreply.github.com>
 DooWoong Lee (David) <manics99@naver.com>
 Doomsdayrs <38189170+Doomsdayrs@users.noreply.github.com>
@@ -144,11 +116,8 @@ Dr. Tom Murphy VII Ph.D <499244+tom7@users.noreply.github.com>
 Ebey Abraham <ebey97@gmail.com>
 Ed Lee <edilee@mozilla.com>
 Ed Lepedus <ed.lepedus@googlemail.com>
-Eddie-Wang <wangjinheng1120@163.com>
 Edward Taylor <edeetee@gmail.com>
-Elaine <elaine.zosa@gmail.com>
 Elbios <141279586+Elbios@users.noreply.github.com>
-Elton Kola <eltonkola@gmail.com>
 Engininja2 <139037756+Engininja2@users.noreply.github.com>
 Equim <sayaka@ekyu.moe>
 Eric Sommerlade <es0m@users.noreply.github.com>
@@ -174,47 +143,37 @@ Firat <firatkiral@gmail.com>
 Folko-Ven <71110216+Folko-Ven@users.noreply.github.com>
 Foul-Tarnished <107711110+Foul-Tarnished@users.noreply.github.com>
 Francisco Melo <43780565+francis2tm@users.noreply.github.com>
-Frank Mai <thxcode0824@gmail.com>
 FrankHB <frankhb1989@gmail.com>
-Fred Douglas <43351173+fredlas@users.noreply.github.com>
 Frederik Vogel <Schaltfehler@users.noreply.github.com>
 Gabe Goodhart <gabe.l.hart@gmail.com>
 GainLee <perfecter.gen@gmail.com>
 Galunid <karolek1231456@gmail.com>
 Gary Linscott <glinscott@gmail.com>
 Gary Mulder <gjmulder@gmail.com>
-Gavin Zhao <gavinzhaojw@protonmail.com>
 Genkagaku.GPT <hlhr202@163.com>
 Georgi Gerganov <ggerganov@gmail.com>
 Gilad S <giladgd@users.noreply.github.com>
-Giuseppe Scrivano <giuseppe@scrivano.org>
 GiviMAD <GiviMAD@users.noreply.github.com>
 Govlzkoy <gotope@users.noreply.github.com>
 Guillaume "Vermeille" Sanchez <Guillaume.V.Sanchez@gmail.com>
 Guillaume Wenzek <gwenzek@users.noreply.github.com>
 Guoteng <32697156+SolenoidWGT@users.noreply.github.com>
 Gustavo Rocha Dias <91472747+gustrd@users.noreply.github.com>
-Haggai Nuchi <h.nuchi@gmail.com>
 Halalaluyafail3 <55773281+Halalaluyafail3@users.noreply.github.com>
-Hamdoud Hakem <90524568+hamdoudhakem@users.noreply.github.com>
-HanishKVC <hanishkvc@gmail.com>
 Haohui Mai <ricetons@gmail.com>
 Haoxiang Fei <tonyfettes@tonyfettes.com>
 Harald Fernengel <harald.fernengel@here.com>
 Hatsune Miku <129688334+at8u@users.noreply.github.com>
-HatsuneMikuUwU33 <173229399+HatsuneMikuUwU33@users.noreply.github.com>
 Henk Poley <HenkPoley@gmail.com>
 Henri Vasserman <henv@hot.ee>
 Henrik Forstén <henrik.forsten@gmail.com>
 Herman Semenov <GermanAizek@yandex.ru>
 Hesen Peng <hesen.peng@gmail.com>
 Hoang Nguyen <hugo53@users.noreply.github.com>
-Hong Bo PENG <penghb@cn.ibm.com>
 Hongyu Ouyang <96765450+casavaca@users.noreply.github.com>
 Howard Su <howard0su@gmail.com>
 Hua Jiang <allenhjiang@outlook.com>
 Huawei Lin <huaweilin.cs@gmail.com>
-Hugo Roussel <hugo.rous@gmail.com>
 Ian Bull <irbull@eclipsesource.com>
 Ian Bull <irbull@gmail.com>
 Ian Scrivener <github@zilogy.asia>
@@ -231,10 +190,8 @@ Ivan Stepanov <ivanstepanovftw@gmail.com>
 JH23X <165871467+JH23X@users.noreply.github.com>
 Jack Mousseau <jmousseau@users.noreply.github.com>
 JackJollimore <130917767+JackJollimore@users.noreply.github.com>
-Jaemin Son <woalsdnd@gmail.com>
 Jag Chadha <jagtesh@gmail.com>
 Jakub N <jakubniemczyk97@gmail.com>
-James A Capozzoli <157492257+jac-jim@users.noreply.github.com>
 James Reynolds <magnusviri@users.noreply.github.com>
 Jan Boon <jan.boon@kaetemi.be>
 Jan Boon <kaetemi@gmail.com>
@@ -248,17 +205,12 @@ Jean-Michaël Celerier <jeanmichael.celerier+github@gmail.com>
 Jed Fox <git@jedfox.com>
 Jeffrey Quesnelle <emozilla@nousresearch.com>
 Jesse Jojo Johnson <williamsaintgeorge@gmail.com>
-Jeximo <jeximo@gmail.com>
 Jhen-Jie Hong <iainst0409@gmail.com>
 Jiahao Li <liplus17@163.com>
 Jian Liao <jianliao@users.noreply.github.com>
 JidongZhang-THU <1119708529@qq.com>
 Jinwoo Jeong <33892306+williamjeong2@users.noreply.github.com>
 Jiří Podivín <66251151+jpodivin@users.noreply.github.com>
-Jiří Sejkora <Sejseloid@gmail.com>
-Joan Fontanals <jfontanalsmartinez@gmail.com>
-Joan Fontanals <joan.fontanals.martinez@jina.ai>
-Johan <JohanAR@users.noreply.github.com>
 Johannes Gäßler <johannesg@5d6.de>
 Johannes Rudolph <johannes.rudolph@gmail.com>
 John <78893154+cmp-nct@users.noreply.github.com>
@@ -269,19 +221,15 @@ Jonas Wunderlich <32615971+jonas-w@users.noreply.github.com>
 Jorge A <161275481+jorgealias@users.noreply.github.com>
 Jose Maldonado <63384398+yukiteruamano@users.noreply.github.com>
 Joseph Stahl <1269177+josephst@users.noreply.github.com>
-Josh Ramer <josh.ramer@icloud.com>
 Joyce <joycebrum@google.com>
 Juan Calderon-Perez <835733+gaby@users.noreply.github.com>
 Judd <foldl@users.noreply.github.com>
 Julius Arkenberg <arki05@users.noreply.github.com>
 Jun Jie <71215065+junnjiee16@users.noreply.github.com>
-Junyang Lin <justinlin930319@hotmail.com>
 Juraj Bednar <juraj@bednar.io>
 Justin Parker <jparkerweb@gmail.com>
 Justin Suess <justin.suess@westpoint.edu>
-Justina Cho <justcho5@gmail.com>
 Justine Tunney <jtunney@gmail.com>
-Justine Tunney <jtunney@mozilla.com>
 Juuso Alasuutari <juuso.alasuutari@gmail.com>
 KASR <karim.asrih@gmail.com>
 Kamil Tomšík <info@tomsik.cz>
@@ -294,7 +242,6 @@ Kawrakow <48489457+ikawrakow@users.noreply.github.com>
 Keiichi Tabata <keiichi.tabata@outlook.com>
 Kenvix ⭐ <kenvixzure@live.com>
 Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
-Kevin Gibbons <bakkot@gmail.com>
 Kevin Ji <1146876+kevinji@users.noreply.github.com>
 Kevin Kwok <antimatter15@gmail.com>
 Kevin Lo <kevlo@kevlo.org>
@@ -310,7 +257,6 @@ Laura <Tijntje_7@msn.com>
 Lee <44310445+lx200916@users.noreply.github.com>
 Lee Drake <b.lee.drake@gmail.com>
 Leng Yue <lengyue@lengyue.me>
-Leon Knauer <git@leonknauer.com>
 LeonEricsson <70749762+LeonEricsson@users.noreply.github.com>
 Leonardo Neumann <leonardo@neumann.dev.br>
 Li Tan <tanliboy@gmail.com>
@@ -319,26 +265,20 @@ LoganDark <github@logandark.mozmail.com>
 LostRuins <39025047+LostRuins@users.noreply.github.com>
 Luciano <lucianostrika44@gmail.com>
 Luo Tian <lt@basecity.com>
-Lyle Dean <dean@lyle.dev>
 M. Yusuf Sarıgöz <yusufsarigoz@gmail.com>
 Maarten ter Huurne <maarten@treewalker.org>
 Mack Straight <eiz@users.noreply.github.com>
 Maël Kerbiriou <m431.kerbiriou@gmail.com>
 MaggotHATE <clay1326@gmail.com>
-Manuel <44313466+makuche@users.noreply.github.com>
 Marc Köhlbrugge <subscriptions@marckohlbrugge.com>
 Marco Matthies <71844+marcom@users.noreply.github.com>
 Marcus Dunn <51931484+MarcusDunn@users.noreply.github.com>
 Marian Cepok <marian.cepok@gmail.com>
 Mark Fairbairn <thebaron88@gmail.com>
 Marko Tasic <mtasic85@gmail.com>
-Markus Tavenrath <mtavenrath@users.noreply.github.com>
-Martin Delille <martin@delille.org>
 Martin Krasser <krasserm@googlemail.com>
 Martin Schwaighofer <mschwaig@users.noreply.github.com>
 Marvin Gießing <marvin.giessing@gmail.com>
-Masaya, Kato <62578291+msy-kato@users.noreply.github.com>
-MasterYi1024 <39848311+MasterYi1024@users.noreply.github.com>
 Mateusz Charytoniuk <mateusz.charytoniuk@protonmail.com>
 Matheus C. França <matheus-catarino@hotmail.com>
 Matheus Gabriel Alves Silva <matheusgasource@gmail.com>
@@ -347,11 +287,8 @@ Mathijs de Bruin <mathijs@mathijsfietst.nl>
 Matt Clayton <156335168+mattjcly@users.noreply.github.com>
 Matt Pulver <matt.pulver@heavy.ai>
 Matteo Boschini <12133566+mbosc@users.noreply.github.com>
-Mattheus Chediak <shammcity00@gmail.com>
 Matthew Tejo <matthew.tejo@gmail.com>
 Matvey Soloviev <blackhole89@gmail.com>
-Max Krasnyansky <max.krasnyansky@gmail.com>
-Max Krasnyansky <quic_maxk@quicinc.com>
 Maxime <672982+maximegmd@users.noreply.github.com>
 Maximilian Winter <maximilian.winter.91@gmail.com>
 Meng Zhang <meng@tabbyml.com>
@@ -363,41 +300,32 @@ Michael Kesper <mkesper@schokokeks.org>
 Michael Klimenko <mklimenko29@gmail.com>
 Michael Podvitskiy <podvitskiymichael@gmail.com>
 Michael Potter <NanoTekGuy@Gmail.com>
-Michael de Gans <michael.john.degans@gmail.com>
 Michaël de Vries <vriesdemichael@gmail.com>
 Mihai <mihai.chirculescu@yahoo.com>
 Mike <ytianhui2004@gmail.com>
-Mikko Juola <mikjuo@gmail.com>
 Minsoo Cheong <54794500+mscheong01@users.noreply.github.com>
 Mirko185 <mirkosig@gmail.com>
 Mirror Azure <54669636+MirrorAzure@users.noreply.github.com>
 Miwa / Ensan <63481257+ensan-hcl@users.noreply.github.com>
 Mohammadreza Hendiani <hendiani.mohammadreza@gmail.com>
-Mohammadreza Hendiani <mohammad.r.hendiani@gmail.com>
 Murilo Santana <mvrilo@gmail.com>
 Musab Gultekin <musabgultekin@users.noreply.github.com>
 Nam D. Tran <42194884+namtranase@users.noreply.github.com>
-Nathan Epstein <nate2@umbc.edu>
 NawafAlansari <72708095+NawafAlansari@users.noreply.github.com>
 Nebula <infinitewormhole@gmail.com>
-Neo Zhang <14088817+arthw@users.noreply.github.com>
-Neo Zhang <zhang.jianyu@outlook.com>
 Neo Zhang Jianyu <jianyu.zhang@intel.com>
 Neuman Vong <neuman.vong@gmail.com>
 Nexesenex <124105151+Nexesenex@users.noreply.github.com>
 Niall Coates <1349685+Niall-@users.noreply.github.com>
 Nicolai Weitkemper <kontakt@nicolaiweitkemper.de>
-Nicolás Pérez <nicolas_perez@brown.edu>
 Nigel Bosch <pnigelb@gmail.com>
 Niklas Korz <niklas@niklaskorz.de>
-Nikolas <127742645+nneubacher@users.noreply.github.com>
 Nindaleth <Nindaleth@users.noreply.github.com>
 Oleksandr Nikitin <oleksandr@tvori.info>
 Oleksii Maryshchenko <oleksii.maryshchenko@gmail.com>
 Olivier Chafik <ochafik@users.noreply.github.com>
 Ondřej Čertík <ondrej@certik.us>
 Ouadie EL FAROUKI <ouadie.elfarouki@codeplay.com>
-Patrice Ferlet <metal3d@gmail.com>
 Paul Tsochantaris <ptsochantaris@icloud.com>
 Pavol Rusnak <pavol@rusnak.io>
 Pedro Cuenca <pedro@huggingface.co>
@@ -415,14 +343,9 @@ RJ Adriaansen <adriaansen@eshcc.eur.nl>
 Radoslav Gerganov <rgerganov@gmail.com>
 Radosław Gryta <radek.gryta@gmail.com>
 Rahul Vivek Nair <68507071+RahulVivekNair@users.noreply.github.com>
-Raj Hammeer Singh Hada <hammeerraj@gmail.com>
-Ralph Soika <ralph.soika@imixs.com>
 Rand Xie <randxiexyy29@gmail.com>
 Randall Fitzgerald <randall@dasaku.net>
 Reinforce-II <fate@eastal.com>
-Ren Xuancheng <jklj077@users.noreply.github.com>
-Rene Leonhardt <65483435+reneleonhardt@users.noreply.github.com>
-RhinoDevel <RhinoDevel@users.noreply.github.com>
 Riceball LEE <snowyu.lee@gmail.com>
 Richard Kiss <him@richardkiss.com>
 Richard Roberson <richardr1126@gmail.com>
@@ -450,7 +373,6 @@ Rowan Hart <rowanbhart@gmail.com>
 Rune <43761327+Rune-AI@users.noreply.github.com>
 Ryan Landay <rlanday@gmail.com>
 Ryder Wishart <ryderwishart@gmail.com>
-Ryuei <louixs@users.noreply.github.com>
 Rőczey Barnabás <31726601+An0nie@users.noreply.github.com>
 SakuraUmi <yukinon244@gmail.com>
 Salvador E. Tropea <stropea@inti.gob.ar>
@@ -464,7 +386,6 @@ SebastianApel <13675545+SebastianApel@users.noreply.github.com>
 Senemu <10880819+Senemu@users.noreply.github.com>
 Sergey Alirzaev <zl29ah@gmail.com>
 Sergio López <slp@sinrega.org>
-Sertaç Özercan <852750+sozercan@users.noreply.github.com>
 SeungWon Jeong <65549245+redlion0929@users.noreply.github.com>
 ShadovvBeast <ShadovvBeast@gmail.com>
 Shakhar Dasgupta <shakhardasgupta@gmail.com>
@@ -473,7 +394,6 @@ Shijie <821898965@qq.com>
 Shintarou Okada <kokuzen@gmail.com>
 Shouzheng Liu <61452103+lshzh-ww@users.noreply.github.com>
 Shouzheng Liu <lshzh.hi@gmail.com>
-Shuichi Tsutsumi <shuichi0526@gmail.com>
 Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
 Simon Willison <swillison@gmail.com>
 Siwen Yu <yusiwen@gmail.com>
@@ -485,14 +405,11 @@ Someone <sergei.kozlukov@aalto.fi>
 Someone Serge <sergei.kozlukov@aalto.fi>
 Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>
 Spencer Sutton <spencersutton@users.noreply.github.com>
-Srihari-mcw <96763064+Srihari-mcw@users.noreply.github.com>
 Srinivas Billa <nivibilla@gmail.com>
 Stefan Sydow <stefan@sydow.email>
-Steffen Röcker <sroecker@gmail.com>
 Stephan Walter <stephan@walter.name>
 Stephen Nichols <snichols@users.noreply.github.com>
 Steve Grubb <ausearch.1@gmail.com>
-Steven Prichard <spprichard20@gmail.com>
 Steven Roussey <sroussey@gmail.com>
 Steward Garcia <57494570+FSSRepo@users.noreply.github.com>
 Suaj Carrot <72162667+SuajCarrot@users.noreply.github.com>
@@ -517,19 +434,16 @@ Tom C <tom.corelis@gmail.com>
 Tom Jobbins <784313+TheBloke@users.noreply.github.com>
 Tomas <tom.tomas.36478119@gmail.com>
 Tomáš Pazdiora <tomas.pazdiora@gmail.com>
-Tristan Druyen <tristan@vault81.mozmail.com>
 Tristan Ross <rosscomputerguy@protonmail.com>
 Tungsten842 <886724vf@anonaddy.me>
 Tungsten842 <quantmint@protonmail.com>
 Tushar <ditsuke@protonmail.com>
 UEXTM.com <84163508+uextm@users.noreply.github.com>
-Ulrich Drepper <drepper@gmail.com>
 Uzo Nweke <uzoechi@gmail.com>
 Vaibhav Srivastav <vaibhavs10@gmail.com>
 Val Kharitonov <mail@kharvd.com>
 Valentin Konovalov <valle.ketsujin@gmail.com>
 Valentyn Bezshapkin <61702053+valentynbez@users.noreply.github.com>
-Victor Nogueira <felladrin@gmail.com>
 Victor Z. Peng <ziliangdotme@gmail.com>
 Vlad <spitfireage@gmail.com>
 Vladimir <bogdad@gmail.com>
@@ -541,9 +455,7 @@ Weird Constructor <weirdconstructor@gmail.com>
 Welby Seely <welbyseely@gmail.com>
 Wentai Zhang <rchardx@gmail.com>
 WillCorticesAI <150854901+WillCorticesAI@users.noreply.github.com>
-William Tambellini <william.tambellini@gmail.com>
 Willy Tarreau <w@1wt.eu>
-Wouter <9594229+DifferentialityDevelopment@users.noreply.github.com>
 Wu Jian Ping <wujjpp@hotmail.com>
 Wu Jian Ping <wujp@greatld.com>
 Xiake Sun <xiake.sun@intel.com>
@@ -554,8 +466,6 @@ Xiaoyi Chen <cxychina@gmail.com>
 Xingchen Song(宋星辰) <xingchensong1996@163.com>
 Xuan Son Nguyen <thichthat@gmail.com>
 Yann Follet <131855179+YannFollet@users.noreply.github.com>
-Yaroslav <yaroslav.yashin@me.com>
-Yazan Agha-Schrader <mountaiin@icloud.com>
 Yiming Cui <conandiy@vip.qq.com>
 Yishuo Wang <MeouSker77@outlook.com>
 Yueh-Po Peng <94939112+y10ab1@users.noreply.github.com>
@@ -567,7 +477,6 @@ Zane Shannon <z@zcs.me>
 Zay <95888118+isaiahbjork@users.noreply.github.com>
 Zenix <zenixls2@gmail.com>
 Zhang Peiyuan <a1286225768@gmail.com>
-Zheng.Deng <32841220+dengzheng-cloud@users.noreply.github.com>
 ZhouYuChen <zhouyuchen@naver.com>
 Ziad Ben Hadj-Alouane <zied.benhadjalouane@gmail.com>
 Ziang Wu <97337387+ZiangWu-77@users.noreply.github.com>
@@ -575,18 +484,14 @@ Zsapi <martin1.zsapka@gmail.com>
 a-n-n-a-l-e-e <150648636+a-n-n-a-l-e-e@users.noreply.github.com>
 adel boussaken <netdur@gmail.com>
 afrideva <95653597+afrideva@users.noreply.github.com>
-agray3 <agray3@users.noreply.github.com>
 akawrykow <142945436+akawrykow@users.noreply.github.com>
 alexpinel <93524949+alexpinel@users.noreply.github.com>
 alonfaraj <alonfaraj@gmail.com>
-alwqx <kenan3015@gmail.com>
-amd-lalithnc <lalithnc@amd.com>
 andrijdavid <david@geek.mg>
 anon998 <131767832+anon998@users.noreply.github.com>
 anzz1 <anzz1@live.com>
 apaz <aarpazdera@gmail.com>
 apcameron <37645737+apcameron@users.noreply.github.com>
-arch-btw <57669023+arch-btw@users.noreply.github.com>
 arcrank <arcrank@gmail.com>
 arlo-phoenix <140345165+arlo-phoenix@users.noreply.github.com>
 at8u <129688334+at8u@users.noreply.github.com>
@@ -609,17 +514,13 @@ cocktailpeanut <121128867+cocktailpeanut@users.noreply.github.com>
 coezbek <c.oezbek@gmail.com>
 comex <comexk@gmail.com>
 compilade <113953597+compilade@users.noreply.github.com>
-compilade <git@compilade.net>
-cpumaxx <163466046+cpumaxx@users.noreply.github.com>
 crasm <crasm@git.vczf.net>
 crasm <crasm@git.vczf.us>
 daboe01 <daboe01@googlemail.com>
 david raistrick <keen99@users.noreply.github.com>
-ddh0 <dylanhalladay02@icloud.com>
 ddpasa <112642920+ddpasa@users.noreply.github.com>
 deepdiffuser <112834445+deepdiffuser@users.noreply.github.com>
 divinity76 <divinity76@gmail.com>
-dm4 <sunrisedm4@gmail.com>
 dotpy314 <33351922+dotpy314@users.noreply.github.com>
 drbh <david.richard.holtz@gmail.com>
 ds5t5 <145942675+ds5t5@users.noreply.github.com>
@@ -628,7 +529,6 @@ eastriver <lee@eastriver.dev>
 ebraminio <ebraminio@gmail.com>
 eiery <19350831+eiery@users.noreply.github.com>
 eric8607242 <e0928021388@gmail.com>
-fairydreaming <166155368+fairydreaming@users.noreply.github.com>
 fraxy-v <65565042+fraxy-v@users.noreply.github.com>
 github-actions[bot] <github-actions[bot]@users.noreply.github.com>
 gliptic <gliptic@users.noreply.github.com>
@@ -639,7 +539,6 @@ h-h-h-h <13482553+h-h-h-h@users.noreply.github.com>
 hankcs <cnhankmc@gmail.com>
 hoangmit <hoangmit@users.noreply.github.com>
 hongbo.mo <352280764@qq.com>
-hopkins385 <98618192+hopkins385@users.noreply.github.com>
 howlger <eclipse@voormann.de>
 howlger <github@voormann.de>
 hutli <6594598+hutli@users.noreply.github.com>
@@ -650,22 +549,14 @@ hydai <z54981220@gmail.com>
 iSma <ismail.senhaji@gmail.com>
 iacore <74560659+iacore@users.noreply.github.com>
 igarnier <igarnier@protonmail.com>
-intelmatt <61025942+intelmatt@users.noreply.github.com>
 iohub <rickyang.pro@gmail.com>
 jacobi petrucciani <8117202+jpetrucciani@users.noreply.github.com>
-jaime-m-p <167997752+jaime-m-p@users.noreply.github.com>
 jameswu2014 <545426914@qq.com>
-jiez <373447296@qq.com>
 jneem <joeneeman@gmail.com>
-joecryptotoo <80373433+joecryptotoo@users.noreply.github.com>
 johnson442 <56517414+johnson442@users.noreply.github.com>
-jojorne <jojorne@users.noreply.github.com>
 jon-chuang <9093549+jon-chuang@users.noreply.github.com>
 jp-x-g <jpxg-dev@protonmail.com>
-jukofyork <69222624+jukofyork@users.noreply.github.com>
-junchao-loongson <68935141+junchao-loongson@users.noreply.github.com>
 jwj7140 <32943891+jwj7140@users.noreply.github.com>
-k.h.lai <adrian.k.h.lai@outlook.com>
 kaizau <kaizau@users.noreply.github.com>
 kalomaze <66376113+kalomaze@users.noreply.github.com>
 kang <tpdns9032100@gmail.com>
@@ -684,15 +575,11 @@ ldwang <ftgreat@163.com>
 le.chang <cljs118@126.com>
 leejet <leejet714@gmail.com>
 limitedAtonement <limitedAtonement@users.noreply.github.com>
-liuwei-git <14815172+liuwei-git@users.noreply.github.com>
 lon <114724657+longregen@users.noreply.github.com>
-loonerin <132926317+loonerin@users.noreply.github.com>
-luoyu-intel <yu.luo@intel.com>
 m3ndax <adrian.goessl@outlook.com>
 maddes8cht <55592906+maddes8cht@users.noreply.github.com>
 makomk <makosoft@googlemail.com>
 manikbhandari <mbbhandarimanik2@gmail.com>
-maor-ps <154728172+maor-ps@users.noreply.github.com>
 mdrokz <mohammadmunshi@gmail.com>
 mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com>
 minarchist <minarchist@users.noreply.github.com>
@@ -706,19 +593,15 @@ ngc92 <7938269+ngc92@users.noreply.github.com>
 nhamanasu <45545786+nhamanasu@users.noreply.github.com>
 niansa/tuxifan <anton-sa@web.de>
 niansa/tuxifan <tuxifan@posteo.de>
-nickp27 <nb.porter@gmail.com>
 ningshanwutuobang <ningshanwutuobang@gmail.com>
 nold <Nold360@users.noreply.github.com>
 nopperl <54780682+nopperl@users.noreply.github.com>
 nusu-github <29514220+nusu-github@users.noreply.github.com>
 olexiyb <olexiyb@gmail.com>
-omahs <73983677+omahs@users.noreply.github.com>
 oobabooga <112222186+oobabooga@users.noreply.github.com>
 opparco <parco.opaai@gmail.com>
 ostix360 <55257054+ostix360@users.noreply.github.com>
-pengxin99 <pengxin.yuan@intel.com>
 perserk <perserk@gmail.com>
-pmysl <piotr.myslinski@outlook.com>
 postmasters <namnguyen@google.com>
 pudepiedj <pudepiedj@gmail.com>
 qingfengfenga <41416092+qingfengfenga@users.noreply.github.com>
@@ -731,19 +614,16 @@ rhuddleston <ryan.huddleston@percona.com>
 rimoliga <53384203+rimoliga@users.noreply.github.com>
 runfuture <runfuture@users.noreply.github.com>
 sandyiscool <sandyiscool@gmail.com>
-sasha0552 <admin@sasha0552.org>
 semidark <me@semidark.net>
 sharpHL <132747147+sharpHL@users.noreply.github.com>
 shibe2 <shibe@tuta.io>
 singularity <12184989+singularity-s0@users.noreply.github.com>
 sjinzh <sjinzh@gmail.com>
-sjxx <63994076+ylsdamxssjxxdd@users.noreply.github.com>
 slaren <2141330+slaren@users.noreply.github.com>
 slaren <slarengh@gmail.com>
 snadampal <87143774+snadampal@users.noreply.github.com>
 staviq <staviq@gmail.com>
 stduhpf <stephduh@live.fr>
-strawberrymelonpanda <152940198+strawberrymelonpanda@users.noreply.github.com>
 swittk <switt1995@gmail.com>
 takov751 <40316768+takov751@users.noreply.github.com>
 tarcey <cey.tarik@gmail.com>
@@ -756,16 +636,12 @@ uint256_t <konndennsa@gmail.com>
 uint256_t <maekawatoshiki1017@gmail.com>
 unbounded <haakon@likedan.net>
 valiray <133289098+valiray@users.noreply.github.com>
-vik <vikhyatk@gmail.com>
-viric <viric@viric.name>
 vodkaslime <646329483@qq.com>
 vvhg1 <94630311+vvhg1@users.noreply.github.com>
 vxiiduu <73044267+vxiiduu@users.noreply.github.com>
 wbpxre150 <100937007+wbpxre150@users.noreply.github.com>
 whoreson <139810751+whoreson@users.noreply.github.com>
-woachk <24752637+woachk@users.noreply.github.com>
 wonjun Jang <strutive07@gmail.com>
-woodx <124784234+woodx9@users.noreply.github.com>
 wzy <32936898+Freed-Wu@users.noreply.github.com>
 xaedes <xaedes@gmail.com>
 xaedes <xaedes@googlemail.com>
@@ -773,10 +649,7 @@ xloem <0xloem@gmail.com>
 yangli2 <yangli2@gmail.com>
 yuiseki <yuiseki@gmail.com>
 zakkor <edward.partenie@gmail.com>
-zhangkaihuo <zhangkaihuo@gmail.com>
 zhouwg <6889919+zhouwg@users.noreply.github.com>
-zhouwg <zhouwg2000@gmail.com>
 zrm <trustiosity.zrm@gmail.com>
-Ștefan-Gabriel Muscalu <legraphista@users.noreply.github.com>
 源文雨 <41315874+fumiama@users.noreply.github.com>
 Нияз Гарифзянов <112617865+garrnizon@users.noreply.github.com>
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -1,4 +1,4 @@
-{
+{
  "version": 4,
  "configurePresets": [
    {
@@ -11,29 +11,15 @@
            "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
        }
    },
-    {
-        "name": "sycl-base",
-        "hidden": true,
-        "generator": "Ninja",
-        "binaryDir": "${sourceDir}/build-${presetName}",
-        "cacheVariables": {
-            "CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
-            "CMAKE_CXX_COMPILER": "icx",
-            "CMAKE_C_COMPILER": "cl",
-            "GGML_SYCL": "ON",
-            "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
-        }
-    },
+
    { "name": "debug",   "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
-    { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
-    { "name": "reldbg",  "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
-    { "name": "static",  "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } },
-    { "name": "sycl_f16",  "hidden": true, "cacheVariables": { "GGML_SYCL_F16": "ON" } },
+    { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
+    { "name": "static",  "hidden": true, "cacheVariables": { "LLAMA_STATIC": "ON" } },

    {
        "name": "arm64-windows-msvc", "hidden": true,
-        "architecture": { "value": "arm64",    "strategy": "external" },
-        "toolset":      { "value": "host=x64", "strategy": "external" },
+        "architecture": { "value": "arm64",       "strategy": "external" },
+        "toolset":      { "value": "host=x86_64", "strategy": "external" },
        "cacheVariables": {
            "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake"
        }
@@ -41,28 +27,19 @@

    {
        "name": "arm64-windows-llvm", "hidden": true,
-        "architecture": { "value": "arm64",    "strategy": "external" },
-        "toolset":      { "value": "host=x64", "strategy": "external" },
+        "architecture": { "value": "arm64",       "strategy": "external" },
+        "toolset":      { "value": "host=x86_64", "strategy": "external" },
        "cacheVariables": {
            "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-llvm.cmake"
        }
    },

    { "name": "arm64-windows-llvm-debug"  , "inherits": [ "base", "arm64-windows-llvm",  "debug"   ] },
-    { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm",  "reldbg" ] },
-    { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm",  "reldbg", "static" ] },
+    { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm",  "release" ] },
+    { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm",  "release", "static" ] },

    { "name": "arm64-windows-msvc-debug"  , "inherits": [ "base", "arm64-windows-msvc",  "debug"   ] },
-    { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc",  "reldbg" ] },
-    { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc",  "reldbg", "static" ] },
-
-    { "name": "x64-windows-msvc-debug"  , "inherits": [ "base", "debug"   ] },
-    { "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] },
-    { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },
-
-    { "name": "x64-windows-sycl-debug"  , "inherits": [ "sycl-base", "debug"   ] },
-    { "name": "x64-windows-sycl-debug-f16", "inherits": [ "sycl-base", "debug", "sycl_f16" ] },
-    { "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] },
-    { "name": "x64-windows-sycl-release-f16", "inherits": [ "sycl-base", "release", "sycl_f16" ] }
+    { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc",  "release" ] },
+    { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc",  "release", "static" ] }
  ]
 }
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,29 +0,0 @@
-# Pull requests (for contributors)
-
- Test your changes:
-  - Using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the GGML library
-  - Execute [the full CI locally on your machine](ci/README.md) before publishing
- Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs.
-  - The PR template has a series of review complexity checkboxes `[ ]` that [you can mark as](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) `[X]` for your convenience
- Consider allowing write access to your branch for faster review
- If your PR becomes stale, don't hesitate to ping the maintainers in the comments
-
-# Pull requests (for collaborators)
-
- Squash-merge PRs
- Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
- Optionally, pick a `<module>` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules
-
-# Coding guidelines
-
- Avoid adding third-party dependencies, extra files, extra headers, etc.
- Always consider cross-compatibility with other operating systems and architectures
- Avoid fancy looking modern STL constructs, use basic `for` loops, avoid templates, keep it simple
- There are no strict rules for the code style, but try to follow the patterns in the code (indentation, spaces, etc.). Vertical alignment makes things more readable and easier to batch edit
- Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
- Naming usually optimizes for common prefix (see https://github.com/ggerganov/ggml/pull/302#discussion_r1243240963)
- Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$
-
-![matmul](media/matmul.png)
-
--- a/1405
+++ b/1405
--- a/Package.swift
+++ b/Package.swift
@@ -3,17 +3,14 @@
 import PackageDescription

 var sources = [
-    "src/llama.cpp",
-    "src/llama-vocab.cpp",
-    "src/llama-grammar.cpp",
-    "src/llama-sampling.cpp",
-    "src/unicode.cpp",
-    "src/unicode-data.cpp",
-    "ggml/src/ggml.c",
-    "ggml/src/ggml-alloc.c",
-    "ggml/src/ggml-backend.c",
-    "ggml/src/ggml-quants.c",
-    "ggml/src/ggml-aarch64.c",
+    "ggml.c",
+    "sgemm.cpp",
+    "llama.cpp",
+    "unicode.cpp",
+    "unicode-data.cpp",
+    "ggml-alloc.c",
+    "ggml-backend.c",
+    "ggml-quants.c",
 ]

 var resources: [Resource] = []
@@ -29,8 +26,8 @@ var cSettings: [CSetting] =  [
 ]

 #if canImport(Darwin)
-sources.append("ggml/src/ggml-metal.m")
-resources.append(.process("ggml/src/ggml-metal.metal"))
+sources.append("ggml-metal.m")
+resources.append(.process("ggml-metal.metal"))
 linkerSettings.append(.linkedFramework("Accelerate"))
 cSettings.append(
    contentsOf: [
@@ -66,6 +63,8 @@ let package = Package(
               "models",
               "tests",
               "CMakeLists.txt",
+               "ggml-cuda.cu",
+               "ggml-cuda.h",
               "Makefile"
            ],
            sources: sources,
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -1,7 +1,6 @@
 # llama.cpp for SYCL

 - [Background](#background)
- [Recommended Release](#recommended-release)
 - [News](#news)
 - [OS](#os)
 - [Hardware](#hardware)
@@ -20,7 +19,7 @@
 **oneAPI** is an open ecosystem and a standard-based specification, supporting multiple architectures including but not limited to intel CPUs, GPUs and FPGAs. The key components of the oneAPI ecosystem include:

 - **DPCPP** *(Data Parallel C++)*: The primary oneAPI SYCL implementation, which includes the icpx/icx Compilers.
- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. oneMKL and oneDNN)*.
+- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. oneMKL - Math Kernel Library)*.
 - **oneAPI LevelZero**: A high performance low level interface for fine-grained control over intel iGPUs and dGPUs.
 - **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.

@@ -28,27 +27,12 @@

 The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based on the cross-platform feature of SYCL, it could support other vendor GPUs: Nvidia GPU (*AMD GPU coming*).

-## Recommended Release
-
-The SYCL backend would be broken by some PRs due to no online CI.
-
-The following release is verified with good quality:
-
-|Commit ID|Tag|Release|Verified  Platform|
-|-|-|-|-|
-|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1|
+When targeting **Intel CPU**, it is recommended to use llama.cpp for [Intel oneMKL](README.md#intel-onemkl) backend.

+It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, CLBlast etc..*. In beginning work, the oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.

 ## News

-
- 2024.8
-  - Use oneDNN as the default GEMM library, improve the compatibility for new Intel GPUs.
-
- 2024.5
-  - Performance is increased: 34 -> 37 tokens/s of llama-2-7b.Q4_0 on Arc770.
-  - Arch Linux is verified successfully.
-
 - 2024.4
  - Support data types: GGML_TYPE_IQ4_NL, GGML_TYPE_IQ4_XS, GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ3_S, GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M.

@@ -70,37 +54,30 @@ The following release is verified with good quality:

 ## OS

-| OS      | Status  | Verified                                       |
-|---------|---------|------------------------------------------------|
-| Linux   | Support | Ubuntu 22.04, Fedora Silverblue 39, Arch Linux |
-| Windows | Support | Windows 11                                     |
+| OS      | Status  | Verified                           |
+|---------|---------|------------------------------------|
+| Linux   | Support | Ubuntu 22.04, Fedora Silverblue 39 |
+| Windows | Support | Windows 11                         |


 ## Hardware

 ### Intel GPU

-SYCL backend supports Intel GPU Family:
-
- Intel Data Center Max Series
- Intel Flex Series, Arc Series
- Intel Built-in Arc GPU
- Intel iGPU in Core CPU (11th Generation Core CPU and newer, refer to [oneAPI supported GPU](https://www.intel.com/content/www/us/en/developer/articles/system-requirements/intel-oneapi-base-toolkit-system-requirements.html#inpage-nav-1-1)).
-
-#### Verified devices
+**Verified devices**

 | Intel GPU                     | Status  | Verified Model                        |
 |-------------------------------|---------|---------------------------------------|
 | Intel Data Center Max Series  | Support | Max 1550, 1100                        |
 | Intel Data Center Flex Series | Support | Flex 170                              |
-| Intel Arc Series              | Support | Arc 770, 730M, Arc A750               |
+| Intel Arc Series              | Support | Arc 770, 730M                         |
 | Intel built-in Arc GPU        | Support | built-in Arc GPU in Meteor Lake       |
-| Intel iGPU                    | Support | iGPU in 13700k, i5-1250P, i7-1260P, i7-1165G7 |
+| Intel iGPU                    | Support | iGPU in i5-1250P, i7-1260P, i7-1165G7 |

 *Notes:*

 - **Memory**
-  - The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/llama-cli`.
+  - The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/main`.

  - Please make sure the GPU shared memory from the host is large enough to account for the model's size. For e.g. the *llama-2-7b.Q4_0* requires at least 8.0GB for integrated GPU and 4.0GB for discrete GPU.

@@ -122,14 +99,14 @@ The docker build option is currently limited to *intel GPU* targets.
 ### Build image
 ```sh
 # Using FP16
-docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" -f .devops/llama-cli-intel.Dockerfile .
+docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/main-intel.Dockerfile .
 ```

 *Notes*:

-To build in default FP32 *(Slower than FP16 alternative)*, you can remove the `--build-arg="GGML_SYCL_F16=ON"` argument from the previous command.
+To build in default FP32 *(Slower than FP16 alternative)*, you can remove the `--build-arg="LLAMA_SYCL_F16=ON"` argument from the previous command.

-You can also use the `.devops/llama-server-intel.Dockerfile`, which builds the *"server"* alternative.
+You can also use the `.devops/server-intel.Dockerfile`, which builds the *"server"* alternative.

 ### Run container

@@ -196,7 +173,7 @@ Please follow the instructions for downloading and installing the Toolkit for Li

 Following guidelines/code snippets assume the default installation values. Otherwise, please make sure the necessary changes are reflected where applicable.

-Upon a successful installation, SYCL is enabled for the available intel devices, along with relevant libraries such as oneAPI oneDNN for Intel GPUs.
+Upon a successful installation, SYCL is enabled for the available intel devices, along with relevant libraries such as oneAPI MKL for intel GPUs.

 - **Adding support to Nvidia GPUs**

@@ -244,22 +221,17 @@ Similarly, user targeting Nvidia GPUs should expect at least one SYCL-CUDA devic
 ### II. Build llama.cpp

 #### Intel GPU
-
-```
-./examples/sycl/build.sh
-```
-
-or
-
 ```sh
 # Export relevant ENV variables
 source /opt/intel/oneapi/setvars.sh

+# Build LLAMA with MKL BLAS acceleration for intel GPU
+
 # Option 1: Use FP32 (recommended for better performance in most cases)
-cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx

 # Option 2: Use FP16
-cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
+cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON

 # build all binary
 cmake --build build --config Release -j -v
@@ -276,10 +248,10 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
 # Build LLAMA with Nvidia BLAS acceleration through SYCL

 # Option 1: Use FP32 (recommended for better performance in most cases)
-cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+cmake -B build -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx

 # Option 2: Use FP16
-cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
+cmake -B build -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON

 # build all binary
 cmake --build build --config Release -j -v
@@ -288,71 +260,48 @@ cmake --build build --config Release -j -v

 ### III. Run the inference

-#### Retrieve and prepare model
+1. Retrieve and prepare model

 You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example.

-##### Check device
-
-1. Enable oneAPI running environment
+2. Enable oneAPI running environment

 ```sh
 source /opt/intel/oneapi/setvars.sh
 ```

-2. List devices information
+3. List devices information

 Similar to the native `sycl-ls`, available SYCL devices can be queried as follow:

 ```sh
-./build/bin/llama-ls-sycl-device
+./build/bin/ls-sycl-device
 ```
-
-This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following:
+A example of such log in a system with 1 *intel CPU* and 1 *intel GPU* can look like the following:
 ```
-found 2 SYCL devices:
-
+found 6 SYCL devices:
 |  |                  |                                             |Compute   |Max compute|Max work|Max sub|               |
 |ID|       Device Type|                                         Name|capability|units      |group   |group  |Global mem size|
 |--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
 | 0|[level_zero:gpu:0]|               Intel(R) Arc(TM) A770 Graphics|       1.3|        512|    1024|     32|    16225243136|
 | 1|[level_zero:gpu:1]|                    Intel(R) UHD Graphics 770|       1.3|         32|     512|     32|    53651849216|
+| 2|    [opencl:gpu:0]|               Intel(R) Arc(TM) A770 Graphics|       3.0|        512|    1024|     32|    16225243136|
+| 3|    [opencl:gpu:1]|                    Intel(R) UHD Graphics 770|       3.0|         32|     512|     32|    53651849216|
+| 4|    [opencl:cpu:0]|         13th Gen Intel(R) Core(TM) i7-13700K|       3.0|         24|    8192|     64|    67064815616|
+| 5|    [opencl:acc:0]|               Intel(R) FPGA Emulation Device|       1.2|         24|67108864|     64|    67064815616|
 ```

-#### Choose level-zero devices
+| Attribute              | Note                                                        |
+|------------------------|-------------------------------------------------------------|
+| compute capability 1.3 | Level-zero driver/runtime, recommended                      |
+| compute capability 3.0 | OpenCL driver/runtime, slower than level-zero in most cases |

-|Chosen Device ID|Setting|
-|-|-|
-|0|`export ONEAPI_DEVICE_SELECTOR="level_zero:1"` or no action|
-|1|`export ONEAPI_DEVICE_SELECTOR="level_zero:1"`|
-|0 & 1|`export ONEAPI_DEVICE_SELECTOR="level_zero:0;level_zero:1"`|
-
-#### Execute
-
-Choose one of following methods to run.
-
-1. Script
-
- Use device 0:
-
-```sh
-./examples/sycl/run-llama2.sh 0
-```
- Use multiple devices:
-
-```sh
-./examples/sycl/run-llama2.sh
-```
-
-2. Command line
-Launch inference
+4. Launch inference

 There are two device selection modes:

- Single device: Use one device assigned by user. Default device id is 0.
- Multiple devices: Automatically choose the devices with the same backend.
-
-In two device selection modes, the default SYCL backend is level_zero, you can choose other backend supported by SYCL by setting environment variable ONEAPI_DEVICE_SELECTOR.
+- Single device: Use one device target specified by the user.
+- Multiple devices: Automatically select the devices with the same largest Max compute-units.

 | Device selection | Parameter                              |
 |------------------|----------------------------------------|
@@ -364,13 +313,24 @@ Examples:
 - Use device 0:

 ```sh
-ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
+ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
+```
+or run by script:
+
+```sh
+./examples/sycl/run_llama2.sh 0
 ```

 - Use multiple devices:

 ```sh
-ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
+ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
+```
+
+Otherwise, you can run the script:
+
+```sh
+./examples/sycl/run_llama2.sh
 ```

 *Notes:*
@@ -419,7 +379,7 @@ c. Verify installation
 In the oneAPI command line, run the following to print the available SYCL devices:

 ```
-sycl-ls.exe
+sycl-ls
 ```

 There should be one or more *level-zero* GPU devices displayed as **[ext_oneapi_level_zero:gpu]**. Below is example of such output detecting an *intel Iris Xe* GPU as a Level-zero SYCL device:
@@ -434,120 +394,89 @@ Output (example):

 4. Install build tools

-a. Download & install cmake for Windows: https://cmake.org/download/ (CMake can also be installed from Visual Studio Installer)
-b. The new Visual Studio will install Ninja as default. (If not, please install it manually: https://ninja-build.org/)
+a. Download & install cmake for Windows: https://cmake.org/download/

+b. Download & install mingw-w64 make for Windows provided by w64devkit
+
+- Download the 1.19.0 version of [w64devkit](https://github.com/skeeto/w64devkit/releases/download/v1.19.0/w64devkit-1.19.0.zip).
+
+- Extract `w64devkit` on your pc.
+
+- Add the **bin** folder path in the Windows system PATH environment (for e.g. `C:\xxx\w64devkit\bin\`).

 ### II. Build llama.cpp

-You could download the release package for Windows directly, which including binary files and depended oneAPI dll files.
-
-Choose one of following methods to build from source code.
-
-1. Script
-
-```sh
-.\examples\sycl\win-build-sycl.bat
-```
-
-2. CMake
-
 On the oneAPI command line window, step into the llama.cpp main directory and run the following:

 ```
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force

 # Option 1: Use FP32 (recommended for better performance in most cases)
-cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release
+cmake -B build -G "MinGW Makefiles" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release

 # Option 2: Or FP16
-cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
+cmake -B build -G "MinGW Makefiles" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON

 cmake --build build --config Release -j
 ```

-Or, use CMake presets to build:
-
+Otherwise, run the `win-build-sycl.bat` wrapper which encapsulates the former instructions:
 ```sh
-cmake --preset x64-windows-sycl-release
-cmake --build build-x64-windows-sycl-release -j --target llama-cli
-
-cmake -DGGML_SYCL_F16=ON --preset x64-windows-sycl-release
-cmake --build build-x64-windows-sycl-release -j --target llama-cli
-
-cmake --preset x64-windows-sycl-debug
-cmake --build build-x64-windows-sycl-debug -j --target llama-cli
+.\examples\sycl\win-build-sycl.bat
 ```

-3. Visual Studio
-
-You can use Visual Studio to open llama.cpp folder as a CMake project. Choose the sycl CMake presets (`x64-windows-sycl-release` or `x64-windows-sycl-debug`) before you compile the project.
-
 *Notes:*

- In case of a minimal experimental setup, the user can build the inference executable only through `cmake --build build --config Release -j --target llama-cli`.
+- By default, calling `make` will build all target binary files. In case of a minimal experimental setup, the user can build the inference executable only through `make main`.

 ### III. Run the inference

-#### Retrieve and prepare model
+1. Retrieve and prepare model

-You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example.
+You can refer to the general [*Prepare and Quantize*](README#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example.

-##### Check device
-
-1. Enable oneAPI running environment
+2. Enable oneAPI running environment

 On the oneAPI command line window, run the following and step into the llama.cpp directory:
 ```
 "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
 ```

-2. List devices information
+3. List devices information

 Similar to the native `sycl-ls`, available SYCL devices can be queried as follow:

 ```
-build\bin\llama-ls-sycl-device.exe
+build\bin\ls-sycl-device.exe
 ```

-This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following:
+The output of this command in a system with 1 *intel CPU* and 1 *intel GPU* would look like the following:
 ```
-found 2 SYCL devices:
+found 6 SYCL devices:
 |  |                  |                                             |Compute   |Max compute|Max work|Max sub|               |
 |ID|       Device Type|                                         Name|capability|units      |group   |group  |Global mem size|
 |--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
 | 0|[level_zero:gpu:0]|               Intel(R) Arc(TM) A770 Graphics|       1.3|        512|    1024|     32|    16225243136|
 | 1|[level_zero:gpu:1]|                    Intel(R) UHD Graphics 770|       1.3|         32|     512|     32|    53651849216|
+| 2|    [opencl:gpu:0]|               Intel(R) Arc(TM) A770 Graphics|       3.0|        512|    1024|     32|    16225243136|
+| 3|    [opencl:gpu:1]|                    Intel(R) UHD Graphics 770|       3.0|         32|     512|     32|    53651849216|
+| 4|    [opencl:cpu:0]|         13th Gen Intel(R) Core(TM) i7-13700K|       3.0|         24|    8192|     64|    67064815616|
+| 5|    [opencl:acc:0]|               Intel(R) FPGA Emulation Device|       1.2|         24|67108864|     64|    67064815616|

 ```
-#### Choose level-zero devices

-|Chosen Device ID|Setting|
-|-|-|
-|0|`set ONEAPI_DEVICE_SELECTOR="level_zero:1"` or no action|
-|1|`set ONEAPI_DEVICE_SELECTOR="level_zero:1"`|
-|0 & 1|`set ONEAPI_DEVICE_SELECTOR="level_zero:0;level_zero:1"`|
+| Attribute              | Note                                                      |
+|------------------------|-----------------------------------------------------------|
+| compute capability 1.3 | Level-zero running time, recommended                      |
+| compute capability 3.0 | OpenCL running time, slower than level-zero in most cases |

-#### Execute

-Choose one of following methods to run.
-
-1. Script
-
-```
-examples\sycl\win-run-llama2.bat
-```
-
-2. Command line
-
-Launch inference
+4. Launch inference

 There are two device selection modes:

- Single device: Use one device assigned by user. Default device id is 0.
- Multiple devices: Automatically choose the devices with the same backend.
-
-In two device selection modes, the default SYCL backend is level_zero, you can choose other backend supported by SYCL by setting environment variable ONEAPI_DEVICE_SELECTOR.
+- Single device: Use one device assigned by user.
+- Multiple devices: Automatically choose the devices with the same biggest Max compute units.

 | Device selection | Parameter                              |
 |------------------|----------------------------------------|
@@ -559,15 +488,19 @@ Examples:
 - Use device 0:

 ```
-build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
+build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
 ```

 - Use multiple devices:

 ```
-build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
+build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
 ```
+Otherwise, run the following wrapper script:

+```
+.\examples\sycl\win-run-llama2.bat
+```

 Note:

@@ -581,18 +514,17 @@ Or
 use 1 SYCL GPUs: [0] with Max compute units:512
 ```

-
 ## Environment Variable

 #### Build

 | Name               | Value                             | Function                                    |
 |--------------------|-----------------------------------|---------------------------------------------|
-| GGML_SYCL          | ON (mandatory)                    | Enable build with SYCL code path.<br>FP32 path - recommended for better perforemance than FP16 on quantized model|
-| GGML_SYCL_TARGET   | INTEL *(default)* \| NVIDIA       | Set the SYCL target device type.            |
-| GGML_SYCL_F16      | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path.      |
-| CMAKE_C_COMPILER   | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path.      |
-| CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)*   | Set `icpx/icx` compiler for SYCL code path. |
+| LLAMA_SYCL         | ON (mandatory)                    | Enable build with SYCL code path.           |
+| LLAMA_SYCL_TARGET  | INTEL *(default)* \| NVIDIA       | Set the SYCL target device type.            |
+| LLAMA_SYCL_F16     | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path.      |
+| CMAKE_C_COMPILER   | icx                               | Set *icx* compiler for SYCL code path.      |
+| CMAKE_CXX_COMPILER | icpx *(Linux)*, icx *(Windows)*   | Set `icpx/icx` compiler for SYCL code path. |

 #### Runtime

@@ -628,26 +560,9 @@ use 1 SYCL GPUs: [0] with Max compute units:512
  ```
  Otherwise, please double-check the GPU driver installation steps.

- Can I report Ollama issue on Intel GPU to llama.cpp SYCL backend?
-
-  No. We can't support Ollama issue directly, because we aren't familiar with Ollama.
-
-  Sugguest reproducing on llama.cpp and report similar issue to llama.cpp. We will surpport it.
-
-  It's same for other projects including llama.cpp SYCL backend.
-
- Meet issue: `Native API failed. Native API returns: -6 (PI_ERROR_OUT_OF_HOST_MEMORY) -6 (PI_ERROR_OUT_OF_HOST_MEMORY) -999 (UNKNOWN PI error)` or `failed to allocate SYCL0 buffer`
-
-  Device Memory is not enough.
-
-  |Reason|Solution|
-  |-|-|
-  |Default Context is too big. It leads to more memory usage.|Set `-c 8192` or smaller value.|
-  |Model is big and require more memory than device's.|Choose smaller quantized model, like Q5 -> Q4;<br>Use more than one devices to load model.|
-
 ### **GitHub contribution**:
 Please add the **[SYCL]** prefix/tag in issues/PRs titles to help the SYCL-team check/address them without delay.

 ## TODO

- NA
+- Support row layer split for multiple card runs.
--- a/README.md
+++ b/README.md
--- a/build.zig
+++ b/build.zig
@@ -0,0 +1,172 @@
+// Compatible with Zig Version 0.11.0
+const std = @import("std");
+const ArrayList = std.ArrayList;
+const Compile = std.Build.Step.Compile;
+const ConfigHeader = std.Build.Step.ConfigHeader;
+const Mode = std.builtin.Mode;
+const CrossTarget = std.zig.CrossTarget;
+
+const Maker = struct {
+    builder: *std.build.Builder,
+    target: CrossTarget,
+    optimize: Mode,
+    enable_lto: bool,
+
+    include_dirs: ArrayList([]const u8),
+    cflags: ArrayList([]const u8),
+    cxxflags: ArrayList([]const u8),
+    objs: ArrayList(*Compile),
+
+    fn addInclude(m: *Maker, dir: []const u8) !void {
+        try m.include_dirs.append(dir);
+    }
+    fn addProjectInclude(m: *Maker, path: []const []const u8) !void {
+        try m.addInclude(try m.builder.build_root.join(m.builder.allocator, path));
+    }
+    fn addCFlag(m: *Maker, flag: []const u8) !void {
+        try m.cflags.append(flag);
+    }
+    fn addCxxFlag(m: *Maker, flag: []const u8) !void {
+        try m.cxxflags.append(flag);
+    }
+    fn addFlag(m: *Maker, flag: []const u8) !void {
+        try m.addCFlag(flag);
+        try m.addCxxFlag(flag);
+    }
+
+    fn init(builder: *std.build.Builder) !Maker {
+        const target = builder.standardTargetOptions(.{});
+        const zig_version = @import("builtin").zig_version_string;
+        const commit_hash = try std.ChildProcess.exec(
+            .{ .allocator = builder.allocator, .argv = &.{ "git", "rev-parse", "HEAD" } },
+        );
+        try std.fs.cwd().writeFile("common/build-info.cpp", builder.fmt(
+            \\int LLAMA_BUILD_NUMBER = {};
+            \\char const *LLAMA_COMMIT = "{s}";
+            \\char const *LLAMA_COMPILER = "Zig {s}";
+            \\char const *LLAMA_BUILD_TARGET = "{s}";
+            \\
+        , .{ 0, commit_hash.stdout[0 .. commit_hash.stdout.len - 1], zig_version, try target.allocDescription(builder.allocator) }));
+        var m = Maker{
+            .builder = builder,
+            .target = target,
+            .optimize = builder.standardOptimizeOption(.{}),
+            .enable_lto = false,
+            .include_dirs = ArrayList([]const u8).init(builder.allocator),
+            .cflags = ArrayList([]const u8).init(builder.allocator),
+            .cxxflags = ArrayList([]const u8).init(builder.allocator),
+            .objs = ArrayList(*Compile).init(builder.allocator),
+        };
+
+        try m.addCFlag("-std=c11");
+        try m.addCxxFlag("-std=c++11");
+        try m.addProjectInclude(&.{});
+        try m.addProjectInclude(&.{"common"});
+        return m;
+    }
+
+    fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile {
+        const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize });
+        if (o.target.getAbi() != .msvc)
+            o.defineCMacro("_GNU_SOURCE", null);
+
+        if (std.mem.endsWith(u8, src, ".c")) {
+            o.addCSourceFiles(&.{src}, m.cflags.items);
+            o.linkLibC();
+        } else {
+            o.addCSourceFiles(&.{src}, m.cxxflags.items);
+            if (o.target.getAbi() == .msvc) {
+                o.linkLibC(); // need winsdk + crt
+            } else {
+                // linkLibCpp already add (libc++ + libunwind + libc)
+                o.linkLibCpp();
+            }
+        }
+        for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i });
+        o.want_lto = m.enable_lto;
+        return o;
+    }
+
+    fn exe(m: *const Maker, name: []const u8, src: []const u8, deps: []const *Compile) *Compile {
+        const e = m.builder.addExecutable(.{ .name = name, .target = m.target, .optimize = m.optimize });
+        e.addCSourceFiles(&.{src}, m.cxxflags.items);
+        for (deps) |d| e.addObject(d);
+        for (m.objs.items) |o| e.addObject(o);
+        for (m.include_dirs.items) |i| e.addIncludePath(.{ .path = i });
+
+        // https://github.com/ziglang/zig/issues/15448
+        if (e.target.getAbi() == .msvc) {
+            e.linkLibC(); // need winsdk + crt
+        } else {
+            // linkLibCpp already add (libc++ + libunwind + libc)
+            e.linkLibCpp();
+        }
+        m.builder.installArtifact(e);
+        e.want_lto = m.enable_lto;
+        return e;
+    }
+};
+
+pub fn build(b: *std.build.Builder) !void {
+    var make = try Maker.init(b);
+    make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false;
+
+    const ggml = make.obj("ggml", "ggml.c");
+    const sgemm = make.obj("sgemm", "sgemm.cpp");
+    const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
+    const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
+    const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
+    const unicode = make.obj("unicode", "unicode.cpp");
+    const unicode_data = make.obj("unicode-data", "unicode-data.cpp");
+    const llama = make.obj("llama", "llama.cpp");
+    const buildinfo = make.obj("common", "common/build-info.cpp");
+    const common = make.obj("common", "common/common.cpp");
+    const console = make.obj("console", "common/console.cpp");
+    const sampling = make.obj("sampling", "common/sampling.cpp");
+    const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
+    const json_schema_to_grammar = make.obj("json-schema-to-grammar", "common/json-schema-to-grammar.cpp");
+    const train = make.obj("train", "common/train.cpp");
+    const clip = make.obj("clip", "examples/llava/clip.cpp");
+    const llava = make.obj("llava", "examples/llava/llava.cpp");
+
+    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, console, grammar_parser });
+    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
+    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
+    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
+    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train });
+    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train });
+
+    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, grammar_parser, clip, llava });
+    if (server.target.isWindows()) {
+        server.linkSystemLibrary("ws2_32");
+    }
+
+    const server_assets = [_][]const u8{ "index.html", "index.js", "completion.js", "json-schema-to-grammar.mjs" };
+    for (server_assets) |asset| {
+        const input_path = b.fmt("examples/server/public/{s}", .{asset});
+        const output_path = b.fmt("examples/server/{s}.hpp", .{asset});
+
+        // Portable equivalent of `b.addSystemCommand(&.{ "xxd", "-n", asset, "-i", input_path, output_path }) })`:
+
+        const input = try std.fs.cwd().readFileAlloc(b.allocator, input_path, std.math.maxInt(usize));
+        defer b.allocator.free(input);
+
+        var buf = std.ArrayList(u8).init(b.allocator);
+        defer buf.deinit();
+
+        for (input) |byte| {
+            try std.fmt.format(buf.writer(), "0x{X:0>2}, ", .{byte});
+        }
+
+        var name = try std.mem.replaceOwned(u8, b.allocator, asset, "-", "_");
+        defer b.allocator.free(name);
+        std.mem.replaceScalar(u8, name, '.', '_');
+
+        try std.fs.cwd().writeFile(output_path, b.fmt(
+            "unsigned char {s}[] = {{{s}}};\nunsigned int {s}_len = {d};\n",
+            .{ name, buf.items, name, input.len },
+        ));
+
+        std.debug.print("Dumped hex of \"{s}\" ({s}) to {s}\n", .{ input_path, name, output_path });
+    }
+}
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -13,9 +13,6 @@
 # # with SYCL support
 # GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 #
-# # with VULKAN support
-# GG_BUILD_VULKAN=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
-#

 if [ -z "$2" ]; then
    echo "usage: $0 <output-dir> <mnt-dir>"
@@ -39,11 +36,11 @@ SRC=`pwd`
 CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON"

 if [ ! -z ${GG_BUILD_METAL} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_METAL_SHADER_DEBUG=ON"
 fi

 if [ ! -z ${GG_BUILD_CUDA} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=native"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUDA=1"
 fi

 if [ ! -z ${GG_BUILD_SYCL} ]; then
@@ -53,11 +50,7 @@ if [ ! -z ${GG_BUILD_SYCL} ]; then
        exit 1
    fi

-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
-fi
-
-if [ ! -z ${GG_BUILD_VULKAN} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON"
 fi
 ## helpers

@@ -110,11 +103,8 @@ function gg_run_ctest_debug {

    set -e

-    # Check cmake, make and ctest are installed
-    gg_check_build_requirements
-
    (time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j$(nproc)                                  ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time make -j                                          ) 2>&1 | tee -a $OUT/${ci}-make.log

    (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log

@@ -141,11 +131,8 @@ function gg_run_ctest_release {

    set -e

-    # Check cmake, make and ctest are installed
-    gg_check_build_requirements
-
    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time make -j                                            ) 2>&1 | tee -a $OUT/${ci}-make.log

    if [ -z ${GG_BUILD_LOW_PERF} ]; then
        (time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log
@@ -215,15 +202,12 @@ function gg_sum_test_scripts_release {
 }

 function gg_get_model {
-    local gguf_0="$MNT/models/pythia/1.4B/ggml-model-f16.gguf"
-    local gguf_1="$MNT/models/pythia/2.8B/ggml-model-f16.gguf"
-    local gguf_2="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
-    if [[ -s $gguf_0 ]]; then
-        echo -n "$gguf_0"
-    elif [[ -s $gguf_1 ]]; then
-        echo -n "$gguf_1"
-    elif [[ -s $gguf_2 ]]; then
-        echo -n "$gguf_2"
+    local gguf_3b="$MNT/models/open-llama/3B-v2/ggml-model-f16.gguf"
+    local gguf_7b="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
+    if [[ -s $gguf_3b ]]; then
+        echo -n "$gguf_3b"
+    elif [[ -s $gguf_7b ]]; then
+        echo -n "$gguf_7b"
    else
        echo >&2 "No model found. Can't run gg_run_ctest_with_model."
        exit 1
@@ -272,7 +256,141 @@ function gg_sum_ctest_with_model_release {
    gg_printf '```\n'
 }

+# open_llama_3b_v2
+
+function gg_run_open_llama_3b_v2 {
+    cd ${SRC}
+
+    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json
+    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model
+    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json
+    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json
+    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin
+    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json
+
+    gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
+    unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
+    head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
+
+    path_models="../models-mnt/open-llama/3B-v2"
+    path_wiki="../models-mnt/wikitext/wikitext-2-raw"
+
+    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
+
+    set -e
+
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j                                                             ) 2>&1 | tee -a $OUT/${ci}-make.log
+
+    python3 ../convert.py ${path_models}
+
+    model_f16="${path_models}/ggml-model-f16.gguf"
+    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
+    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
+    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
+    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
+    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
+    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
+    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
+    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
+    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
+    model_q6_k="${path_models}/ggml-model-q6_k.gguf"
+
+    wiki_test_60="${path_wiki}/wiki.test-60.raw"
+
+    ./bin/quantize ${model_f16} ${model_q8_0} q8_0
+    ./bin/quantize ${model_f16} ${model_q4_0} q4_0
+    ./bin/quantize ${model_f16} ${model_q4_1} q4_1
+    ./bin/quantize ${model_f16} ${model_q5_0} q5_0
+    ./bin/quantize ${model_f16} ${model_q5_1} q5_1
+    ./bin/quantize ${model_f16} ${model_q2_k} q2_k
+    ./bin/quantize ${model_f16} ${model_q3_k} q3_k
+    ./bin/quantize ${model_f16} ${model_q4_k} q4_k
+    ./bin/quantize ${model_f16} ${model_q5_k} q5_k
+    ./bin/quantize ${model_f16} ${model_q6_k} q6_k
+
+    (time ./bin/main --model ${model_f16}  -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/main --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/main --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/main --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/main --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/main --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/main --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/main --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/main --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+
+    (time ./bin/perplexity --model ${model_f16}  -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+
+    (time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
+
+    (time ./bin/save-load-state     --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+
+    function check_ppl {
+        qnt="$1"
+        ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
+
+        if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
+            printf '  - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
+            return 20
+        fi
+
+        printf '  - %s @ %s OK\n' "$qnt" "$ppl"
+        return 0
+    }
+
+    check_ppl "f16"  "$(cat $OUT/${ci}-tg-f16.log  | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+
+    cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
+
+    set +e
+}
+
+function gg_sum_open_llama_3b_v2 {
+    gg_printf '### %s\n\n' "${ci}"
+
+    gg_printf 'OpenLLaMA 3B-v2:\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
+    gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
+    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
+    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
+    gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
+    gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
+    gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
+    gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
+    gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
+    gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
+    gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
+    gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
+    gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
+    gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
+}
+
 # open_llama_7b_v2
+# requires: GG_BUILD_CUDA

 function gg_run_open_llama_7b_v2 {
    cd ${SRC}
@@ -296,10 +414,10 @@ function gg_run_open_llama_7b_v2 {

    set -e

-    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j                                                           ) 2>&1 | tee -a $OUT/${ci}-make.log

-    python3 ../examples/convert_legacy_llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
+    python3 ../convert.py ${path_models}

    model_f16="${path_models}/ggml-model-f16.gguf"
    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@@ -315,47 +433,47 @@ function gg_run_open_llama_7b_v2 {

    wiki_test="${path_wiki}/wiki.test.raw"

-    ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
-    ./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
-    ./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
-    ./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
-    ./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
-    ./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
-    ./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
-    ./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
-    ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
-    ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
+    ./bin/quantize ${model_f16} ${model_q8_0} q8_0
+    ./bin/quantize ${model_f16} ${model_q4_0} q4_0
+    ./bin/quantize ${model_f16} ${model_q4_1} q4_1
+    ./bin/quantize ${model_f16} ${model_q5_0} q5_0
+    ./bin/quantize ${model_f16} ${model_q5_1} q5_1
+    ./bin/quantize ${model_f16} ${model_q2_k} q2_k
+    ./bin/quantize ${model_f16} ${model_q3_k} q3_k
+    ./bin/quantize ${model_f16} ${model_q4_k} q4_k
+    ./bin/quantize ${model_f16} ${model_q5_k} q5_k
+    ./bin/quantize ${model_f16} ${model_q6_k} q6_k

-    (time ./bin/llama-cli --model ${model_f16}  -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/main --model ${model_f16}  -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

-    (time ./bin/llama-perplexity --model ${model_f16}  -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/perplexity --model ${model_f16}  -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

-    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
+    (time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

-    (time ./bin/llama-save-load-state     -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state     -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/save-load-state     -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/save-load-state     -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log

    function check_ppl {
        qnt="$1"
@@ -408,271 +526,6 @@ function gg_sum_open_llama_7b_v2 {
    gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
 }

-# pythia_1.4b
-
-function gg_run_pythia_1_4b {
-    cd ${SRC}
-
-    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/config.json
-    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer.json
-    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer_config.json
-    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/special_tokens_map.json
-    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/resolve/main/pytorch_model.bin
-
-    gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
-    unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
-    head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
-
-    path_models="../models-mnt/pythia/1.4B"
-    path_wiki="../models-mnt/wikitext/wikitext-2-raw"
-
-    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
-
-    set -e
-
-    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log
-
-    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
-
-    model_f16="${path_models}/ggml-model-f16.gguf"
-    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
-    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
-    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
-    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
-    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
-    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
-    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
-    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
-    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
-    model_q6_k="${path_models}/ggml-model-q6_k.gguf"
-
-    wiki_test_60="${path_wiki}/wiki.test-60.raw"
-
-    ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
-    ./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
-    ./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
-    ./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
-    ./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
-    ./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
-    ./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
-    ./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
-    ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
-    ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
-
-    (time ./bin/llama-cli --model ${model_f16}  -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-cli --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/llama-cli --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/llama-cli --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/llama-cli --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/llama-cli --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/llama-cli --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/llama-cli --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/llama-cli --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/llama-cli --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/llama-cli --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
-
-    (time ./bin/llama-perplexity --model ${model_f16}  -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
-
-    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
-
-    (time ./bin/llama-save-load-state     --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-
-    function check_ppl {
-        qnt="$1"
-        ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
-
-        if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
-            printf '  - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
-            return 20
-        fi
-
-        printf '  - %s @ %s OK\n' "$qnt" "$ppl"
-        return 0
-    }
-
-    check_ppl "f16"  "$(cat $OUT/${ci}-tg-f16.log  | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-   #check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model
-    check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-
-    cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
-
-    set +e
-}
-
-function gg_sum_pythia_1_4b {
-    gg_printf '### %s\n\n' "${ci}"
-
-    gg_printf 'Pythia 1.4B:\n'
-    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
-    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
-    gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
-    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
-    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
-    gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
-    gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
-    gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
-    gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
-    gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
-    gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
-    gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
-    gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
-    gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
-    gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
-}
-
-# pythia_2_8b
-
-function gg_run_pythia_2_8b {
-    cd ${SRC}
-
-    gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/config.json
-    gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer.json
-    gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer_config.json
-    gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/special_tokens_map.json
-    gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/resolve/main/pytorch_model.bin
-
-    gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
-    unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
-
-    path_models="../models-mnt/pythia/2.8B"
-    path_wiki="../models-mnt/wikitext/wikitext-2-raw"
-
-    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
-
-    set -e
-
-    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log
-
-    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
-
-    model_f16="${path_models}/ggml-model-f16.gguf"
-    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
-    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
-    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
-    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
-    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
-    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
-    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
-    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
-    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
-    model_q6_k="${path_models}/ggml-model-q6_k.gguf"
-
-    wiki_test="${path_wiki}/wiki.test.raw"
-
-    ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
-    ./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
-    ./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
-    ./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
-    ./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
-    ./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
-    ./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
-    ./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
-    ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
-    ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
-
-    (time ./bin/llama-cli --model ${model_f16}  -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
-
-    (time ./bin/llama-perplexity --model ${model_f16}  -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
-
-    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
-
-    (time ./bin/llama-save-load-state     -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state     -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-
-    function check_ppl {
-        qnt="$1"
-        ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
-
-        if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
-            printf '  - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
-            return 20
-        fi
-
-        printf '  - %s @ %s OK\n' "$qnt" "$ppl"
-        return 0
-    }
-
-    check_ppl "f16"  "$(cat $OUT/${ci}-tg-f16.log  | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-   #check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model
-    check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-
-    cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
-
-    set +e
-}
-
-function gg_sum_pythia_2_8b {
-    gg_printf '### %s\n\n' "${ci}"
-
-    gg_printf 'Pythia 2.8B:\n'
-    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
-    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
-    gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
-    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
-    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
-    gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
-    gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
-    gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
-    gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
-    gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
-    gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
-    gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
-    gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
-    gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
-    gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
-}
-
 # bge-small

 function gg_run_embd_bge_small {
@@ -697,35 +550,21 @@ function gg_run_embd_bge_small {
    set -e

    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time make -j                                            ) 2>&1 | tee -a $OUT/${ci}-make.log

-    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
+    python3 ../convert-hf-to-gguf.py ${path_models}

    model_f16="${path_models}/ggml-model-f16.gguf"
    model_q8_0="${path_models}/ggml-model-q8_0.gguf"

-    ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
+    ./bin/quantize ${model_f16} ${model_q8_0} q8_0

-    (time ./bin/llama-embedding --model ${model_f16}  -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/embedding --model ${model_f16}  -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log

    set +e
 }

-function gg_check_build_requirements {
-    if ! command -v cmake &> /dev/null; then
-        gg_printf 'cmake not found, please install'
-    fi
-
-    if ! command -v make &> /dev/null; then
-        gg_printf 'make not found, please install'
-    fi
-
-    if ! command -v ctest &> /dev/null; then
-        gg_printf 'ctest not found, please install'
-    fi
-}
-
 function gg_sum_embd_bge_small {
    gg_printf '### %s\n\n' "${ci}"

@@ -737,9 +576,6 @@ function gg_sum_embd_bge_small {

 ## main

-export LLAMA_LOG_PREFIX=1
-export LLAMA_LOG_TIMESTAMPS=1
-
 if [ -z ${GG_BUILD_LOW_PERF} ]; then
    # Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt
    rm -rf ${SRC}/models-mnt
@@ -769,11 +605,10 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
    fi

    if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
-        if [ -z ${GG_BUILD_CUDA} ] && [ -z ${GG_BUILD_VULKAN} ]; then
-            test $ret -eq 0 && gg_run pythia_1_4b
+        if [ -z ${GG_BUILD_CUDA} ]; then
+            test $ret -eq 0 && gg_run open_llama_3b_v2
        else
-            test $ret -eq 0 && gg_run pythia_2_8b
-            #test $ret -eq 0 && gg_run open_llama_7b_v2
+            test $ret -eq 0 && gg_run open_llama_7b_v2
        fi
        test $ret -eq 0 && gg_run ctest_with_model_debug
        test $ret -eq 0 && gg_run ctest_with_model_release
--- a/ggml/cmake/FindSIMD.cmake
+++ b/ggml/cmake/FindSIMD.cmake
@@ -79,22 +79,22 @@ endmacro()
 # flags are for MSVC only!
 check_sse("AVX" " ;/arch:AVX")
 if (NOT ${AVX_FOUND})
-    set(GGML_AVX OFF)
+    set(LLAMA_AVX OFF)
 else()
-    set(GGML_AVX ON)
+    set(LLAMA_AVX ON)
 endif()

 check_sse("AVX2" " ;/arch:AVX2")
 check_sse("FMA" " ;/arch:AVX2")
 if ((NOT ${AVX2_FOUND}) OR (NOT ${FMA_FOUND}))
-    set(GGML_AVX2 OFF)
+    set(LLAMA_AVX2 OFF)
 else()
-    set(GGML_AVX2 ON)
+    set(LLAMA_AVX2 ON)
 endif()

 check_sse("AVX512" " ;/arch:AVX512")
 if (NOT ${AVX512_FOUND})
-    set(GGML_AVX512 OFF)
+    set(LLAMA_AVX512 OFF)
 else()
-    set(GGML_AVX512 ON)
+    set(LLAMA_AVX512 ON)
 endif()
--- a/cmake/arm64-windows-llvm.cmake
+++ b/cmake/arm64-windows-llvm.cmake
@@ -9,7 +9,7 @@ set( CMAKE_CXX_COMPILER  clang++ )
 set( CMAKE_C_COMPILER_TARGET   ${target} )
 set( CMAKE_CXX_COMPILER_TARGET ${target} )

-set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
+set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast" )
 set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" )

 set( CMAKE_C_FLAGS_INIT   "${arch_c_flags} ${warn_c_flags}" )
--- a/cmake/git-vars.cmake
+++ b/cmake/git-vars.cmake
@@ -1,22 +0,0 @@
-find_package(Git)
-
-# the commit's SHA1
-execute_process(COMMAND
-    "${GIT_EXECUTABLE}" describe --match=NeVeRmAtCh --always --abbrev=8
-    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
-    OUTPUT_VARIABLE GIT_SHA1
-    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
-
-# the date of the commit
-execute_process(COMMAND
-    "${GIT_EXECUTABLE}" log -1 --format=%ad --date=local
-    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
-    OUTPUT_VARIABLE GIT_DATE
-    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
-
-# the subject of the commit
-execute_process(COMMAND
-    "${GIT_EXECUTABLE}" log -1 --format=%s
-    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
-    OUTPUT_VARIABLE GIT_COMMIT_SUBJECT
-    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
--- a/cmake/llama.pc.in
+++ b/cmake/llama.pc.in
@@ -1,10 +0,0 @@
-prefix=@CMAKE_INSTALL_PREFIX@
-exec_prefix=${prefix}
-libdir=${exec_prefix}/lib
-includedir=${prefix}/include
-
-Name: llama
-Description: Port of Facebook's LLaMA model in C/C++
-Version: @PROJECT_VERSION@
-Libs: -L${libdir} -lllama
-Cflags: -I${includedir}
--- a/codecov.yml
+++ b/codecov.yml
@@ -0,0 +1,14 @@
+comment: off
+
+coverage:
+  status:
+    project:
+      default:
+        target: auto
+        threshold: 0
+        base: auto
+    patch:
+      default:
+        target: auto
+        threshold: 0
+        base: auto
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -1,6 +1,5 @@
 # common

-find_package(Threads REQUIRED)

 # Build info header
 #
@@ -37,7 +36,7 @@ add_custom_command(
    COMMENT "Generating build details from Git"
    COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
            -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
-            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info-gen-cpp.cmake"
+            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/../scripts/gen-build-info-cpp.cmake"
    WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
    DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
    VERBATIM
@@ -51,23 +50,21 @@ endif()
 set(TARGET common)

 add_library(${TARGET} STATIC
-    arg.cpp
-    arg.h
    base64.hpp
-    common.cpp
    common.h
-    console.cpp
-    console.h
-    json-schema-to-grammar.cpp
-    json.hpp
-    log.cpp
-    log.h
-    ngram-cache.cpp
-    ngram-cache.h
-    sampling.cpp
+    common.cpp
    sampling.h
-    train.cpp
+    sampling.cpp
+    console.h
+    console.cpp
+    grammar-parser.h
+    grammar-parser.cpp
+    json.hpp
+    json-schema-to-grammar.cpp
    train.h
+    train.cpp
+    ngram-cache.h
+    ngram-cache.cpp
    )

 if (BUILD_SHARED_LIBS)
@@ -86,5 +83,5 @@ if (LLAMA_CURL)
 endif ()

 target_include_directories(${TARGET} PUBLIC .)
-target_compile_features   (${TARGET} PUBLIC cxx_std_11)
-target_link_libraries     (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
+target_compile_features(${TARGET} PUBLIC cxx_std_11)
+target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama)
--- a/common/arg.cpp
+++ b/common/arg.cpp
--- a/common/arg.h
+++ b/common/arg.h
@@ -1,77 +0,0 @@
-#pragma once
-
-#include "common.h"
-
-#include <set>
-#include <string>
-#include <vector>
-
-//
-// CLI argument parsing
-//
-
-struct llama_arg {
-    std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
-    std::vector<const char *> args;
-    const char * value_hint   = nullptr; // help text or example for arg value
-    const char * value_hint_2 = nullptr; // for second arg value
-    const char * env          = nullptr;
-    std::string help;
-    bool is_sparam = false; // is current arg a sampling param?
-    void (*handler_void)   (gpt_params & params) = nullptr;
-    void (*handler_string) (gpt_params & params, const std::string &) = nullptr;
-    void (*handler_str_str)(gpt_params & params, const std::string &, const std::string &) = nullptr;
-    void (*handler_int)    (gpt_params & params, int) = nullptr;
-
-    llama_arg(
-        const std::initializer_list<const char *> & args,
-        const char * value_hint,
-        const std::string & help,
-        void (*handler)(gpt_params & params, const std::string &)
-    ) : args(args), value_hint(value_hint), help(help), handler_string(handler) {}
-
-    llama_arg(
-        const std::initializer_list<const char *> & args,
-        const char * value_hint,
-        const std::string & help,
-        void (*handler)(gpt_params & params, int)
-    ) : args(args), value_hint(value_hint), help(help), handler_int(handler) {}
-
-    llama_arg(
-        const std::initializer_list<const char *> & args,
-        const std::string & help,
-        void (*handler)(gpt_params & params)
-    ) : args(args), help(help), handler_void(handler) {}
-
-    // support 2 values for arg
-    llama_arg(
-        const std::initializer_list<const char *> & args,
-        const char * value_hint,
-        const char * value_hint_2,
-        const std::string & help,
-        void (*handler)(gpt_params & params, const std::string &, const std::string &)
-    ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
-
-    llama_arg & set_examples(std::initializer_list<enum llama_example> examples);
-    llama_arg & set_env(const char * env);
-    llama_arg & set_sparam();
-    bool in_example(enum llama_example ex);
-    bool get_value_from_env(std::string & output);
-    bool has_value_from_env();
-    std::string to_string();
-};
-
-struct gpt_params_context {
-    enum llama_example ex = LLAMA_EXAMPLE_COMMON;
-    gpt_params & params;
-    std::vector<llama_arg> options;
-    void(*print_usage)(int, char **) = nullptr;
-    gpt_params_context(gpt_params & params) : params(params) {}
-};
-
-// parse input arguments from CLI
-// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
-bool gpt_params_parse(int argc, char ** argv, gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
-
-// function to be used by test-arg-parser
-gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
--- a/common/common.cpp
+++ b/common/common.cpp
--- a/common/common.h
+++ b/common/common.h
@@ -4,9 +4,18 @@

 #include "llama.h"

+#include "sampling.h"
+
+#define LOG_NO_FILE_LINE_FUNCTION
+#include "log.h"
+
+#include <cmath>
 #include <string>
 #include <vector>
-#include <sstream>
+#include <random>
+#include <thread>
+#include <unordered_map>
+#include <tuple>

 #ifdef _WIN32
 #define DIRECTORY_SEPARATOR '\\'
@@ -18,242 +27,144 @@
 #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)

 #define print_build_info() do {                                                                     \
-    fprintf(stderr, "%s: build = %d (%s)\n",      __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);      \
+    fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);           \
    fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);    \
 } while(0)

 #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"

-struct llama_lora_adapter_info {
-    std::string path;
-    float scale;
-};
-
-struct llama_lora_adapter_container : llama_lora_adapter_info {
-    struct llama_lora_adapter * adapter;
-};
-
 // build info
 extern int LLAMA_BUILD_NUMBER;
-extern char const * LLAMA_COMMIT;
-extern char const * LLAMA_COMPILER;
-extern char const * LLAMA_BUILD_TARGET;
+extern char const *LLAMA_COMMIT;
+extern char const *LLAMA_COMPILER;
+extern char const *LLAMA_BUILD_TARGET;

 struct llama_control_vector_load_info;

-//
-// CPU utils
-//
-
-struct cpu_params {
-    int      n_threads                   = -1;
-    bool     cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
-    bool     mask_valid                  = false;   // Default: any CPU
-    enum ggml_sched_priority  priority   = GGML_SCHED_PRIO_NORMAL;  // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
-    bool     strict_cpu                  = false;   // Use strict CPU placement
-    uint32_t poll                        = 50;      // Polling (busywait) level (0 - no polling, 100 - mostly polling)
-};
-
-int32_t cpu_get_num_physical_cores();
-int32_t cpu_get_num_math();
+int get_math_cpu_count();
+int32_t get_num_physical_cores();

 //
-// Common params
+// CLI argument parsing
 //

-enum llama_example {
-    LLAMA_EXAMPLE_COMMON,
-    LLAMA_EXAMPLE_SPECULATIVE,
-    LLAMA_EXAMPLE_MAIN,
-    LLAMA_EXAMPLE_INFILL,
-    LLAMA_EXAMPLE_EMBEDDING,
-    LLAMA_EXAMPLE_PERPLEXITY,
-    LLAMA_EXAMPLE_RETRIEVAL,
-    LLAMA_EXAMPLE_PASSKEY,
-    LLAMA_EXAMPLE_IMATRIX,
-    LLAMA_EXAMPLE_BENCH,
-    LLAMA_EXAMPLE_SERVER,
-    LLAMA_EXAMPLE_CVECTOR_GENERATOR,
-    LLAMA_EXAMPLE_EXPORT_LORA,
-    LLAMA_EXAMPLE_LLAVA,
-    LLAMA_EXAMPLE_LOOKUP,
-    LLAMA_EXAMPLE_PARALLEL,
-
-    LLAMA_EXAMPLE_COUNT,
-};
-
-enum gpt_sampler_type {
-    GPT_SAMPLER_TYPE_NONE        = 0,
-    GPT_SAMPLER_TYPE_TOP_K       = 1,
-    GPT_SAMPLER_TYPE_TOP_P       = 2,
-    GPT_SAMPLER_TYPE_MIN_P       = 3,
-    GPT_SAMPLER_TYPE_TFS_Z       = 4,
-    GPT_SAMPLER_TYPE_TYPICAL_P   = 5,
-    GPT_SAMPLER_TYPE_TEMPERATURE = 6,
-};
-
-// dimensionality reduction methods, used by cvector-generator
-enum dimre_method {
-    DIMRE_METHOD_PCA,
-    DIMRE_METHOD_MEAN,
-};
-
-// sampler parameters
-struct gpt_sampler_params {
-    uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
-
-    int32_t n_prev            = 64;    // number of previous tokens to remember
-    int32_t n_probs           = 0;     // if greater than 0, output the probabilities of top n_probs tokens.
-    int32_t min_keep          = 0;     // 0 = disabled, otherwise samplers should return at least min_keep tokens
-    int32_t top_k             = 40;    // <= 0 to use vocab size
-    float   top_p             = 0.95f; // 1.0 = disabled
-    float   min_p             = 0.05f; // 0.0 = disabled
-    float   tfs_z             = 1.00f; // 1.0 = disabled
-    float   typ_p             = 1.00f; // typical_p, 1.0 = disabled
-    float   temp              = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
-    float   dynatemp_range    = 0.00f; // 0.0 = disabled
-    float   dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
-    int32_t penalty_last_n    = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    float   penalty_repeat    = 1.00f; // 1.0 = disabled
-    float   penalty_freq      = 0.00f; // 0.0 = disabled
-    float   penalty_present   = 0.00f; // 0.0 = disabled
-    int32_t mirostat          = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
-    float   mirostat_tau      = 5.00f; // target entropy
-    float   mirostat_eta      = 0.10f; // learning rate
-    bool    penalize_nl       = false; // consider newlines as a repeatable token
-    bool    ignore_eos        = false;
-    bool    no_perf           = false; // disable performance metrics
-
-    std::vector<enum gpt_sampler_type> samplers = {
-        GPT_SAMPLER_TYPE_TOP_K,
-        GPT_SAMPLER_TYPE_TFS_Z,
-        GPT_SAMPLER_TYPE_TYPICAL_P,
-        GPT_SAMPLER_TYPE_TOP_P,
-        GPT_SAMPLER_TYPE_MIN_P,
-        GPT_SAMPLER_TYPE_TEMPERATURE
-    };
-
-    std::string grammar; // optional BNF-like grammar to constrain sampling
-
-    std::vector<llama_logit_bias> logit_bias; // logit biases to apply
-
-    // print the parameters into a string
-    std::string print() const;
-};
-
 struct gpt_params {
-    int32_t n_predict             =    -1; // new tokens to predict
-    int32_t n_ctx                 =     0; // context size
-    int32_t n_batch               =  2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
-    int32_t n_ubatch              =   512; // physical batch size for prompt processing (must be >=32 to use BLAS)
-    int32_t n_keep                =     0; // number of tokens to keep from initial prompt
-    int32_t n_draft               =     5; // number of tokens to draft during speculative decoding
-    int32_t n_chunks              =    -1; // max number of chunks to process (-1 = unlimited)
-    int32_t n_parallel            =     1; // number of parallel sequences to decode
-    int32_t n_sequences           =     1; // number of sequences to decode
-    float   p_split               =  0.1f; // speculative decoding split probability
-    int32_t n_gpu_layers          =    -1; // number of layers to store in VRAM (-1 - use default)
-    int32_t n_gpu_layers_draft    =    -1; // number of layers to store in VRAM for the draft model (-1 - use default)
-    int32_t main_gpu              =     0; // the GPU that is used for scratch and small tensors
-    float   tensor_split[128]     =   {0}; // how split tensors should be distributed across GPUs
-    int32_t grp_attn_n            =     1; // group-attention factor
-    int32_t grp_attn_w            =   512; // group-attention width
-    int32_t n_print               =    -1; // print token count every n tokens (-1 = disabled)
-    float   rope_freq_base        =  0.0f; // RoPE base frequency
-    float   rope_freq_scale       =  0.0f; // RoPE frequency scaling factor
-    float   yarn_ext_factor       = -1.0f; // YaRN extrapolation mix factor
-    float   yarn_attn_factor      =  1.0f; // YaRN magnitude scaling factor
-    float   yarn_beta_fast        = 32.0f; // YaRN low correction dim
-    float   yarn_beta_slow        =  1.0f; // YaRN high correction dim
-    int32_t yarn_orig_ctx         =     0; // YaRN original context length
-    float   defrag_thold          = -1.0f; // KV cache defragmentation threshold
+    uint32_t seed                 = LLAMA_DEFAULT_SEED; // RNG seed

-    struct cpu_params cpuparams;
-    struct cpu_params cpuparams_batch;
-    struct cpu_params draft_cpuparams;
-    struct cpu_params draft_cpuparams_batch;
+    int32_t n_threads             = get_math_cpu_count();
+    int32_t n_threads_draft       = -1;
+    int32_t n_threads_batch       = -1;    // number of threads to use for batch processing (-1 = use n_threads)
+    int32_t n_threads_batch_draft = -1;
+    int32_t n_predict             = -1;    // new tokens to predict
+    int32_t n_ctx                 = 512;   // context size
+    int32_t n_batch               = 2048;  // logical batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_ubatch              = 512;   // physical batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_keep                = 0;     // number of tokens to keep from initial prompt
+    int32_t n_draft               = 5;     // number of tokens to draft during speculative decoding
+    int32_t n_chunks              = -1;    // max number of chunks to process (-1 = unlimited)
+    int32_t n_parallel            = 1;     // number of parallel sequences to decode
+    int32_t n_sequences           = 1;     // number of sequences to decode
+    float   p_split               = 0.1f;  // speculative decoding split probability
+    int32_t n_gpu_layers          = -1;    // number of layers to store in VRAM (-1 - use default)
+    int32_t n_gpu_layers_draft    = -1;    // number of layers to store in VRAM for the draft model (-1 - use default)
+    llama_split_mode split_mode   = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
+    int32_t main_gpu              = 0;     // the GPU that is used for scratch and small tensors
+    float   tensor_split[128]     = {0};   // how split tensors should be distributed across GPUs
+    int32_t n_beams               = 0;     // if non-zero then use beam search of given width.
+    int32_t grp_attn_n            = 1;     // group-attention factor
+    int32_t grp_attn_w            = 512;   // group-attention width
+    int32_t n_print               = -1;    // print token count every n tokens (-1 = disabled)
+    float   rope_freq_base        = 0.0f;  // RoPE base frequency
+    float   rope_freq_scale       = 0.0f;  // RoPE frequency scaling factor
+    float   yarn_ext_factor       = -1.0f; // YaRN extrapolation mix factor
+    float   yarn_attn_factor      = 1.0f;  // YaRN magnitude scaling factor
+    float   yarn_beta_fast        = 32.0f; // YaRN low correction dim
+    float   yarn_beta_slow        = 1.0f;  // YaRN high correction dim
+    int32_t yarn_orig_ctx         = 0;     // YaRN original context length
+    float   defrag_thold          = -1.0f; // KV cache defragmentation threshold
+    std::string rpc_servers       = "";    // comma separated list of RPC servers

    ggml_backend_sched_eval_callback cb_eval = nullptr;
    void * cb_eval_user_data                 = nullptr;

    ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;

-    enum llama_split_mode        split_mode        = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
    enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
    enum llama_pooling_type      pooling_type      = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
-    enum llama_attention_type    attention_type    = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings

-    struct gpt_sampler_params sparams;
+    // // sampling parameters
+    struct llama_sampling_params sparams;

-    std::string model                = ""; // model path                                                    // NOLINT
-    std::string model_draft          = ""; // draft model for speculative decoding                          // NOLINT
-    std::string model_alias          = "unknown"; // model alias                                            // NOLINT
-    std::string model_url            = ""; // model url to download                                         // NOLINT
-    std::string hf_token             = ""; // HF token                                                      // NOLINT
-    std::string hf_repo              = ""; // HF repo                                                       // NOLINT
-    std::string hf_file              = ""; // HF file                                                       // NOLINT
-    std::string prompt               = "";                                                                  // NOLINT
-    std::string prompt_file          = ""; // store the external prompt file name                           // NOLINT
-    std::string path_prompt_cache    = ""; // path to file for saving/loading prompt eval state             // NOLINT
-    std::string input_prefix         = ""; // string to prefix user inputs with                             // NOLINT
-    std::string input_suffix         = ""; // string to suffix user inputs with                             // NOLINT
-    std::string logdir               = ""; // directory in which to save YAML log files                     // NOLINT
-    std::string lookup_cache_static  = ""; // path of static ngram cache file for lookup decoding           // NOLINT
-    std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding          // NOLINT
-    std::string logits_file          = ""; // file for saving *all* logits                                  // NOLINT
-    std::string rpc_servers          = ""; // comma separated list of RPC servers                           // NOLINT
+    std::string model                = "";  // model path
+    std::string model_draft          = "";  // draft model for speculative decoding
+    std::string model_alias          = "unknown"; // model alias
+    std::string model_url            = "";  // model url to download
+    std::string hf_repo              = "";  // HF repo
+    std::string hf_file              = "";  // HF file
+    std::string prompt               = "";
+    std::string prompt_file          = "";  // store the external prompt file name
+    std::string path_prompt_cache    = "";  // path to file for saving/loading prompt eval state
+    std::string input_prefix         = "";  // string to prefix user inputs with
+    std::string input_suffix         = "";  // string to suffix user inputs with
+    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
+    std::string logdir               = "";  // directory in which to save YAML log files
+    std::string lookup_cache_static  = ""; // path of static ngram cache file for lookup decoding
+    std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
+    std::string logits_file          = "";  // file for saving *all* logits

-    std::vector<std::string> in_files;   // all input files
-    std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
    std::vector<llama_model_kv_override> kv_overrides;

-    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
-    std::vector<llama_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
+    // TODO: avoid tuple, use struct
+    std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
+    std::string lora_base  = "";                              // base model path for the lora adapter

    std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale

-    int32_t verbosity                  = 0;
    int32_t control_vector_layer_start = -1; // layer range for control vector
    int32_t control_vector_layer_end   = -1; // layer range for control vector

-    int32_t ppl_stride      = 0;     // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
-    int32_t ppl_output_type = 0;     // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
-                                     //                                       (which is more convenient to use for plotting)
-                                     //
-    bool   hellaswag        = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
-    size_t hellaswag_tasks  = 400;   // number of tasks to use when computing the HellaSwag score
+    int  ppl_stride        = 0;     // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
+    int  ppl_output_type   = 0;     // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
+                                    //                                       (which is more convenient to use for plotting)
+                                    //
+    bool   hellaswag       = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
+    size_t hellaswag_tasks = 400;   // number of tasks to use when computing the HellaSwag score

-    bool   winogrande       = false; // compute Winogrande score over random tasks from datafile supplied in prompt
-    size_t winogrande_tasks = 0;     // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
+    bool   winogrande      = false; // compute Winogrande score over random tasks from datafile supplied in prompt
+    size_t winogrande_tasks= 0;     // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed

-    bool   multiple_choice  = false;  // compute TruthfulQA score over random tasks from datafile supplied in prompt
-    size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
+    bool   multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
+    size_t multiple_choice_tasks = 0;     // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed

-    bool   kl_divergence    = false; // compute KL divergence
+    bool   kl_divergence   = false; // compute KL divergence

-    bool usage             = false; // print usage
+    bool random_prompt     = false; // do not randomize prompt if none provided
    bool use_color         = false; // use color to distinguish generations and inputs
-    bool special           = false; // enable special token output
    bool interactive       = false; // interactive mode
-    bool interactive_first = false; // wait for user input immediately
+    bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
    bool conversation      = false; // conversation mode (does not print special tokens and suffix/prefix)
+    bool chatml            = false; // chatml mode (used for models trained on chatml syntax)
    bool prompt_cache_all  = false; // save user input and generations to prompt cache
    bool prompt_cache_ro   = false; // open the prompt cache read-only and do not update it

-    bool escape            = true;  // escape "\n", "\r", "\t", "\'", "\"", and "\\"
+    bool embedding         = false; // get only sentence embedding
+    bool escape            = false; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
+    bool interactive_first = false; // wait for user input immediately
    bool multiline_input   = false; // reverse the usage of `\`
    bool simple_io         = false; // improves compatibility with subprocesses and limited consoles
    bool cont_batching     = true;  // insert new sequences for decoding on-the-fly
    bool flash_attn        = false; // flash attention
-    bool no_perf           = false; // disable performance metrics
-    bool ctx_shift         = true;  // context shift on inifinite text generation

    bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
+    bool ignore_eos        = false; // ignore generated EOS tokens
+    bool instruct          = false; // instruction mode (used for Alpaca models)
    bool logits_all        = false; // return logits for all tokens in the batch
    bool use_mmap          = true;  // use mmap for faster loads
+    bool use_direct_io     = false; // use direct I/O
    bool use_mlock         = false; // use mlock to keep model in memory
    bool verbose_prompt    = false; // print prompt tokens before generation
    bool display_prompt    = true;  // print prompt before generation
+    bool infill            = false; // use infill mode
    bool dump_kv_cache     = false; // dump the KV cache contents for debugging purposes
    bool no_kv_offload     = false; // disable KV offloading
    bool warmup            = true;  // warmup run
@@ -263,160 +174,52 @@ struct gpt_params {
    std::string cache_type_v = "f16"; // KV cache data type for the V

    // multimodal models (see examples/llava)
-    std::string mmproj = "";        // path to multimodal projector                                         // NOLINT
+    std::string mmproj = "";        // path to multimodal projector
    std::vector<std::string> image; // path to image file(s)
-
-    // embedding
-    bool embedding         = false; // get only sentence embedding
-    int32_t embd_normalize = 2;     // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
-    std::string embd_out   = "";    // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
-    std::string embd_sep   = "\n";  // separator of embendings
-
-    // server params
-    int32_t port           = 8080;         // server listens on this network port
-    int32_t timeout_read   = 600;          // http read timeout in seconds
-    int32_t timeout_write  = timeout_read; // http write timeout in seconds
-    int     n_threads_http = -1;           // number of threads to process HTTP requests (TODO: support threadpool)
-
-    std::string hostname      = "127.0.0.1";
-    std::string public_path   = "";                                                                         // NOLINT
-    std::string chat_template = "";                                                                         // NOLINT
-    std::string system_prompt = "";                                                                         // NOLINT
-    bool enable_chat_template = true;
-
-    std::vector<std::string> api_keys;
-
-    std::string ssl_file_key  = "";                                                                         // NOLINT
-    std::string ssl_file_cert = "";                                                                         // NOLINT
-
-    bool endpoint_slots   = true;
-    bool endpoint_metrics = false;
-
-    bool log_json = false;
-
-    std::string slot_save_path;
-
-    float slot_prompt_similarity = 0.5f;
-
-    // batched-bench params
-    bool is_pp_shared = false;
-
-    std::vector<int32_t> n_pp;
-    std::vector<int32_t> n_tg;
-    std::vector<int32_t> n_pl;
-
-    // retrieval params
-    std::vector<std::string> context_files; // context files to embed
-
-    int32_t chunk_size = 64; // chunk size for context embedding
-
-    std::string chunk_separator = "\n"; // chunk separator for context embedding
-
-    // passkey params
-    int32_t n_junk = 250; // number of times to repeat the junk text
-    int32_t i_pos  = -1;  // position of the passkey in the junk text
-
-    // imatrix params
-    std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file
-
-    int32_t n_out_freq  = 10; // output the imatrix every n_out_freq iterations
-    int32_t n_save_freq =  0; // save the imatrix every n_save_freq iterations
-    int32_t i_chunk     =  0; // start processing from this chunk
-
-    bool process_output = false; // collect data for the output tensor
-    bool compute_ppl    = true;  // whether to compute perplexity
-
-    // cvector-generator params
-    int n_pca_batch = 100;
-    int n_pca_iterations = 1000;
-    dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
-    std::string cvector_outfile       = "control_vector.gguf";
-    std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
-    std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
-
-    bool spm_infill = false; // suffix/prefix/middle pattern for infill
-
-    std::string lora_outfile = "ggml-lora-merged-f16.gguf";
-
-    // batched-bench params
-    bool batched_bench_output_jsonl = false;
 };

-// call once at the start of a program if it uses libcommon
-// initializes the logging system and prints info about the build
-void gpt_init();
+void gpt_params_handle_model_default(gpt_params & params);

-std::string gpt_params_get_system_info(const gpt_params & params);
+bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);

-bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);
-bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
-void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr);
-bool set_process_priority(enum ggml_sched_priority prio);
+bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
+
+bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
+
+void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
+
+bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
+
+std::string get_system_info(const gpt_params & params);
+
+std::string gpt_random_prompt(std::mt19937 & rng);
+
+void process_escapes(std::string& input);
+
+bool validate_file_name(const std::string & filename);

 //
 // String utils
 //

+std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
+std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string);
 std::vector<std::string> string_split(std::string input, char separator);
-
 std::string string_strip(const std::string & str);
-std::string string_get_sortable_timestamp();
-
-void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
-
-template<class T>
-static std::vector<T> string_split(const std::string & str, char delim) {
-    std::vector<T> values;
-    std::istringstream str_stream(str);
-    std::string token;
-    while (std::getline(str_stream, token, delim)) {
-        T value;
-        std::istringstream token_stream(token);
-        token_stream >> value;
-        values.push_back(value);
-    }
-    return values;
-}
-
-bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
-void string_process_escapes(std::string & input);
-
-std::string string_from(bool value);
-std::string string_from(const std::vector<int> & values);
-std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
-std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
-
-//
-// Filesystem utils
-//
-
-bool fs_validate_filename(const std::string & filename);
-bool fs_create_directory_with_parents(const std::string & path);
-
-std::string fs_get_cache_directory();
-std::string fs_get_cache_file(const std::string & filename);
+std::string sampler_type_to_name_string(llama_sampler_type sampler_type);

 //
 // Model utils
 //

-struct llama_init_result {
-    struct llama_model   * model   = nullptr;
-    struct llama_context * context = nullptr;
-    std::vector<llama_lora_adapter_container> lora_adapters;
-};
+// TODO: avoid tuplue, use struct
+std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);

-struct llama_init_result    llama_init_from_gpt_params(gpt_params & params);
+struct llama_model_params   llama_model_params_from_gpt_params  (const gpt_params & params);
+struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);

-struct llama_model_params     llama_model_params_from_gpt_params    (const gpt_params & params);
-struct llama_context_params   llama_context_params_from_gpt_params  (const gpt_params & params);
-struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
-
-struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
-struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
-
-// clear LoRA adapters from context, then apply new list of adapters
-void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters);
+struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const struct llama_model_params & params);
+struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const struct llama_model_params & params);

 // Batch utils

@@ -454,61 +257,55 @@ std::string llama_token_to_piece(
                       llama_token   token,
                       bool          special = true);

+// TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
+//       that takes into account the tokenizer type and decides how to handle the leading space
+//
 // detokenizes a vector of tokens into a string
 // should work similar to Python's `tokenizer.decode`
-// optionally renders special/control tokens
-std::string llama_detokenize(
+// removes the leading space from the first non-BOS token
+std::string llama_detokenize_spm(
                         llama_context * ctx,
-        const std::vector<llama_token> & tokens,
-                                  bool   special = true);
+        const std::vector<llama_token> & tokens);
+
+// detokenizes a vector of tokens into a string
+// should work similar to Python's `tokenizer.decode`
+std::string llama_detokenize_bpe(
+                         llama_context * ctx,
+        const std::vector<llama_token> & tokens);
+
+// Uses the value from the model metadata if possible, otherwise
+// defaults to true when model type is SPM, otherwise false.
+bool llama_should_add_bos_token(const llama_model * model);

 //
-// Chat template utils
+// YAML utils
 //

-// same with llama_chat_message, but uses std::string
-struct llama_chat_msg {
-    std::string role;
-    std::string content;
-};
+bool create_directory_with_parents(const std::string & path);
+void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector<float> & data);
+void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector<int> & data);
+void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data);
+std::string get_sortable_timestamp();

-// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
-bool llama_chat_verify_template(const std::string & tmpl);
-
-// CPP wrapper for llama_chat_apply_template
-// If the built-in template is not supported, we default to chatml
-// If the custom "tmpl" is not supported, we throw an error
-std::string llama_chat_apply_template(const struct llama_model * model,
-        const std::string & tmpl,
-        const std::vector<llama_chat_msg> & chat,
-        bool add_ass);
-
-// Format single message, while taking into account the position of that message in chat history
-std::string llama_chat_format_single(const struct llama_model * model,
-        const std::string & tmpl,
-        const std::vector<llama_chat_msg> & past_msg,
-        const llama_chat_msg & new_msg,
-        bool add_ass);
-
-// Returns an example of formatted chat
-std::string llama_chat_format_example(const struct llama_model * model,
-        const std::string & tmpl);
+void dump_non_result_info_yaml(
+    FILE * stream, const gpt_params & params, const llama_context * lctx,
+    const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);

 //
 // KV cache utils
 //

 // Dump the KV cache view with the number of sequences per cell.
-void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
+void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);

 // Dump the KV cache view showing individual sequences in each cell (long output).
-void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
+void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);

 //
 // Embedding utils
 //

-void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
+void llama_embd_normalize(const float * inp, float * out, int n);

 float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);

@@ -536,19 +333,6 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
 //
 // Split utils
 //
-
 static const char * const LLM_KV_SPLIT_NO            = "split.no";
 static const char * const LLM_KV_SPLIT_COUNT         = "split.count";
 static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
-
-//
-// YAML utils
-//
-
-void yaml_dump_vector_float    (FILE * stream, const char * prop_name, const std::vector<float> & data);
-void yaml_dump_vector_int      (FILE * stream, const char * prop_name, const std::vector<int> & data);
-void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);
-
-void yaml_dump_non_result_info(
-    FILE * stream, const gpt_params & params, const llama_context * lctx,
-    const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
--- a/common/grammar-parser.cpp
+++ b/common/grammar-parser.cpp
@@ -0,0 +1,449 @@
+#include "grammar-parser.h"
+#include <cstdint>
+#include <cwchar>
+#include <string>
+#include <utility>
+#include <stdexcept>
+#include <exception>
+
+namespace grammar_parser {
+    // NOTE: assumes valid utf8 (but checks for overrun)
+    // copied from llama.cpp
+    static std::pair<uint32_t, const char *> decode_utf8(const char * src) {
+        static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
+        uint8_t  first_byte = static_cast<uint8_t>(*src);
+        uint8_t  highbits   = first_byte >> 4;
+        int      len        = lookup[highbits];
+        uint8_t  mask       = (1 << (8 - len)) - 1;
+        uint32_t value      = first_byte & mask;
+        const char * end    = src + len; // may overrun!
+        const char * pos    = src + 1;
+        for ( ; pos < end && *pos; pos++) {
+            value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
+        }
+        return std::make_pair(value, pos);
+    }
+
+    static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
+        uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
+        auto result = state.symbol_ids.emplace(std::string(src, len), next_id);
+        return result.first->second;
+    }
+
+    static uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) {
+        uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
+        state.symbol_ids[base_name + '_' + std::to_string(next_id)] = next_id;
+        return next_id;
+    }
+
+    static void add_rule(
+            parse_state & state,
+            uint32_t      rule_id,
+            const std::vector<llama_grammar_element> & rule) {
+        if (state.rules.size() <= rule_id) {
+            state.rules.resize(rule_id + 1);
+        }
+        state.rules[rule_id] = rule;
+    }
+
+    static bool is_word_char(char c) {
+        return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || ('0' <= c && c <= '9');
+    }
+
+    static std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
+        const char * pos   = src;
+        const char * end   = src + size;
+        uint32_t     value = 0;
+        for ( ; pos < end && *pos; pos++) {
+            value <<= 4;
+            char c = *pos;
+            if ('a' <= c && c <= 'f') {
+                value += c - 'a' + 10;
+            } else if ('A' <= c && c <= 'F') {
+                value += c - 'A' + 10;
+            } else if ('0' <= c && c <= '9') {
+                value += c - '0';
+            } else {
+                break;
+            }
+        }
+        if (pos != end) {
+            throw std::runtime_error("expecting " + std::to_string(size) + " hex chars at " + src);
+        }
+        return std::make_pair(value, pos);
+    }
+
+    static const char * parse_space(const char * src, bool newline_ok) {
+        const char * pos = src;
+        while (*pos == ' ' || *pos == '\t' || *pos == '#' ||
+                (newline_ok && (*pos == '\r' || *pos == '\n'))) {
+            if (*pos == '#') {
+                while (*pos && *pos != '\r' && *pos != '\n') {
+                    pos++;
+                }
+            } else {
+                pos++;
+            }
+        }
+        return pos;
+    }
+
+    static const char * parse_name(const char * src) {
+        const char * pos = src;
+        while (is_word_char(*pos)) {
+            pos++;
+        }
+        if (pos == src) {
+            throw std::runtime_error(std::string("expecting name at ") + src);
+        }
+        return pos;
+    }
+
+    static std::pair<uint32_t, const char *> parse_char(const char * src) {
+        if (*src == '\\') {
+            switch (src[1]) {
+                case 'x': return parse_hex(src + 2, 2);
+                case 'u': return parse_hex(src + 2, 4);
+                case 'U': return parse_hex(src + 2, 8);
+                case 't': return std::make_pair('\t', src + 2);
+                case 'r': return std::make_pair('\r', src + 2);
+                case 'n': return std::make_pair('\n', src + 2);
+                case '\\':
+                case '"':
+                case '[':
+                case ']':
+                    return std::make_pair(src[1], src + 2);
+                default:
+                    throw std::runtime_error(std::string("unknown escape at ") + src);
+            }
+        } else if (*src) {
+            return decode_utf8(src);
+        }
+        throw std::runtime_error("unexpected end of input");
+    }
+
+    const char * parse_alternates(
+            parse_state       & state,
+            const char        * src,
+            const std::string & rule_name,
+            uint32_t            rule_id,
+            bool                is_nested);
+
+    static const char * parse_sequence(
+            parse_state                        & state,
+            const char                         * src,
+            const std::string                  & rule_name,
+            std::vector<llama_grammar_element> & out_elements,
+            bool                                 is_nested) {
+        size_t last_sym_start = out_elements.size();
+        const char * pos = src;
+        while (*pos) {
+            if (*pos == '"') { // literal string
+                pos++;
+                last_sym_start = out_elements.size();
+                while (*pos != '"') {
+                    if (!*pos) {
+                        throw std::runtime_error("unexpected end of input");
+                    }
+                    auto char_pair = parse_char(pos);
+                         pos       = char_pair.second;
+                    out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
+                }
+                pos = parse_space(pos + 1, is_nested);
+            } else if (*pos == '[') { // char range(s)
+                pos++;
+                enum llama_gretype start_type = LLAMA_GRETYPE_CHAR;
+                if (*pos == '^') {
+                    pos++;
+                    start_type = LLAMA_GRETYPE_CHAR_NOT;
+                }
+                last_sym_start = out_elements.size();
+                while (*pos != ']') {
+                    if (!*pos) {
+                        throw std::runtime_error("unexpected end of input");
+                    }
+                    auto char_pair = parse_char(pos);
+                         pos       = char_pair.second;
+                    enum llama_gretype type = last_sym_start < out_elements.size()
+                        ? LLAMA_GRETYPE_CHAR_ALT
+                        : start_type;
+
+                    out_elements.push_back({type, char_pair.first});
+                    if (pos[0] == '-' && pos[1] != ']') {
+                        if (!pos[1]) {
+                            throw std::runtime_error("unexpected end of input");
+                        }
+                        auto endchar_pair = parse_char(pos + 1);
+                             pos          = endchar_pair.second;
+                        out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
+                    }
+                }
+                pos = parse_space(pos + 1, is_nested);
+            } else if (is_word_char(*pos)) { // rule reference
+                const char * name_end    = parse_name(pos);
+                uint32_t     ref_rule_id = get_symbol_id(state, pos, name_end - pos);
+                pos = parse_space(name_end, is_nested);
+                last_sym_start = out_elements.size();
+                out_elements.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id});
+            } else if (*pos == '(') { // grouping
+                // parse nested alternates into synthesized rule
+                pos = parse_space(pos + 1, true);
+                uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
+                pos = parse_alternates(state, pos, rule_name, sub_rule_id, true);
+                last_sym_start = out_elements.size();
+                // output reference to synthesized rule
+                out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
+                if (*pos != ')') {
+                    throw std::runtime_error(std::string("expecting ')' at ") + pos);
+                }
+                pos = parse_space(pos + 1, is_nested);
+            } else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator
+                if (last_sym_start == out_elements.size()) {
+                    throw std::runtime_error(std::string("expecting preceding item to */+/? at ") + pos);
+                }
+
+                // apply transformation to previous symbol (last_sym_start to end) according to
+                // rewrite rules:
+                // S* --> S' ::= S S' |
+                // S+ --> S' ::= S S' | S
+                // S? --> S' ::= S |
+                uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
+                std::vector<llama_grammar_element> sub_rule;
+                // add preceding symbol to generated rule
+                sub_rule.insert(
+                    sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
+                if (*pos == '*' || *pos == '+') {
+                    // cause generated rule to recurse
+                    sub_rule.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
+                }
+                // mark start of alternate def
+                sub_rule.push_back({LLAMA_GRETYPE_ALT, 0});
+                if (*pos == '+') {
+                    // add preceding symbol as alternate only for '+' (otherwise empty)
+                    sub_rule.insert(
+                        sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
+                }
+                sub_rule.push_back({LLAMA_GRETYPE_END, 0});
+                add_rule(state, sub_rule_id, sub_rule);
+
+                // in original rule, replace previous symbol with reference to generated rule
+                out_elements.resize(last_sym_start);
+                out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
+
+                pos = parse_space(pos + 1, is_nested);
+            } else {
+                break;
+            }
+        }
+        return pos;
+    }
+
+    const char * parse_alternates(
+            parse_state       & state,
+            const char        * src,
+            const std::string & rule_name,
+            uint32_t            rule_id,
+            bool                is_nested) {
+        std::vector<llama_grammar_element> rule;
+        const char * pos = parse_sequence(state, src, rule_name, rule, is_nested);
+        while (*pos == '|') {
+            rule.push_back({LLAMA_GRETYPE_ALT, 0});
+            pos = parse_space(pos + 1, true);
+            pos = parse_sequence(state, pos, rule_name, rule, is_nested);
+        }
+        rule.push_back({LLAMA_GRETYPE_END, 0});
+        add_rule(state, rule_id, rule);
+        return pos;
+    }
+
+    static const char * parse_rule(parse_state & state, const char * src) {
+        const char * name_end = parse_name(src);
+        const char * pos      = parse_space(name_end, false);
+        size_t       name_len = name_end - src;
+        uint32_t     rule_id  = get_symbol_id(state, src, name_len);
+        const std::string name(src, name_len);
+
+        if (!(pos[0] == ':' && pos[1] == ':' && pos[2] == '=')) {
+            throw std::runtime_error(std::string("expecting ::= at ") + pos);
+        }
+        pos = parse_space(pos + 3, true);
+
+        pos = parse_alternates(state, pos, name, rule_id, false);
+
+        if (*pos == '\r') {
+            pos += pos[1] == '\n' ? 2 : 1;
+        } else if (*pos == '\n') {
+            pos++;
+        } else if (*pos) {
+            throw std::runtime_error(std::string("expecting newline or end at ") + pos);
+        }
+        return parse_space(pos, true);
+    }
+
+    parse_state parse(const char * src) {
+        try {
+            parse_state state;
+            const char * pos = parse_space(src, true);
+            while (*pos) {
+                pos = parse_rule(state, pos);
+            }
+            // Validate the state to ensure that all rules are defined
+            for (const auto & rule : state.rules) {
+                for (const auto & elem : rule) {
+                    if (elem.type == LLAMA_GRETYPE_RULE_REF) {
+                        // Ensure that the rule at that location exists
+                        if (elem.value >= state.rules.size() || state.rules[elem.value].empty()) {
+                            // Get the name of the rule that is missing
+                            for (const auto & kv : state.symbol_ids) {
+                                if (kv.second == elem.value) {
+                                    throw std::runtime_error("Undefined rule identifier '" + kv.first + "'");
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+            return state;
+        } catch (const std::exception & err) {
+            fprintf(stderr, "%s: error parsing grammar: %s\n", __func__, err.what());
+            return parse_state();
+        }
+    }
+
+    static void print_grammar_char(FILE * file, uint32_t c) {
+        if (0x20 <= c && c <= 0x7f) {
+            fprintf(file, "%c", static_cast<char>(c));
+        } else {
+            // cop out of encoding UTF-8
+            fprintf(file, "<U+%04X>", c);
+        }
+    }
+
+    static bool is_char_element(llama_grammar_element elem) {
+        switch (elem.type) {
+            case LLAMA_GRETYPE_CHAR:           return true;
+            case LLAMA_GRETYPE_CHAR_NOT:       return true;
+            case LLAMA_GRETYPE_CHAR_ALT:       return true;
+            case LLAMA_GRETYPE_CHAR_RNG_UPPER: return true;
+            default:                           return false;
+        }
+    }
+
+    static void print_rule_binary(FILE * file, const std::vector<llama_grammar_element> & rule) {
+        for (auto elem : rule) {
+            switch (elem.type) {
+                case LLAMA_GRETYPE_END:            fprintf(file, "END");            break;
+                case LLAMA_GRETYPE_ALT:            fprintf(file, "ALT");            break;
+                case LLAMA_GRETYPE_RULE_REF:       fprintf(file, "RULE_REF");       break;
+                case LLAMA_GRETYPE_CHAR:           fprintf(file, "CHAR");           break;
+                case LLAMA_GRETYPE_CHAR_NOT:       fprintf(file, "CHAR_NOT");       break;
+                case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
+                case LLAMA_GRETYPE_CHAR_ALT:       fprintf(file, "CHAR_ALT");       break;
+            }
+            switch (elem.type) {
+                case LLAMA_GRETYPE_END:
+                case LLAMA_GRETYPE_ALT:
+                case LLAMA_GRETYPE_RULE_REF:
+                    fprintf(file, "(%u) ", elem.value);
+                    break;
+                case LLAMA_GRETYPE_CHAR:
+                case LLAMA_GRETYPE_CHAR_NOT:
+                case LLAMA_GRETYPE_CHAR_RNG_UPPER:
+                case LLAMA_GRETYPE_CHAR_ALT:
+                    fprintf(file, "(\"");
+                    print_grammar_char(file, elem.value);
+                    fprintf(file, "\") ");
+                    break;
+            }
+        }
+        fprintf(file, "\n");
+    }
+
+    static void print_rule(
+            FILE     * file,
+            uint32_t   rule_id,
+            const std::vector<llama_grammar_element> & rule,
+            const std::map<uint32_t, std::string>    & symbol_id_names) {
+        if (rule.empty() || rule.back().type != LLAMA_GRETYPE_END) {
+            throw std::runtime_error(
+                "malformed rule, does not end with LLAMA_GRETYPE_END: " + std::to_string(rule_id));
+        }
+        fprintf(file, "%s ::= ", symbol_id_names.at(rule_id).c_str());
+        for (size_t i = 0, end = rule.size() - 1; i < end; i++) {
+            llama_grammar_element elem = rule[i];
+            switch (elem.type) {
+                case LLAMA_GRETYPE_END:
+                    throw std::runtime_error(
+                        "unexpected end of rule: " + std::to_string(rule_id) + "," +
+                        std::to_string(i));
+                case LLAMA_GRETYPE_ALT:
+                    fprintf(file, "| ");
+                    break;
+                case LLAMA_GRETYPE_RULE_REF:
+                    fprintf(file, "%s ", symbol_id_names.at(elem.value).c_str());
+                    break;
+                case LLAMA_GRETYPE_CHAR:
+                    fprintf(file, "[");
+                    print_grammar_char(file, elem.value);
+                    break;
+                case LLAMA_GRETYPE_CHAR_NOT:
+                    fprintf(file, "[^");
+                    print_grammar_char(file, elem.value);
+                    break;
+                case LLAMA_GRETYPE_CHAR_RNG_UPPER:
+                    if (i == 0 || !is_char_element(rule[i - 1])) {
+                        throw std::runtime_error(
+                            "LLAMA_GRETYPE_CHAR_RNG_UPPER without preceding char: " +
+                            std::to_string(rule_id) + "," + std::to_string(i));
+                    }
+                    fprintf(file, "-");
+                    print_grammar_char(file, elem.value);
+                    break;
+                case LLAMA_GRETYPE_CHAR_ALT:
+                    if (i == 0 || !is_char_element(rule[i - 1])) {
+                        throw std::runtime_error(
+                            "LLAMA_GRETYPE_CHAR_ALT without preceding char: " +
+                            std::to_string(rule_id) + "," + std::to_string(i));
+                    }
+                    print_grammar_char(file, elem.value);
+                    break;
+            }
+            if (is_char_element(elem)) {
+                switch (rule[i + 1].type) {
+                    case LLAMA_GRETYPE_CHAR_ALT:
+                    case LLAMA_GRETYPE_CHAR_RNG_UPPER:
+                        break;
+                    default:
+                        fprintf(file, "] ");
+                }
+            }
+        }
+        fprintf(file, "\n");
+    }
+
+    void print_grammar(FILE * file, const parse_state & state) {
+        try {
+            std::map<uint32_t, std::string> symbol_id_names;
+            for (const auto & kv : state.symbol_ids) {
+                symbol_id_names[kv.second] = kv.first;
+            }
+            for (size_t i = 0, end = state.rules.size(); i < end; i++) {
+                // fprintf(file, "%zu: ", i);
+                // print_rule_binary(file, state.rules[i]);
+                print_rule(file, uint32_t(i), state.rules[i], symbol_id_names);
+                // fprintf(file, "\n");
+            }
+        } catch (const std::exception & err) {
+            fprintf(stderr, "\n%s: error printing grammar: %s\n", __func__, err.what());
+        }
+    }
+
+    std::vector<const llama_grammar_element *> parse_state::c_rules() {
+        std::vector<const llama_grammar_element *> ret;
+        ret.reserve(rules.size());
+        for (const auto & rule : rules) {
+            ret.push_back(rule.data());
+        }
+        return ret;
+    }
+}
--- a/common/grammar-parser.h
+++ b/common/grammar-parser.h
@@ -0,0 +1,29 @@
+// Implements a parser for an extended Backus-Naur form (BNF), producing the
+// binary context-free grammar format specified by llama.h. Supports character
+// ranges, grouping, and repetition operators. As an example, a grammar for
+// arithmetic might look like:
+//
+// root  ::= expr
+// expr  ::= term ([-+*/] term)*
+// term  ::= num | "(" space expr ")" space
+// num   ::= [0-9]+ space
+// space ::= [ \t\n]*
+
+#pragma once
+#include "llama.h"
+#include <vector>
+#include <map>
+#include <cstdint>
+#include <string>
+
+namespace grammar_parser {
+    struct parse_state {
+        std::map<std::string, uint32_t>                 symbol_ids;
+        std::vector<std::vector<llama_grammar_element>> rules;
+
+        std::vector<const llama_grammar_element *> c_rules();
+    };
+
+    parse_state parse(const char * src);
+    void print_grammar(FILE * file, const parse_state & state);
+}
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -16,282 +16,92 @@ static std::string join(Iterator begin, Iterator end, const std::string & separa

 static std::string repeat(const std::string & str, size_t n);

-static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") {
-    auto has_max = max_items != std::numeric_limits<int>::max();
-
-    if (min_items == 0 && max_items == 1) {
-        return item_rule + "?";
-    }
-
+static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "", bool item_rule_is_literal = false) {
    if (separator_rule.empty()) {
-        if (min_items == 1 && !has_max) {
+        if (min_items == 0 && max_items == 1) {
+            return item_rule + "?";
+        } else if (min_items == 1 && max_items == std::numeric_limits<int>::max()) {
            return item_rule + "+";
-        } else if (min_items == 0 && !has_max) {
-            return item_rule + "*";
-        } else {
-            return item_rule + "{" + std::to_string(min_items) + "," + (has_max ? std::to_string(max_items) : "") + "}";
        }
    }

-    auto result = item_rule + " " + build_repetition("(" + separator_rule + " " + item_rule + ")", min_items == 0 ? 0 : min_items - 1, has_max ? max_items - 1 : max_items);
-    if (min_items == 0) {
-        result = "(" + result + ")?";
+    std::string result;
+    if (min_items > 0) {
+        if (item_rule_is_literal && separator_rule.empty()) {
+            result = "\"" + repeat(std::string(item_rule.begin() + 1, item_rule.end() - 1), min_items) + "\"";
+        } else {
+            std::vector<std::string> items(min_items, item_rule);
+            result = join(items.begin(), items.end(), separator_rule.empty() ? " " : " " + separator_rule + " ");
+        }
    }
+
+    std::function<std::string(int, bool)> opt_repetitions = [&](int up_to_n, bool prefix_with_sep) -> std::string {
+        auto content = prefix_with_sep && !separator_rule.empty() ? separator_rule + " " + item_rule : item_rule;
+
+        if (up_to_n == 0) {
+            return "";
+        } else if (up_to_n == 1) {
+            return "(" + content + ")?";
+        } else if (!separator_rule.empty() && !prefix_with_sep) {
+            return "(" + content + " " + opt_repetitions(up_to_n - 1, true) + ")?";
+        } else {
+            std::string res = repeat("(" + content + " ", up_to_n);
+            // strip trailing space
+            res = res.substr(0, res.length() - 1);
+            res += repeat(")?", up_to_n);
+            return res;
+        }
+    };
+
+    if (min_items > 0 && max_items != min_items) {
+        result += " ";
+    }
+
+    if (max_items != std::numeric_limits<int>::max()) {
+        result += opt_repetitions(max_items - min_items, min_items > 0);
+    } else {
+        std::string item_operator = "(" + (separator_rule.empty() ? "" : separator_rule + " ") + item_rule + ")";
+        if (min_items == 0 && !separator_rule.empty()) {
+            result = "(" + item_rule + " " + item_operator + "*)?";
+        } else {
+            result += item_operator + "*";
+        }
+    }
+
    return result;
 }

-/* Minimalistic replacement for std::string_view, which is only available from C++17 onwards */
-class string_view {
-    const std::string & _str;
-    const size_t _start;
-    const size_t _end;
-public:
-    string_view(const std::string & str, size_t start = 0, size_t end  = std::string::npos) : _str(str), _start(start), _end(end == std::string::npos ? str.length() : end) {}
-
-    size_t size() const {
-        return _end - _start;
-    }
-
-    size_t length() const {
-        return size();
-    }
-
-    operator std::string() const {
-        return str();
-    }
-
-    std::string str() const {
-        return _str.substr(_start, _end - _start);
-    }
-
-    string_view substr(size_t pos, size_t len = std::string::npos) const {
-        return string_view(_str, _start + pos, len == std::string::npos ? _end : _start + pos + len);
-    }
-
-    char operator[](size_t pos) const {
-        auto index = _start + pos;
-        if (index >= _end) {
-            throw std::out_of_range("string_view index out of range");
-        }
-        return _str[_start + pos];
-    }
-
-    bool operator==(const string_view & other) const {
-        std::string this_str = *this;
-        std::string other_str = other;
-        return this_str == other_str;
-    }
-};
-
-static void _build_min_max_int(int min_value, int max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
-    auto has_min = min_value != std::numeric_limits<int>::min();
-    auto has_max = max_value != std::numeric_limits<int>::max();
-
-    auto digit_range = [&](char from, char to) {
-        out << "[";
-        if (from == to) {
-            out << from;
-        } else {
-            out << from << "-" << to;
-        }
-        out << "]";
-    };
-    auto more_digits = [&](int min_digits, int max_digits) {
-        out << "[0-9]";
-        if (min_digits == max_digits && min_digits == 1) {
-            return;
-        }
-        out << "{";
-        out << min_digits;
-        if (max_digits != min_digits) {
-            out << ",";
-            if (max_digits != std::numeric_limits<int>::max()) {
-                out << max_digits;
-            }
-        }
-        out << "}";
-    };
-    std::function<void(const string_view &, const string_view &)> uniform_range =
-        [&](const string_view & from, const string_view & to) {
-            size_t i = 0;
-            while (i < from.length() && i < to.length() && from[i] == to[i]) {
-                i++;
-            }
-            if (i > 0) {
-                out << "\"" << from.substr(0, i).str() << "\"";
-            }
-            if (i < from.length() && i < to.length()) {
-                if (i > 0) {
-                    out << " ";
-                }
-                auto sub_len = from.length() - i - 1;
-                if (sub_len > 0) {
-                    auto from_sub = from.substr(i + 1);
-                    auto to_sub = to.substr(i + 1);
-                    auto sub_zeros = repeat("0", sub_len);
-                    auto sub_nines = repeat("9", sub_len);
-
-                    auto to_reached = false;
-                    out << "(";
-                    if (from_sub == sub_zeros) {
-                        digit_range(from[i], to[i] - 1);
-                        out << " ";
-                        more_digits(sub_len, sub_len);
-                    } else {
-                        out << "[" << from[i] << "] ";
-                        out << "(";
-                        uniform_range(from_sub, sub_nines);
-                        out << ")";
-                        if (from[i] < to[i] - 1) {
-                            out << " | ";
-                            if (to_sub == sub_nines) {
-                                digit_range(from[i] + 1, to[i]);
-                                to_reached = true;
-                            } else {
-                                digit_range(from[i] + 1, to[i] - 1);
-                            }
-                            out << " ";
-                            more_digits(sub_len, sub_len);
-                        }
-                    }
-                    if (!to_reached) {
-                        out << " | ";
-                        digit_range(to[i], to[i]);
-                        out << " ";
-                        uniform_range(sub_zeros, to_sub);
-                    }
-                    out << ")";
-                } else {
-                    out << "[" << from[i] << "-" << to[i] << "]";
-                }
-            }
-        };
-
-    if (has_min && has_max) {
-        if (min_value < 0 && max_value < 0) {
-            out << "\"-\" (";
-            _build_min_max_int(-max_value, -min_value, out, decimals_left, /* top_level= */ true);
-            out << ")";
-            return;
-        }
-
-        if (min_value < 0) {
-            out << "\"-\" (";
-            _build_min_max_int(0, -min_value, out, decimals_left, /* top_level= */ true);
-            out << ") | ";
-            min_value = 0;
-        }
-
-        auto min_s = std::to_string(min_value);
-        auto max_s = std::to_string(max_value);
-        auto min_digits = min_s.length();
-        auto max_digits = max_s.length();
-
-        for (auto digits = min_digits; digits < max_digits; digits++) {
-            uniform_range(min_s, repeat("9", digits));
-            min_s = "1" + repeat("0", digits);
-            out << " | ";
-        }
-        uniform_range(min_s, max_s);
-        return;
-    }
-
-    auto less_decimals = std::max(decimals_left - 1, 1);
-
-    if (has_min) {
-        if (min_value < 0) {
-            out << "\"-\" (";
-            _build_min_max_int(std::numeric_limits<int>::min(), -min_value, out, decimals_left, /* top_level= */ false);
-            out << ") | [0] | [1-9] ";
-            more_digits(0, decimals_left - 1);
-        } else if (min_value == 0) {
-            if (top_level) {
-                out << "[0] | [1-9] ";
-                more_digits(0, less_decimals);
-            } else {
-                more_digits(1, decimals_left);
-            }
-        } else if (min_value <= 9) {
-            char c = '0' + min_value;
-            auto range_start = top_level ? '1' : '0';
-            if (c > range_start) {
-                digit_range(range_start, c - 1);
-                out << " ";
-                more_digits(1, less_decimals);
-                out << " | ";
-            }
-            digit_range(c, '9');
-            out << " ";
-            more_digits(0, less_decimals);
-        } else {
-            auto min_s = std::to_string(min_value);
-            auto len = min_s.length();
-            auto c = min_s[0];
-
-            if (c > '1') {
-                digit_range(top_level ? '1' : '0', c - 1);
-                out << " ";
-                more_digits(len, less_decimals);
-                out << " | ";
-            }
-            digit_range(c, c);
-            out << " (";
-            _build_min_max_int(std::stoi(min_s.substr(1)), std::numeric_limits<int>::max(), out, less_decimals, /* top_level= */ false);
-            out << ")";
-            if (c < '9') {
-                out << " | ";
-                digit_range(c + 1, '9');
-                out << " ";
-                more_digits(len - 1, less_decimals);
-            }
-        }
-        return;
-    }
-
-    if (has_max) {
-        if (max_value >= 0) {
-            if (top_level) {
-                out << "\"-\" [1-9] ";
-                more_digits(0, less_decimals);
-                out << " | ";
-            }
-            _build_min_max_int(0, max_value, out, decimals_left, /* top_level= */ true);
-        } else {
-            out << "\"-\" (";
-            _build_min_max_int(-max_value, std::numeric_limits<int>::max(), out, decimals_left, /* top_level= */ false);
-            out << ")";
-        }
-        return;
-    }
-
-    throw std::runtime_error("At least one of min_value or max_value must be set");
-}
-
-const std::string SPACE_RULE = "| \" \" | \"\\n\" [ \\t]{0,20}";
+const std::string SPACE_RULE = "\" \"?";

 struct BuiltinRule {
    std::string content;
    std::vector<std::string> deps;
 };

+const std::string _up_to_15_digits = build_repetition("[0-9]", 0, 15);
+
 std::unordered_map<std::string, BuiltinRule> PRIMITIVE_RULES = {
    {"boolean", {"(\"true\" | \"false\") space", {}}},
-    {"decimal-part", {"[0-9]{1,16}", {}}},
-    {"integral-part", {"[0] | [1-9] [0-9]{0,15}", {}}},
+    {"decimal-part", {"[0-9] " + _up_to_15_digits, {}}},
+    {"integral-part", {"[0-9] | [1-9] " + _up_to_15_digits, {}}},
    {"number", {"(\"-\"? integral-part) (\".\" decimal-part)? ([eE] [-+]? integral-part)? space", {"integral-part", "decimal-part"}}},
    {"integer", {"(\"-\"? integral-part) space", {"integral-part"}}},
    {"value", {"object | array | string | number | boolean | null", {"object", "array", "string", "number", "boolean", "null"}}},
    {"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space", {"string", "value"}}},
    {"array", {"\"[\" space ( value (\",\" space value)* )? \"]\" space", {"value"}}},
-    {"uuid", {"\"\\\"\" [0-9a-fA-F]{8} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{12} \"\\\"\" space", {}}},
-    {"char",   {"[^\"\\\\\\x7F\\x00-\\x1F] | [\\\\] ([\"\\\\bfnrt] | \"u\" [0-9a-fA-F]{4})", {}}},
+    {"uuid", {"\"\\\"\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
+                "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
+                "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
+                "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
+                "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] \"\\\"\" space", {}}},
+    {"char",   {"[^\"\\\\] | \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])", {}}},
    {"string", {"\"\\\"\" char* \"\\\"\" space", {"char"}}},
    {"null", {"\"null\" space", {}}},
 };

 std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
-    {"date", {"[0-9]{4} \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}},
-    {"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9]{3} )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}},
+    {"date", {"[0-9] [0-9] [0-9] [0-9] \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}},
+    {"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9] [0-9] [0-9] )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}},
    {"date-time", {"date \"T\" time", {"date", "time"}}},
    {"date-string", {"\"\\\"\" date \"\\\"\" space", {"date"}}},
    {"time-string", {"\"\\\"\" time \"\\\"\" space", {"time"}}},
@@ -316,7 +126,7 @@ std::unordered_map<char, std::string> GRAMMAR_LITERAL_ESCAPES = {
 };

 std::unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
-std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'^', '$', '.', '[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};
+std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};

 template <typename Iterator>
 std::string join(Iterator begin, Iterator end, const std::string & separator) {
@@ -387,6 +197,7 @@ static std::string format_literal(const std::string & literal) {
    return "\"" + escaped + "\"";
 }

+
 class SchemaConverter {
 private:
    std::function<json(const std::string &)> _fetch_json;
@@ -574,7 +385,8 @@ private:
                        sub_is_literal ? "\"" + sub + "\"" : sub,
                        min_times,
                        max_times,
-                        ""
+                        "",
+                        sub_is_literal
                    );
                    seq.back().second = false;
                } else {
@@ -614,75 +426,6 @@ private:
        return _add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space");
    }

-    /*
-        Returns a rule that matches a JSON string that is none of the provided strings
-
-        not_strings({"a"})
-            -> ["] ( [a] char+ | [^"a] char* )? ["] space
-        not_strings({"and", "also"})
-            -> ["] ( [a] ([l] ([s] ([o] char+ | [^"o] char*) | [^"s] char*) | [n] ([d] char+ | [^"d] char*) | [^"ln] char*) | [^"a] char* )? ["] space
-    */
-    std::string _not_strings(const std::vector<std::string> & strings) {
-
-        struct TrieNode {
-            std::map<char, TrieNode> children;
-            bool is_end_of_string;
-
-            TrieNode() : is_end_of_string(false) {}
-
-            void insert(const std::string & string) {
-                auto node = this;
-                for (char c : string) {
-                    node = &node->children[c];
-                }
-                node->is_end_of_string = true;
-            }
-        };
-
-        TrieNode trie;
-        for (const auto & s : strings) {
-            trie.insert(s);
-        }
-
-        std::string char_rule = _add_primitive("char", PRIMITIVE_RULES.at("char"));
-        std::ostringstream out;
-        out << "[\"] ( ";
-        std::function<void(const TrieNode &)> visit = [&](const TrieNode & node) {
-            std::ostringstream rejects;
-            auto first = true;
-            for (const auto & kv : node.children) {
-                rejects << kv.first;
-                if (first) {
-                    first = false;
-                } else {
-                    out << " | ";
-                }
-                out << "[" << kv.first << "]";
-                if (!kv.second.children.empty()) {
-                    out << " (";
-                    visit(kv.second);
-                    out << ")";
-                } else if (kv.second.is_end_of_string) {
-                    out << " " << char_rule << "+";
-                }
-            }
-            if (!node.children.empty()) {
-                if (!first) {
-                    out << " | ";
-                }
-                out << "[^\"" << rejects.str() << "] " << char_rule << "*";
-            }
-        };
-        visit(trie);
-
-        out << " )";
-        if (!trie.is_end_of_string) {
-            out << "?";
-        }
-        out << " [\"] space";
-        return out.str();
-    }
-
    std::string _resolve_ref(const std::string & ref) {
        std::string ref_name = ref.substr(ref.find_last_of('/') + 1);
        if (_rules.find(ref_name) == _rules.end() && _refs_being_resolved.find(ref) == _refs_being_resolved.end()) {
@@ -703,7 +446,6 @@ private:
        std::vector<std::string> required_props;
        std::vector<std::string> optional_props;
        std::unordered_map<std::string, std::string> prop_kv_rule_names;
-        std::vector<std::string> prop_names;
        for (const auto & kv : properties) {
            const auto &prop_name = kv.first;
            const auto &prop_schema = kv.second;
@@ -718,18 +460,11 @@ private:
            } else {
                optional_props.push_back(prop_name);
            }
-            prop_names.push_back(prop_name);
        }
-        if ((additional_properties.is_boolean() && additional_properties.get<bool>()) || additional_properties.is_object()) {
+        if (additional_properties.is_object() || (additional_properties.is_boolean() && additional_properties.get<bool>())) {
            std::string sub_name = name + (name.empty() ? "" : "-") + "additional";
-            std::string value_rule =
-                additional_properties.is_object() ? visit(additional_properties, sub_name + "-value")
-                : _add_primitive("value", PRIMITIVE_RULES.at("value"));
-
-            auto key_rule =
-                prop_names.empty() ? _add_primitive("string", PRIMITIVE_RULES.at("string"))
-                : _add_rule(sub_name + "-k", _not_strings(prop_names));
-            std::string kv_rule = _add_rule(sub_name + "-kv", key_rule + " \":\" space " + value_rule);
+            std::string value_rule = visit(additional_properties.is_object() ? additional_properties : json::object(), sub_name + "-value");
+            std::string kv_rule = _add_rule(sub_name + "-kv", _add_primitive("string", PRIMITIVE_RULES.at("string")) + " \":\" space " + value_rule);
            prop_kv_rule_names["*"] = kv_rule;
            optional_props.push_back("*");
        }
@@ -755,11 +490,15 @@ private:
                }
                std::string k = ks[0];
                std::string kv_rule_name = prop_kv_rule_names[k];
-                std::string comma_ref = "( \",\" space " + kv_rule_name + " )";
-                if (first_is_optional) {
-                    res = comma_ref + (k == "*" ? "*" : "?");
+                if (k == "*") {
+                    res = _add_rule(
+                        name + (name.empty() ? "" : "-") + "additional-kvs",
+                        kv_rule_name + " ( \",\" space " + kv_rule_name + " )*"
+                    );
+                } else if (first_is_optional) {
+                    res = "( \",\" space " + kv_rule_name + " )?";
                } else {
-                    res = kv_rule_name + (k == "*" ? " " + comma_ref + "*" : "");
+                    res = kv_rule_name;
                }
                if (ks.size() > 1) {
                    res += " " + _add_rule(
@@ -893,19 +632,17 @@ public:
        } else if (schema_type.is_array()) {
            std::vector<json> schema_types;
            for (const auto & t : schema_type) {
-                json schema_copy(schema);
-                schema_copy["type"] = t;
-                schema_types.push_back(schema_copy);
+                schema_types.push_back({{"type", t}});
            }
            return _add_rule(rule_name, _generate_union_rule(name, schema_types));
        } else if (schema.contains("const")) {
-            return _add_rule(rule_name, _generate_constant_rule(schema["const"]) + " space");
+            return _add_rule(rule_name, _generate_constant_rule(schema["const"]));
        } else if (schema.contains("enum")) {
            std::vector<std::string> enum_values;
            for (const auto & v : schema["enum"]) {
                enum_values.push_back(_generate_constant_rule(v));
            }
-            return _add_rule(rule_name, "(" + join(enum_values.begin(), enum_values.end(), " | ") + ") space");
+            return _add_rule(rule_name, join(enum_values.begin(), enum_values.end(), " | "));
        } else if ((schema_type.is_null() || schema_type == "object")
                && (schema.contains("properties") ||
                    (schema.contains("additionalProperties") && schema["additionalProperties"] != true))) {
@@ -987,24 +724,6 @@ public:
            int min_len = schema.contains("minLength") ? schema["minLength"].get<int>() : 0;
            int max_len = schema.contains("maxLength") ? schema["maxLength"].get<int>() : std::numeric_limits<int>::max();
            return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space");
-        } else if (schema_type == "integer" && (schema.contains("minimum") || schema.contains("exclusiveMinimum") || schema.contains("maximum") || schema.contains("exclusiveMaximum"))) {
-            int min_value = std::numeric_limits<int>::min();
-            int max_value = std::numeric_limits<int>::max();
-            if (schema.contains("minimum")) {
-                min_value = schema["minimum"].get<int>();
-            } else if (schema.contains("exclusiveMinimum")) {
-                min_value = schema["exclusiveMinimum"].get<int>() + 1;
-            }
-            if (schema.contains("maximum")) {
-                max_value = schema["maximum"].get<int>();
-            } else if (schema.contains("exclusiveMaximum")) {
-                max_value = schema["exclusiveMaximum"].get<int>() - 1;
-            }
-            std::stringstream out;
-            out << "(";
-            _build_min_max_int(min_value, max_value, out);
-            out << ") space";
-            return _add_rule(rule_name, out.str());
        } else if (schema.empty() || schema_type == "object") {
            return _add_rule(rule_name, _add_primitive("object", PRIMITIVE_RULES.at("object")));
        } else {
--- a/common/log.cpp
+++ b/common/log.cpp
@@ -1,401 +0,0 @@
-#include "log.h"
-
-#include <condition_variable>
-#include <cstdarg>
-#include <cstdio>
-#include <mutex>
-#include <sstream>
-#include <thread>
-#include <vector>
-
-int gpt_log_verbosity_thold = LOG_DEFAULT_LLAMA;
-
-void gpt_log_set_verbosity_thold(int verbosity) {
-    gpt_log_verbosity_thold = verbosity;
-}
-
-#define LOG_COL_DEFAULT "\033[0m"
-#define LOG_COL_BOLD    "\033[1m"
-#define LOG_COL_RED     "\033[31m"
-#define LOG_COL_GREEN   "\033[32m"
-#define LOG_COL_YELLOW  "\033[33m"
-#define LOG_COL_BLUE    "\033[34m"
-#define LOG_COL_MAGENTA "\033[35m"
-#define LOG_COL_CYAN    "\033[36m"
-#define LOG_COL_WHITE   "\033[37m"
-
-static int64_t t_us() {
-    return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
-}
-
-// colors
-enum gpt_log_col : int {
-    GPT_LOG_COL_DEFAULT = 0,
-    GPT_LOG_COL_BOLD,
-    GPT_LOG_COL_RED,
-    GPT_LOG_COL_GREEN,
-    GPT_LOG_COL_YELLOW,
-    GPT_LOG_COL_BLUE,
-    GPT_LOG_COL_MAGENTA,
-    GPT_LOG_COL_CYAN,
-    GPT_LOG_COL_WHITE,
-};
-
-// disable colors by default
-static std::vector<const char *> g_col = {
-    "",
-    "",
-    "",
-    "",
-    "",
-    "",
-    "",
-    "",
-    "",
-};
-
-struct gpt_log_entry {
-    enum ggml_log_level level;
-
-    bool prefix;
-
-    int64_t timestamp;
-
-    std::vector<char> msg;
-
-    // signals the worker thread to stop
-    bool is_end;
-
-    void print(FILE * file = nullptr) const {
-        FILE * fcur = file;
-        if (!fcur) {
-            // stderr displays DBG messages only when their verbosity level is not higher than the threshold
-            // these messages will still be logged to a file
-            if (level == GGML_LOG_LEVEL_DEBUG && gpt_log_verbosity_thold < LOG_DEFAULT_DEBUG) {
-                return;
-            }
-
-            fcur = stdout;
-
-            if (level != GGML_LOG_LEVEL_NONE) {
-                fcur = stderr;
-            }
-        }
-
-        if (level != GGML_LOG_LEVEL_NONE && prefix) {
-            if (timestamp) {
-                // [M.s.ms.us]
-                fprintf(fcur, "%s%d.%02d.%03d.%03d%s ",
-                        g_col[GPT_LOG_COL_BLUE],
-                        (int) (timestamp / 1000000 / 60),
-                        (int) (timestamp / 1000000 % 60),
-                        (int) (timestamp / 1000 % 1000),
-                        (int) (timestamp % 1000),
-                        g_col[GPT_LOG_COL_DEFAULT]);
-            }
-
-            switch (level) {
-                case GGML_LOG_LEVEL_INFO:  fprintf(fcur, "%sI %s", g_col[GPT_LOG_COL_GREEN],   g_col[GPT_LOG_COL_DEFAULT]); break;
-                case GGML_LOG_LEVEL_WARN:  fprintf(fcur, "%sW %s", g_col[GPT_LOG_COL_MAGENTA], ""                        ); break;
-                case GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[GPT_LOG_COL_RED],     ""                        ); break;
-                case GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[GPT_LOG_COL_YELLOW],  ""                        ); break;
-                default:
-                    break;
-            }
-        }
-
-        fprintf(fcur, "%s", msg.data());
-
-        if (level == GGML_LOG_LEVEL_WARN || level == GGML_LOG_LEVEL_ERROR || level == GGML_LOG_LEVEL_DEBUG) {
-            fprintf(fcur, "%s", g_col[GPT_LOG_COL_DEFAULT]);
-        }
-
-        fflush(fcur);
-    }
-};
-
-struct gpt_log {
-    // default capacity - will be expanded if needed
-    gpt_log() : gpt_log(256) {}
-
-    gpt_log(size_t capacity) {
-        file = nullptr;
-        prefix = false;
-        timestamps = false;
-        running = false;
-        t_start = t_us();
-
-        // initial message size - will be expanded if longer messages arrive
-        entries.resize(capacity);
-        for (auto & entry : entries) {
-            entry.msg.resize(256);
-        }
-
-        head = 0;
-        tail = 0;
-
-        resume();
-    }
-
-    ~gpt_log() {
-        pause();
-        if (file) {
-            fclose(file);
-        }
-    }
-
-private:
-    std::mutex mtx;
-    std::thread thrd;
-    std::condition_variable cv;
-
-    FILE * file;
-
-    bool prefix;
-    bool timestamps;
-    bool running;
-
-    int64_t t_start;
-
-    // ring buffer of entries
-    std::vector<gpt_log_entry> entries;
-    size_t head;
-    size_t tail;
-
-    // worker thread copies into this
-    gpt_log_entry cur;
-
-public:
-    void add(enum ggml_log_level level, const char * fmt, va_list args) {
-        std::lock_guard<std::mutex> lock(mtx);
-
-        if (!running) {
-            // discard messages while the worker thread is paused
-            return;
-        }
-
-        auto & entry = entries[tail];
-
-        {
-            // cannot use args twice, so make a copy in case we need to expand the buffer
-            va_list args_copy;
-            va_copy(args_copy, args);
-
-#if 1
-            const size_t n = vsnprintf(entry.msg.data(), entry.msg.size(), fmt, args);
-            if (n >= entry.msg.size()) {
-                entry.msg.resize(n + 1);
-                vsnprintf(entry.msg.data(), entry.msg.size(), fmt, args_copy);
-            }
-#else
-            // hack for bolding arguments
-
-            std::stringstream ss;
-            for (int i = 0; fmt[i] != 0; i++) {
-                if (fmt[i] == '%') {
-                    ss << LOG_COL_BOLD;
-                    while (fmt[i] != ' ' && fmt[i] != ')' && fmt[i] != ']' && fmt[i] != 0) ss << fmt[i++];
-                    ss << LOG_COL_DEFAULT;
-                    if (fmt[i] == 0) break;
-                }
-                ss << fmt[i];
-            }
-            const size_t n = vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args);
-            if (n >= entry.msg.size()) {
-                entry.msg.resize(n + 1);
-                vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args_copy);
-            }
-#endif
-        }
-
-        entry.level = level;
-        entry.prefix = prefix;
-        entry.timestamp = 0;
-        if (timestamps) {
-            entry.timestamp = t_us() - t_start;
-        }
-        entry.is_end = false;
-
-        tail = (tail + 1) % entries.size();
-        if (tail == head) {
-            // expand the buffer
-            std::vector<gpt_log_entry> new_entries(2*entries.size());
-
-            size_t new_tail = 0;
-
-            do {
-                new_entries[new_tail] = std::move(entries[head]);
-
-                head     = (head     + 1) % entries.size();
-                new_tail = (new_tail + 1);
-            } while (head != tail);
-
-            head = 0;
-            tail = new_tail;
-
-            for (size_t i = tail; i < new_entries.size(); i++) {
-                new_entries[i].msg.resize(256);
-            }
-
-            entries = std::move(new_entries);
-        }
-
-        cv.notify_one();
-    }
-
-    void resume() {
-        std::lock_guard<std::mutex> lock(mtx);
-
-        if (running) {
-            return;
-        }
-
-        running = true;
-
-        thrd = std::thread([this]() {
-            while (true) {
-                {
-                    std::unique_lock<std::mutex> lock(mtx);
-                    cv.wait(lock, [this]() { return head != tail; });
-
-                    cur = entries[head];
-
-                    head = (head + 1) % entries.size();
-                }
-
-                if (cur.is_end) {
-                    break;
-                }
-
-                cur.print(); // stdout and stderr
-
-                if (file) {
-                    cur.print(file);
-                }
-            }
-        });
-    }
-
-    void pause() {
-        {
-            std::lock_guard<std::mutex> lock(mtx);
-
-            if (!running) {
-                return;
-            }
-
-            running = false;
-
-            // push an entry to signal the worker thread to stop
-            {
-                auto & entry = entries[tail];
-                entry.is_end = true;
-
-                tail = (tail + 1) % entries.size();
-            }
-
-            cv.notify_one();
-        }
-
-        thrd.join();
-    }
-
-    void set_file(const char * path) {
-        pause();
-
-        if (file) {
-            fclose(file);
-        }
-
-        if (path) {
-            file = fopen(path, "w");
-        } else {
-            file = nullptr;
-        }
-
-        resume();
-    }
-
-    void set_colors(bool colors) {
-        pause();
-
-        if (colors) {
-            g_col[GPT_LOG_COL_DEFAULT] = LOG_COL_DEFAULT;
-            g_col[GPT_LOG_COL_BOLD]    = LOG_COL_BOLD;
-            g_col[GPT_LOG_COL_RED]     = LOG_COL_RED;
-            g_col[GPT_LOG_COL_GREEN]   = LOG_COL_GREEN;
-            g_col[GPT_LOG_COL_YELLOW]  = LOG_COL_YELLOW;
-            g_col[GPT_LOG_COL_BLUE]    = LOG_COL_BLUE;
-            g_col[GPT_LOG_COL_MAGENTA] = LOG_COL_MAGENTA;
-            g_col[GPT_LOG_COL_CYAN]    = LOG_COL_CYAN;
-            g_col[GPT_LOG_COL_WHITE]   = LOG_COL_WHITE;
-        } else {
-            for (size_t i = 0; i < g_col.size(); i++) {
-                g_col[i] = "";
-            }
-        }
-
-        resume();
-    }
-
-    void set_prefix(bool prefix) {
-        std::lock_guard<std::mutex> lock(mtx);
-
-        this->prefix = prefix;
-    }
-
-    void set_timestamps(bool timestamps) {
-        std::lock_guard<std::mutex> lock(mtx);
-
-        this->timestamps = timestamps;
-    }
-};
-
-//
-// public API
-//
-
-struct gpt_log * gpt_log_init() {
-    return new gpt_log;
-}
-
-struct gpt_log * gpt_log_main() {
-    static struct gpt_log log;
-
-    return &log;
-}
-
-void gpt_log_pause(struct gpt_log * log) {
-    log->pause();
-}
-
-void gpt_log_resume(struct gpt_log * log) {
-    log->resume();
-}
-
-void gpt_log_free(struct gpt_log * log) {
-    delete log;
-}
-
-void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...) {
-    va_list args;
-    va_start(args, fmt);
-    log->add(level, fmt, args);
-    va_end(args);
-}
-
-void gpt_log_set_file(struct gpt_log * log, const char * file) {
-    log->set_file(file);
-}
-
-void gpt_log_set_colors(struct gpt_log * log, bool colors) {
-    log->set_colors(colors);
-}
-
-void gpt_log_set_prefix(struct gpt_log * log, bool prefix) {
-    log->set_prefix(prefix);
-}
-
-void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps) {
-    log->set_timestamps(timestamps);
-}
--- a/common/log.h
+++ b/common/log.h
@@ -1,90 +1,724 @@
 #pragma once

-#include "ggml.h" // for ggml_log_level
+#include <chrono>
+#include <cstring>
+#include <sstream>
+#include <iostream>
+#include <thread>
+#include <vector>
+#include <algorithm>
+#include <cinttypes>

-#ifndef __GNUC__
-#    define LOG_ATTRIBUTE_FORMAT(...)
-#elif defined(__MINGW32__)
-#    define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
-#else
-#    define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+// --------------------------------
+//
+// Basic usage:
+//
+// --------
+//
+//  The LOG() and LOG_TEE() macros are ready to go by default
+//   they do not require any initialization.
+//
+//  LOGLN() and LOG_TEELN() are variants which automatically
+//   include \n character at the end of the log string.
+//
+//  LOG() behaves exactly like printf, by default writing to a logfile.
+//  LOG_TEE() additionally, prints to the screen too ( mimics Unix tee command ).
+//
+//  Default logfile is named
+//   "llama.<threadID>.log"
+//  Default LOG_TEE() secondary output target is
+//   stderr
+//
+//  Logs can be dynamically disabled or enabled using functions:
+//   log_disable()
+//  and
+//   log_enable()
+//
+//  A log target can be changed with:
+//   log_set_target( string )
+//    creating and opening, or re-opening a file by string filename
+//  or
+//   log_set_target( FILE* )
+//    allowing to point at stderr, stdout, or any valid FILE* file handler.
+//
+// --------
+//
+// End of Basic usage.
+//
+// --------------------------------
+
+// Specifies a log target.
+//  default uses log_handler() with "llama.log" log file
+//  this can be changed, by defining LOG_TARGET
+//  like so:
+//
+//  #define LOG_TARGET (a valid FILE*)
+//  #include "log.h"
+//
+//  or it can be simply redirected to stdout or stderr
+//  like so:
+//
+//  #define LOG_TARGET stderr
+//  #include "log.h"
+//
+//  The log target can also be redirected to a different function
+//  like so:
+//
+//  #define LOG_TARGET log_handler_different()
+//  #include "log.h"
+//
+//  FILE* log_handler_different()
+//  {
+//      return stderr;
+//  }
+//
+//  or:
+//
+//  #define LOG_TARGET log_handler_another_one("somelog.log")
+//  #include "log.h"
+//
+//  FILE* log_handler_another_one(char*filename)
+//  {
+//      static FILE* logfile = nullptr;
+//      (...)
+//      if( !logfile )
+//      {
+//          fopen(...)
+//      }
+//      (...)
+//      return logfile
+//  }
+//
+#ifndef LOG_TARGET
+    #define LOG_TARGET log_handler()
 #endif

-#define LOG_DEFAULT_DEBUG 1
-#define LOG_DEFAULT_LLAMA 0
+#ifndef LOG_TEE_TARGET
+    #define LOG_TEE_TARGET stderr
+#endif

-// needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower
-// set via gpt_log_set_verbosity()
-extern int gpt_log_verbosity_thold;
+// Utility for synchronizing log configuration state
+//  since std::optional was introduced only in c++17
+enum LogTriState
+{
+    LogTriStateSame,
+    LogTriStateFalse,
+    LogTriStateTrue
+};

-void gpt_log_set_verbosity_thold(int verbosity); // not thread-safe
+// Utility to obtain "pid" like unique process id and use it when creating log files.
+inline std::string log_get_pid()
+{
+   static std::string pid;
+   if (pid.empty())
+   {
+       // std::this_thread::get_id() is the most portable way of obtaining a "process id"
+       //  it's not the same as "pid" but is unique enough to solve multiple instances
+       //  trying to write to the same log.
+       std::stringstream ss;
+       ss << std::this_thread::get_id();
+       pid = ss.str();
+   }

-// the gpt_log uses an internal worker thread to print/write log messages
-// when the worker thread is paused, incoming log messages are discarded
-struct gpt_log;
+   return pid;
+}

-struct gpt_log * gpt_log_init();
-struct gpt_log * gpt_log_main(); // singleton, automatically destroys itself on exit
-void             gpt_log_pause (struct gpt_log * log); // pause  the worker thread, not thread-safe
-void             gpt_log_resume(struct gpt_log * log); // resume the worker thread, not thread-safe
-void             gpt_log_free  (struct gpt_log * log);
+// Utility function for generating log file names with unique id based on thread id.
+//  invocation with log_filename_generator( "llama", "log" ) creates a string "llama.<number>.log"
+//  where the number is a runtime id of the current thread.

-LOG_ATTRIBUTE_FORMAT(3, 4)
-void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...);
+#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(LogTriStateSame, log_file_basename, log_file_extension)

-// defaults: file = NULL, colors = false, prefix = false, timestamps = false
-//
-// regular log output:
-//
-//   ggml_backend_metal_log_allocated_size: allocated buffer, size =  6695.84 MiB, ( 6695.91 / 21845.34)
-//   llm_load_tensors: ggml ctx size =    0.27 MiB
-//   llm_load_tensors: offloading 32 repeating layers to GPU
-//   llm_load_tensors: offloading non-repeating layers to GPU
-//
-// with prefix = true, timestamps = true, the log output will look like this:
-//
-//   0.00.035.060 D ggml_backend_metal_log_allocated_size: allocated buffer, size =  6695.84 MiB, ( 6695.91 / 21845.34)
-//   0.00.035.064 I llm_load_tensors: ggml ctx size =    0.27 MiB
-//   0.00.090.578 I llm_load_tensors: offloading 32 repeating layers to GPU
-//   0.00.090.579 I llm_load_tensors: offloading non-repeating layers to GPU
-//
-// I - info    (stdout, V = 0)
-// W - warning (stderr, V = 0)
-// E - error   (stderr, V = 0)
-// D - debug   (stderr, V = LOG_DEFAULT_DEBUG)
-//
+// INTERNAL, DO NOT USE
+inline std::string log_filename_generator_impl(LogTriState multilog, const std::string & log_file_basename, const std::string & log_file_extension)
+{
+    static bool _multilog = false;

-void gpt_log_set_file      (struct gpt_log * log, const char * file);       // not thread-safe
-void gpt_log_set_colors    (struct gpt_log * log,       bool   colors);     // not thread-safe
-void gpt_log_set_prefix    (struct gpt_log * log,       bool   prefix);     // whether to output prefix to each log
-void gpt_log_set_timestamps(struct gpt_log * log,       bool   timestamps); // whether to output timestamps in the prefix
+    if (multilog != LogTriStateSame)
+    {
+        _multilog = multilog == LogTriStateTrue;
+    }

-// helper macros for logging
-// use these to avoid computing log arguments if the verbosity of the log is higher than the threshold
-//
-// for example:
-//
-//   LOG_DBG("this is a debug message: %d\n", expensive_function());
-//
-// this will avoid calling expensive_function() if LOG_DEFAULT_DEBUG > gpt_log_verbosity_thold
-//
+    std::stringstream buf;

-#define LOG_TMPL(level, verbosity, ...) \
-    do { \
-        if ((verbosity) <= gpt_log_verbosity_thold) { \
-            gpt_log_add(gpt_log_main(), (level), __VA_ARGS__); \
-        } \
+    buf << log_file_basename;
+    if (_multilog)
+    {
+        buf << ".";
+        buf << log_get_pid();
+    }
+    buf << ".";
+    buf << log_file_extension;
+
+    return buf.str();
+}
+
+#ifndef LOG_DEFAULT_FILE_NAME
+    #define LOG_DEFAULT_FILE_NAME log_filename_generator("llama", "log")
+#endif
+
+// Utility for turning #define values into string literals
+//  so we can have a define for stderr and
+//  we can print "stderr" instead of literal stderr, etc.
+#define LOG_STRINGIZE1(s) #s
+#define LOG_STRINGIZE(s) LOG_STRINGIZE1(s)
+
+#define LOG_TEE_TARGET_STRING LOG_STRINGIZE(LOG_TEE_TARGET)
+
+// Allows disabling timestamps.
+//  in order to disable, define LOG_NO_TIMESTAMPS
+//  like so:
+//
+//  #define LOG_NO_TIMESTAMPS
+//  #include "log.h"
+//
+#ifndef LOG_NO_TIMESTAMPS
+    #ifndef _MSC_VER
+        #define LOG_TIMESTAMP_FMT "[%" PRIu64 "] "
+        #define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
+    #else
+        #define LOG_TIMESTAMP_FMT "[%" PRIu64 "] "
+        #define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
+    #endif
+#else
+    #define LOG_TIMESTAMP_FMT "%s"
+    #define LOG_TIMESTAMP_VAL ,""
+#endif
+
+#ifdef LOG_TEE_TIMESTAMPS
+    #ifndef _MSC_VER
+        #define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] "
+        #define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
+    #else
+        #define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] "
+        #define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
+    #endif
+#else
+    #define LOG_TEE_TIMESTAMP_FMT "%s"
+    #define LOG_TEE_TIMESTAMP_VAL ,""
+#endif
+
+// Allows disabling file/line/function prefix
+//  in order to disable, define LOG_NO_FILE_LINE_FUNCTION
+//  like so:
+//
+//  #define LOG_NO_FILE_LINE_FUNCTION
+//  #include "log.h"
+//
+#ifndef LOG_NO_FILE_LINE_FUNCTION
+    #ifndef _MSC_VER
+        #define LOG_FLF_FMT "[%24s:%5d][%24s] "
+        #define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
+    #else
+        #define LOG_FLF_FMT "[%24s:%5ld][%24s] "
+        #define LOG_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
+    #endif
+#else
+    #define LOG_FLF_FMT "%s"
+    #define LOG_FLF_VAL ,""
+#endif
+
+#ifdef LOG_TEE_FILE_LINE_FUNCTION
+    #ifndef _MSC_VER
+        #define LOG_TEE_FLF_FMT "[%24s:%5d][%24s] "
+        #define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
+    #else
+        #define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] "
+        #define LOG_TEE_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
+    #endif
+#else
+    #define LOG_TEE_FLF_FMT "%s"
+    #define LOG_TEE_FLF_VAL ,""
+#endif
+
+// INTERNAL, DO NOT USE
+//  USE LOG() INSTEAD
+//
+#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__)
+    #define LOG_IMPL(str, ...)                                                                                      \
+    do {                                                                                                            \
+        if (LOG_TARGET != nullptr)                                                                                  \
+        {                                                                                                           \
+            fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \
+            fflush(LOG_TARGET);                                                                                     \
+        }                                                                                                           \
    } while (0)
+#else
+    #define LOG_IMPL(str, ...)                                                                                           \
+    do {                                                                                                                 \
+        if (LOG_TARGET != nullptr)                                                                                       \
+        {                                                                                                                \
+            fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \
+            fflush(LOG_TARGET);                                                                                          \
+        }                                                                                                                \
+    } while (0)
+#endif

-#define LOG(...)             LOG_TMPL(GGML_LOG_LEVEL_NONE, 0,         __VA_ARGS__)
-#define LOGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_NONE, verbosity, __VA_ARGS__)
+// INTERNAL, DO NOT USE
+//  USE LOG_TEE() INSTEAD
+//
+#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__)
+    #define LOG_TEE_IMPL(str, ...)                                                                                                      \
+    do {                                                                                                                                \
+        if (LOG_TARGET != nullptr)                                                                                                      \
+        {                                                                                                                               \
+            fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__);                     \
+            fflush(LOG_TARGET);                                                                                                         \
+        }                                                                                                                               \
+        if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr)                         \
+        {                                                                                                                               \
+            fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL, __VA_ARGS__); \
+            fflush(LOG_TEE_TARGET);                                                                                                     \
+        }                                                                                                                               \
+    } while (0)
+#else
+    #define LOG_TEE_IMPL(str, ...)                                                                                                           \
+    do {                                                                                                                                     \
+        if (LOG_TARGET != nullptr)                                                                                                           \
+        {                                                                                                                                    \
+            fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__);                     \
+            fflush(LOG_TARGET);                                                                                                              \
+        }                                                                                                                                    \
+        if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr)                              \
+        {                                                                                                                                    \
+            fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL "", ##__VA_ARGS__); \
+            fflush(LOG_TEE_TARGET);                                                                                                          \
+        }                                                                                                                                    \
+    } while (0)
+#endif

-#define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO,  0,                 __VA_ARGS__)
-#define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN,  0,                 __VA_ARGS__)
-#define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, 0,                 __VA_ARGS__)
-#define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_DEFAULT_DEBUG, __VA_ARGS__)
+// The '\0' as a last argument, is a trick to bypass the silly
+//  "warning: ISO C++11 requires at least one argument for the "..." in a variadic macro"
+//  so we can have a single macro which can be called just like printf.

-#define LOG_INFV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_INFO,  verbosity, __VA_ARGS__)
-#define LOG_WRNV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_WARN,  verbosity, __VA_ARGS__)
-#define LOG_ERRV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, verbosity, __VA_ARGS__)
-#define LOG_DBGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, verbosity, __VA_ARGS__)
+// Main LOG macro.
+//  behaves like printf, and supports arguments the exact same way.
+//
+#if !defined(_MSC_VER) || defined(__clang__)
+    #define LOG(...) LOG_IMPL(__VA_ARGS__, "")
+#else
+    #define LOG(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "")
+#endif
+
+// Main TEE macro.
+//  does the same as LOG
+//  and
+//  simultaneously writes stderr.
+//
+// Secondary target can be changed just like LOG_TARGET
+//  by defining LOG_TEE_TARGET
+//
+#if !defined(_MSC_VER) || defined(__clang__)
+    #define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
+#else
+    #define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "")
+#endif
+
+// LOG macro variants with auto endline.
+#if !defined(_MSC_VER) || defined(__clang__)
+    #define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
+    #define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
+#else
+    #define LOGLN(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "\n")
+    #define LOG_TEELN(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "\n")
+#endif
+
+// INTERNAL, DO NOT USE
+inline FILE *log_handler1_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr)
+{
+    static bool _initialized = false;
+    static bool _append = false;
+    static bool _disabled = filename.empty() && target == nullptr;
+    static std::string log_current_filename{filename};
+    static FILE *log_current_target{target};
+    static FILE *logfile = nullptr;
+
+    if (change)
+    {
+        if (append != LogTriStateSame)
+        {
+            _append = append == LogTriStateTrue;
+            return logfile;
+        }
+
+        if (disable == LogTriStateTrue)
+        {
+            // Disable primary target
+            _disabled = true;
+        }
+        // If previously disabled, only enable, and keep previous target
+        else if (disable == LogTriStateFalse)
+        {
+            _disabled = false;
+        }
+        // Otherwise, process the arguments
+        else if (log_current_filename != filename || log_current_target != target)
+        {
+            _initialized = false;
+        }
+    }
+
+    if (_disabled)
+    {
+        // Log is disabled
+        return nullptr;
+    }
+
+    if (_initialized)
+    {
+        // with fallback in case something went wrong
+        return logfile ? logfile : stderr;
+    }
+
+    // do the (re)initialization
+    if (target != nullptr)
+    {
+        if (logfile != nullptr && logfile != stdout && logfile != stderr)
+        {
+            fclose(logfile);
+        }
+
+        log_current_filename = LOG_DEFAULT_FILE_NAME;
+        log_current_target = target;
+
+        logfile = target;
+    }
+    else
+    {
+        if (log_current_filename != filename)
+        {
+            if (logfile != nullptr && logfile != stdout && logfile != stderr)
+            {
+                fclose(logfile);
+            }
+        }
+
+        logfile = fopen(filename.c_str(), _append ? "a" : "w");
+    }
+
+    if (!logfile)
+    {
+        //  Verify whether the file was opened, otherwise fallback to stderr
+        logfile = stderr;
+
+        fprintf(stderr, "Failed to open logfile '%s' with error '%s'\n", filename.c_str(), std::strerror(errno));
+        fflush(stderr);
+
+        // At this point we let the init flag be to true below, and let the target fallback to stderr
+        //  otherwise we would repeatedly fopen() which was already unsuccessful
+    }
+
+    _initialized = true;
+
+    return logfile ? logfile : stderr;
+}
+
+// INTERNAL, DO NOT USE
+inline FILE *log_handler2_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME)
+{
+    return log_handler1_impl(change, append, disable, filename, target);
+}
+
+// Disables logs entirely at runtime.
+//  Makes LOG() and LOG_TEE() produce no output,
+//  until enabled back.
+#define log_disable() log_disable_impl()
+
+// INTERNAL, DO NOT USE
+inline FILE *log_disable_impl()
+{
+    return log_handler1_impl(true, LogTriStateSame, LogTriStateTrue);
+}
+
+// Enables logs at runtime.
+#define log_enable() log_enable_impl()
+
+// INTERNAL, DO NOT USE
+inline FILE *log_enable_impl()
+{
+    return log_handler1_impl(true, LogTriStateSame, LogTriStateFalse);
+}
+
+// Sets target fir logs, either by a file name or FILE* pointer (stdout, stderr, or any valid FILE*)
+#define log_set_target(target) log_set_target_impl(target)
+
+// INTERNAL, DO NOT USE
+inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, LogTriStateSame, filename); }
+inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, LogTriStateSame, target); }
+
+// INTERNAL, DO NOT USE
+inline FILE *log_handler() { return log_handler1_impl(); }
+
+// Enable or disable creating separate log files for each run.
+//  can ONLY be invoked BEFORE first log use.
+#define log_multilog(enable) log_filename_generator_impl((enable) ? LogTriStateTrue : LogTriStateFalse, "", "")
+// Enable or disable append mode for log file.
+//  can ONLY be invoked BEFORE first log use.
+#define log_append(enable) log_append_impl(enable)
+// INTERNAL, DO NOT USE
+inline FILE *log_append_impl(bool enable)
+{
+    return log_handler1_impl(true, enable ? LogTriStateTrue : LogTriStateFalse, LogTriStateSame);
+}
+
+inline void log_test()
+{
+    log_disable();
+    LOG("01 Hello World to nobody, because logs are disabled!\n");
+    log_enable();
+    LOG("02 Hello World to default output, which is \"%s\" ( Yaaay, arguments! )!\n", LOG_STRINGIZE(LOG_TARGET));
+    LOG_TEE("03 Hello World to **both** default output and " LOG_TEE_TARGET_STRING "!\n");
+    log_set_target(stderr);
+    LOG("04 Hello World to stderr!\n");
+    LOG_TEE("05 Hello World TEE with double printing to stderr prevented!\n");
+    log_set_target(LOG_DEFAULT_FILE_NAME);
+    LOG("06 Hello World to default log file!\n");
+    log_set_target(stdout);
+    LOG("07 Hello World to stdout!\n");
+    log_set_target(LOG_DEFAULT_FILE_NAME);
+    LOG("08 Hello World to default log file again!\n");
+    log_disable();
+    LOG("09 Hello World _1_ into the void!\n");
+    log_enable();
+    LOG("10 Hello World back from the void ( you should not see _1_ in the log or the output )!\n");
+    log_disable();
+    log_set_target("llama.anotherlog.log");
+    LOG("11 Hello World _2_ to nobody, new target was selected but logs are still disabled!\n");
+    log_enable();
+    LOG("12 Hello World this time in a new file ( you should not see _2_ in the log or the output )?\n");
+    log_set_target("llama.yetanotherlog.log");
+    LOG("13 Hello World this time in yet new file?\n");
+    log_set_target(log_filename_generator("llama_autonamed", "log"));
+    LOG("14 Hello World in log with generated filename!\n");
+#ifdef _MSC_VER
+    LOG_TEE("15 Hello msvc TEE without arguments\n");
+    LOG_TEE("16 Hello msvc TEE with (%d)(%s) arguments\n", 1, "test");
+    LOG_TEELN("17 Hello msvc TEELN without arguments\n");
+    LOG_TEELN("18 Hello msvc TEELN with (%d)(%s) arguments\n", 1, "test");
+    LOG("19 Hello msvc LOG without arguments\n");
+    LOG("20 Hello msvc LOG with (%d)(%s) arguments\n", 1, "test");
+    LOGLN("21 Hello msvc LOGLN without arguments\n");
+    LOGLN("22 Hello msvc LOGLN with (%d)(%s) arguments\n", 1, "test");
+#endif
+}
+
+inline bool log_param_single_parse(const std::string & param)
+{
+    if ( param == "--log-test")
+    {
+        log_test();
+        return true;
+    }
+
+    if ( param == "--log-disable")
+    {
+        log_disable();
+        return true;
+    }
+
+    if ( param == "--log-enable")
+    {
+        log_enable();
+        return true;
+    }
+
+    if (param == "--log-new")
+    {
+        log_multilog(true);
+        return true;
+    }
+
+    if (param == "--log-append")
+    {
+        log_append(true);
+        return true;
+    }
+
+    return false;
+}
+
+inline bool log_param_pair_parse(bool check_but_dont_parse, const std::string & param, const std::string & next = std::string())
+{
+    if ( param == "--log-file")
+    {
+        if (!check_but_dont_parse)
+        {
+            log_set_target(log_filename_generator(next.empty() ? "unnamed" : next, "log"));
+        }
+
+        return true;
+    }
+
+    return false;
+}
+
+inline void log_print_usage()
+{
+    printf("log options:\n");
+    /* format
+    printf("  -h, --help            show this help message and exit\n");*/
+    /* spacing
+    printf("__-param----------------Description\n");*/
+    printf("  --log-test            Run simple logging test\n");
+    printf("  --log-disable         Disable trace logs\n");
+    printf("  --log-enable          Enable trace logs\n");
+    printf("  --log-file            Specify a log filename (without extension)\n");
+    printf("  --log-new             Create a separate new log file on start. "
+                                   "Each log file will have unique name: \"<name>.<ID>.log\"\n");
+    printf("  --log-append          Don't truncate the old log file.\n");
+    printf("\n");
+}
+
+#define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv)
+
+// INTERNAL, DO NOT USE
+inline void log_dump_cmdline_impl(int argc, char **argv)
+{
+    std::stringstream buf;
+    for (int i = 0; i < argc; ++i)
+    {
+        if (std::string(argv[i]).find(' ') != std::string::npos)
+        {
+            buf << " \"" << argv[i] <<"\"";
+        }
+        else
+        {
+            buf << " " << argv[i];
+        }
+    }
+    LOGLN("Cmd:%s", buf.str().c_str());
+}
+
+#define log_tostr(var) log_var_to_string_impl(var).c_str()
+
+inline std::string log_var_to_string_impl(bool var)
+{
+    return var ? "true" : "false";
+}
+
+inline std::string log_var_to_string_impl(std::string var)
+{
+    return var;
+}
+
+inline std::string log_var_to_string_impl(const std::vector<int> & var)
+{
+    std::stringstream buf;
+    buf << "[ ";
+    bool first = true;
+    for (auto e : var)
+    {
+        if (first)
+        {
+            first = false;
+        }
+        else
+        {
+            buf << ", ";
+        }
+        buf << std::to_string(e);
+    }
+    buf << " ]";
+
+    return buf.str();
+}
+
+template <typename C, typename T>
+inline std::string LOG_TOKENS_TOSTR_PRETTY(const C & ctx, const T & tokens)
+{
+    std::stringstream buf;
+    buf << "[ ";
+
+    bool first = true;
+    for (const auto &token : tokens)
+    {
+        if (!first) {
+            buf << ", ";
+        } else {
+            first = false;
+        }
+
+        auto detokenized = llama_token_to_piece(ctx, token);
+
+        detokenized.erase(
+            std::remove_if(
+                detokenized.begin(),
+                detokenized.end(),
+                [](const unsigned char c) { return !std::isprint(c); }),
+            detokenized.end());
+
+        buf
+            << "'" << detokenized << "'"
+            << ":" << std::to_string(token);
+    }
+    buf << " ]";
+
+    return buf.str();
+}
+
+template <typename C, typename B>
+inline std::string LOG_BATCH_TOSTR_PRETTY(const C & ctx, const B & batch)
+{
+    std::stringstream buf;
+    buf << "[ ";
+
+    bool first = true;
+    for (int i = 0; i < batch.n_tokens; ++i)
+    {
+        if (!first) {
+            buf << ", ";
+        } else {
+            first = false;
+        }
+
+        auto detokenized = llama_token_to_piece(ctx, batch.token[i]);
+
+        detokenized.erase(
+            std::remove_if(
+                detokenized.begin(),
+                detokenized.end(),
+                [](const unsigned char c) { return !std::isprint(c); }),
+            detokenized.end());
+
+        buf
+            << "\n" << std::to_string(i)
+            << ":token '" << detokenized << "'"
+            << ":pos " << std::to_string(batch.pos[i])
+            << ":n_seq_id  " << std::to_string(batch.n_seq_id[i])
+            << ":seq_id " << std::to_string(batch.seq_id[i][0])
+            << ":logits " << std::to_string(batch.logits[i]);
+    }
+    buf << " ]";
+
+    return buf.str();
+}
+
+#ifdef LOG_DISABLE_LOGS
+
+#undef LOG
+#define LOG(...) // dummy stub
+#undef LOGLN
+#define LOGLN(...) // dummy stub
+
+#undef LOG_TEE
+#define LOG_TEE(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf
+
+#undef LOG_TEELN
+#define LOG_TEELN(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf
+
+#undef LOG_DISABLE
+#define LOG_DISABLE() // dummy stub
+
+#undef LOG_ENABLE
+#define LOG_ENABLE() // dummy stub
+
+#undef LOG_ENABLE
+#define LOG_ENABLE() // dummy stub
+
+#undef LOG_SET_TARGET
+#define LOG_SET_TARGET(...) // dummy stub
+
+#undef LOG_DUMP_CMDLINE
+#define LOG_DUMP_CMDLINE(...) // dummy stub
+
+#endif // LOG_DISABLE_LOGS
--- a/common/ngram-cache.cpp
+++ b/common/ngram-cache.cpp
@@ -2,11 +2,8 @@
 #include "common.h"
 #include "log.h"

-#include <cinttypes>
 #include <cstdint>
-#include <cstdio>
 #include <fstream>
-#include <thread>

 void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
                              std::vector<llama_token> & inp, int nnew, bool print_progress) {
--- a/common/ngram-cache.h
+++ b/common/ngram-cache.h
@@ -37,18 +37,11 @@ struct llama_ngram {
    }
 };

-struct llama_token_hash_function {
-    size_t operator()(const llama_token token) const {
-        // see https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
-        return token * 11400714819323198485llu;
-    }
-};
-
 struct llama_ngram_hash_function {
    size_t operator()(const llama_ngram & ngram) const {
-        size_t hash = llama_token_hash_function{}(ngram.tokens[0]);
-        for (int i = 1; i < LLAMA_NGRAM_MAX; ++i) {
-            hash ^= llama_token_hash_function{}(ngram.tokens[i]);
+        size_t hash = 0;
+        for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
+            hash ^= std::hash<llama_token>{}(ngram.tokens[i]);
        }
        return hash;
    }
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -1,450 +1,369 @@
+#define LLAMA_API_INTERNAL
 #include "sampling.h"
+#include <random>

-#include "common.h"
+struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) {
+    struct llama_sampling_context * result = new llama_sampling_context();

-#include <cmath>
-#include <unordered_map>
+    result->params  = params;
+    result->grammar = nullptr;

-// the ring buffer works similarly to std::deque, but with a fixed capacity
-// TODO: deduplicate with llama-impl.h
-template<typename T>
-struct ring_buffer {
-    ring_buffer(size_t cap) : capacity(cap), data(cap) {}
+    // if there is a grammar, parse it
+    if (!params.grammar.empty()) {
+        result->parsed_grammar = grammar_parser::parse(params.grammar.c_str());

-    T & front() {
-        if (sz == 0) {
-            throw std::runtime_error("ring buffer is empty");
-        }
-        return data[first];
-    }
-
-    const T & front() const {
-        if (sz == 0) {
-            throw std::runtime_error("ring buffer is empty");
-        }
-        return data[first];
-    }
-
-    T & back() {
-        if (sz == 0) {
-            throw std::runtime_error("ring buffer is empty");
-        }
-        return data[pos];
-    }
-
-    const T & back() const {
-        if (sz == 0) {
-            throw std::runtime_error("ring buffer is empty");
-        }
-        return data[pos];
-    }
-
-    void push_back(const T & value) {
-        if (sz == capacity) {
-            // advance the start when buffer is full
-            first = (first + 1) % capacity;
-        } else {
-            sz++;
-        }
-        data[pos] = value;
-        pos = (pos + 1) % capacity;
-    }
-
-    T pop_front() {
-        if (sz == 0) {
-            throw std::runtime_error("ring buffer is empty");
-        }
-        T value = data[first];
-        first = (first + 1) % capacity;
-        sz--;
-        return value;
-    }
-
-    const T & rat(size_t i) const {
-        if (i >= sz) {
-            throw std::runtime_error("ring buffer: index out of bounds");
-        }
-        return data[(first + sz - i - 1) % capacity];
-    }
-
-    std::vector<T> to_vector() const {
-        std::vector<T> result;
-        result.reserve(sz);
-        for (size_t i = 0; i < sz; i++) {
-            result.push_back(data[(first + i) % capacity]);
-        }
-        return result;
-    }
-
-    void clear() {
-        // here only reset the status of the buffer
-        sz = 0;
-        first = 0;
-        pos = 0;
-    }
-
-    bool empty() const {
-        return sz == 0;
-    }
-
-    size_t size() const {
-        return sz;
-    }
-
-    size_t capacity = 0;
-    size_t sz = 0;
-    size_t first = 0;
-    size_t pos = 0;
-    std::vector<T> data;
-};
-
-struct gpt_sampler {
-    gpt_sampler_params params;
-
-    struct llama_sampler * grmr;
-    struct llama_sampler * chain;
-
-    ring_buffer<llama_token> prev;
-
-    std::vector<llama_token_data> cur;
-
-    llama_token_data_array cur_p;
-
-    void set_logits(struct llama_context * ctx, int idx) {
-        const auto * logits = llama_get_logits_ith(ctx, idx);
-
-        const int n_vocab = llama_n_vocab(llama_get_model(ctx));
-
-        cur.resize(n_vocab);
-
-        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-            cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
+        // will be empty (default) if there are parse errors
+        if (result->parsed_grammar.rules.empty()) {
+            fprintf(stderr, "%s: failed to parse grammar\n", __func__);
+            delete result;
+            return nullptr;
        }

-        cur_p = { cur.data(), cur.size(), -1, false };
-    }
-};
+        // Ensure that there is a "root" node.
+        if (result->parsed_grammar.symbol_ids.find("root") == result->parsed_grammar.symbol_ids.end()) {
+            fprintf(stderr, "%s: grammar does not contain a 'root' symbol\n", __func__);
+            delete result;
+            return nullptr;
+        }

-std::string gpt_sampler_params::print() const {
+        std::vector<const llama_grammar_element *> grammar_rules(result->parsed_grammar.c_rules());
+
+        result->grammar = llama_grammar_init(
+                grammar_rules.data(),
+                grammar_rules.size(), result->parsed_grammar.symbol_ids.at("root"));
+    }
+
+    result->prev.resize(params.n_prev);
+
+    result->n_valid = 0;
+
+    llama_sampling_set_rng_seed(result, params.seed);
+
+    return result;
+}
+
+void llama_sampling_free(struct llama_sampling_context * ctx) {
+    if (ctx->grammar != NULL) {
+        llama_grammar_free(ctx->grammar);
+    }
+
+    delete ctx;
+}
+
+void llama_sampling_reset(llama_sampling_context * ctx) {
+    if (ctx->grammar != NULL) {
+        llama_grammar_free(ctx->grammar);
+        ctx->grammar = NULL;
+    }
+
+    if (!ctx->parsed_grammar.rules.empty()) {
+        std::vector<const llama_grammar_element *> grammar_rules(ctx->parsed_grammar.c_rules());
+
+        ctx->grammar = llama_grammar_init(
+                grammar_rules.data(),
+                grammar_rules.size(), ctx->parsed_grammar.symbol_ids.at("root"));
+    }
+
+    std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
+    ctx->cur.clear();
+    ctx->n_valid = 0;
+}
+
+void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) {
+    if (seed == LLAMA_DEFAULT_SEED) {
+        seed = std::random_device{}();
+    }
+    ctx->rng.seed(seed);
+}
+
+void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst) {
+    if (dst->grammar) {
+        llama_grammar_free(dst->grammar);
+        dst->grammar = nullptr;
+    }
+
+    if (src->grammar) {
+        dst->grammar = llama_grammar_copy(src->grammar);
+    }
+
+    dst->prev = src->prev;
+}
+
+llama_token llama_sampling_last(llama_sampling_context * ctx) {
+    return ctx->prev.back();
+}
+
+std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n) {
+    const int size = ctx_sampling->prev.size();
+
+    n = std::min(n, size);
+
+    std::string result;
+
+    for (int i = size - n; i < size; i++) {
+        result += llama_token_to_piece(ctx_main, ctx_sampling->prev[i]);
+    }
+
+    return result;
+}
+
+std::string llama_sampling_print(const llama_sampling_params & params) {
    char result[1024];

    snprintf(result, sizeof(result),
            "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
            "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
            "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
-            penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
-            top_k, tfs_z, top_p, min_p, typ_p, temp,
-            mirostat, mirostat_eta, mirostat_tau);
+            params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present,
+            params.top_k, params.tfs_z, params.top_p, params.min_p, params.typical_p, params.temp,
+            params.mirostat, params.mirostat_eta, params.mirostat_tau);

    return std::string(result);
 }

-struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) {
-    llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
-
-    lparams.no_perf = params.no_perf;
-
-    auto * result = new gpt_sampler {
-        /* .params = */ params,
-        /* .grmr   = */ llama_sampler_init_grammar(model, params.grammar.c_str(), "root"),
-        /* .chain  = */ llama_sampler_chain_init(lparams),
-        /* .prev   = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
-        /* .cur    = */ {},
-        /* .cur_p  = */ {},
-    };
-
-    llama_sampler_chain_add(result->chain,
-            llama_sampler_init_logit_bias(
-                llama_n_vocab(model),
-                params.logit_bias.size(),
-                params.logit_bias.data()));
-
-    llama_sampler_chain_add(result->chain,
-            llama_sampler_init_penalties(
-                llama_n_vocab  (model),
-                llama_token_eos(model),
-                llama_token_nl (model),
-                params.penalty_last_n,
-                params.penalty_repeat,
-                params.penalty_freq,
-                params.penalty_present,
-                params.penalize_nl,
-                params.ignore_eos));
-
-    if (params.temp > 0.0f) {
-        if (params.mirostat == 0) {
-            for (const auto & cnstr : params.samplers) {
-                switch (cnstr) {
-                    case GPT_SAMPLER_TYPE_TOP_K:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_k    (params.top_k));
-                        break;
-                    case GPT_SAMPLER_TYPE_TOP_P:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_p    (params.top_p, params.min_keep));
-                        break;
-                    case GPT_SAMPLER_TYPE_MIN_P:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_min_p    (params.min_p, params.min_keep));
-                        break;
-                    case GPT_SAMPLER_TYPE_TFS_Z:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep));
-                        break;
-                    case GPT_SAMPLER_TYPE_TYPICAL_P:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_typical  (params.typ_p, params.min_keep));
-                        break;
-                    case GPT_SAMPLER_TYPE_TEMPERATURE:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
-                        break;
-                    default:
-                        GGML_ASSERT(false && "unknown sampler type");
-                }
+std::string llama_sampling_order_print(const llama_sampling_params & params) {
+    std::string result = "CFG -> Penalties ";
+    if (params.mirostat == 0) {
+        for (auto sampler_type : params.samplers_sequence) {
+            const auto sampler_type_name = sampler_type_to_name_string(sampler_type);
+            if (!sampler_type_name.empty()) {
+                result += "-> " + sampler_type_name + " ";
            }
-            llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
-            llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
-        } else if (params.mirostat == 1) {
-            llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
-            llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
-        } else if (params.mirostat == 2) {
-            llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
-            llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
-        } else {
-            GGML_ASSERT(false && "unknown mirostat version");
        }
    } else {
-        llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
-        llama_sampler_chain_add(result->chain, llama_sampler_init_greedy());
+        result += "-> mirostat ";
    }

    return result;
 }

-void gpt_sampler_free(struct gpt_sampler * gsmpl) {
-    if (gsmpl) {
-        llama_sampler_free(gsmpl->grmr);
+// no reasons to expose this function in header
+static void sampler_queue(
+                   struct llama_context * ctx_main,
+            const llama_sampling_params & params,
+                 llama_token_data_array & cur_p,
+                                 size_t   min_keep) {
+    const float         temp              = params.temp;
+    const float         dynatemp_range    = params.dynatemp_range;
+    const float         dynatemp_exponent = params.dynatemp_exponent;
+    const int32_t       top_k             = params.top_k;
+    const float         top_p             = params.top_p;
+    const float         min_p             = params.min_p;
+    const float         tfs_z             = params.tfs_z;
+    const float         typical_p         = params.typical_p;
+    const std::vector<llama_sampler_type> & samplers_sequence = params.samplers_sequence;

-        llama_sampler_free(gsmpl->chain);
-
-        delete gsmpl;
+    for (auto sampler_type : samplers_sequence) {
+        switch (sampler_type) {
+            case llama_sampler_type::TOP_K    : llama_sample_top_k    (ctx_main, &cur_p, top_k,     min_keep); break;
+            case llama_sampler_type::TFS_Z    : llama_sample_tail_free(ctx_main, &cur_p, tfs_z,     min_keep); break;
+            case llama_sampler_type::TYPICAL_P: llama_sample_typical  (ctx_main, &cur_p, typical_p, min_keep); break;
+            case llama_sampler_type::TOP_P    : llama_sample_top_p    (ctx_main, &cur_p, top_p,     min_keep); break;
+            case llama_sampler_type::MIN_P    : llama_sample_min_p    (ctx_main, &cur_p, min_p,     min_keep); break;
+            case llama_sampler_type::TEMPERATURE:
+                if (dynatemp_range > 0) {
+                    float dynatemp_min = std::max(0.0f, temp - dynatemp_range);
+                    float dynatemp_max = std::max(0.0f, temp + dynatemp_range);
+                    llama_sample_entropy(ctx_main, &cur_p, dynatemp_min, dynatemp_max, dynatemp_exponent);
+                } else {
+                    llama_sample_temp(ctx_main, &cur_p, temp);
+                }
+                break;
+            default : break;
+        }
    }
 }

-void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool accept_grammar) {
-    if (accept_grammar) {
-        llama_sampler_accept(gsmpl->grmr, token);
+static llama_token llama_sampling_sample_impl(
+                  struct llama_sampling_context * ctx_sampling,
+                  struct llama_context * ctx_main,
+                  struct llama_context * ctx_cfg,
+                  const int idx,
+                  bool is_resampling) {  // Add a parameter to indicate if we are resampling
+    const llama_sampling_params & params = ctx_sampling->params;
+
+    const float   temp            = params.temp;
+    const int     mirostat        = params.mirostat;
+    const float   mirostat_tau    = params.mirostat_tau;
+    const float   mirostat_eta    = params.mirostat_eta;
+
+    std::vector<float> original_logits;
+    auto cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx, !is_resampling, &original_logits);
+    if (!is_resampling) {
+        GGML_ASSERT(!original_logits.empty());
    }
+    llama_token id = 0;
+    // Get a pointer to the logits
+    float * logits = llama_get_logits_ith(ctx_main, idx);

-    llama_sampler_accept(gsmpl->chain, token);
+    if (temp < 0.0) {
+        // greedy sampling, with probs
+        llama_sample_softmax(ctx_main, &cur_p);
+        id = cur_p.data[0].id;
+    } else if (temp == 0.0) {
+        // greedy sampling, no probs
+        id = llama_sample_token_greedy(ctx_main, &cur_p);
+    } else {
+        if (mirostat == 1) {
+            const int mirostat_m = 100;
+            llama_sample_temp(ctx_main, &cur_p, temp);
+            id = llama_sample_token_mirostat(ctx_main, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &ctx_sampling->mirostat_mu);
+        } else if (mirostat == 2) {
+            llama_sample_temp(ctx_main, &cur_p, temp);
+            id = llama_sample_token_mirostat_v2(ctx_main, &cur_p, mirostat_tau, mirostat_eta, &ctx_sampling->mirostat_mu);
+        } else {
+            // temperature sampling
+            size_t min_keep = std::max(1, params.min_keep);

-    gsmpl->prev.push_back(token);
-}
+            sampler_queue(ctx_main, params, cur_p, min_keep);

-void gpt_sampler_reset(struct gpt_sampler * gsmpl) {
-    llama_sampler_reset(gsmpl->grmr);
+            id = llama_sample_token_with_rng(ctx_main, &cur_p, ctx_sampling->rng);

-    llama_sampler_reset(gsmpl->chain);
-}
+            //{
+            //    const int n_top = 10;
+            //    LOG("top %d candidates:\n", n_top);

-struct gpt_sampler * gpt_sampler_clone(gpt_sampler * gsmpl) {
-    return new gpt_sampler {
-        /* .params = */ gsmpl->params,
-        /* .grmr   = */ llama_sampler_clone(gsmpl->grmr),
-        /* .chain  = */ llama_sampler_clone(gsmpl->chain),
-        /* .prev   = */ gsmpl->prev,
-        /* .cur    = */ gsmpl->cur,
-        /* .cur_p  = */ gsmpl->cur_p,
-    };
-}
+            //    for (int i = 0; i < n_top; i++) {
+            //        const llama_token id = cur_p.data[i].id;
+            //        (void)id; // To avoid a warning that id is unused when logging is disabled.
+            //        LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx_main, id).c_str(), cur_p.data[i].p);
+            //    }
+            //}

-void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl) {
-    // TODO: measure grammar performance
-
-    if (gsmpl) {
-        llama_perf_sampler_print(gsmpl->chain);
-    }
-    if (ctx) {
-        llama_perf_context_print(ctx);
-    }
-}
-
-llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
-    gsmpl->set_logits(ctx, idx);
-
-    auto & grmr  = gsmpl->grmr;
-    auto & chain = gsmpl->chain;
-    auto & cur_p = gsmpl->cur_p; // initialized by set_logits
-
-    if (grammar_first) {
-        llama_sampler_apply(grmr, &cur_p);
-    }
-
-    llama_sampler_apply(chain, &cur_p);
-
-    GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");
-
-    const llama_token id = cur_p.data[cur_p.selected].id;
-
-    if (grammar_first) {
-        return id;
-    }
-
-    // check if it the sampled token fits the grammar
-    {
-        llama_token_data       single_token_data       = { id, 1.0f, 0.0f };
-        llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };
-
-        llama_sampler_apply(grmr, &single_token_data_array);
-
-        const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
-        if (is_valid) {
-            return id;
+            //LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx_main, id).c_str());
        }
    }

-    // resampling:
-    // if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
-    gsmpl->set_logits(ctx, idx);
+    if (ctx_sampling->grammar != NULL && !is_resampling) {
+        // Create an array with a single token data element for the sampled id
+        llama_token_data single_token_data = {id, logits[id], 0.0f};
+        llama_token_data_array single_token_data_array = { &single_token_data, 1, false };

-    llama_sampler_apply(grmr,  &cur_p);
-    llama_sampler_apply(chain, &cur_p);
+        // Apply grammar constraints to the single token
+        llama_sample_grammar(ctx_main, &single_token_data_array, ctx_sampling->grammar);

-    GGML_ASSERT(cur_p.selected != -1 && "no selected token during re-sampling - check your sampling configuration");
+        // Check if the token is valid according to the grammar by seeing if its logit has been set to -INFINITY
+        bool is_valid = single_token_data_array.data[0].logit != -INFINITY;

-    return cur_p.data[cur_p.selected].id;
-}
+        // If the token is not valid according to the grammar, perform resampling
+        if (!is_valid) {
+            LOG("Resampling because token %d: '%s' does not meet grammar rules\n", id, llama_token_to_piece(ctx_main, id).c_str());

-uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl) {
-    return llama_sampler_get_seed(gsmpl->chain);
-}
+            // Restore logits from the copy
+            std::copy(original_logits.begin(), original_logits.end(), logits);

-// helpers
-
-llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl) {
-    return &gsmpl->cur_p;
-}
-
-llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl) {
-    return gsmpl->prev.rat(0);
-}
-
-std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) {
-    std::string result = "logits ";
-
-    for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
-        const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
-        result += std::string("-> ") + llama_sampler_name(smpl) + " ";
+            return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, true);  // Pass true for is_resampling
+        }
    }

-    return result;
+    ctx_sampling->n_valid = temp == 0.0f ? 0 : cur_p.size;
+
+    return id;
 }

-std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main, int n) {
-    n = std::min(n, (int) gsmpl->prev.size());
+static llama_token_data_array llama_sampling_prepare_impl(
+                  struct llama_sampling_context * ctx_sampling,
+                  struct llama_context * ctx_main,
+                  struct llama_context * ctx_cfg,
+                  const int idx,
+                  bool apply_grammar,
+                  std::vector<float> * original_logits) {
+    const llama_sampling_params & params = ctx_sampling->params;

-    if (n <= 0) {
-        return "";
+    const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
+
+    const int32_t penalty_last_n  = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
+    const float   penalty_repeat  = params.penalty_repeat;
+    const float   penalty_freq    = params.penalty_freq;
+    const float   penalty_present = params.penalty_present;
+
+    const bool    penalize_nl     = params.penalize_nl;
+
+    auto & prev = ctx_sampling->prev;
+    auto & cur  = ctx_sampling->cur;
+
+    // Get a pointer to the logits
+    float * logits = llama_get_logits_ith(ctx_main, idx);
+
+    if (apply_grammar && original_logits != NULL) {
+        // Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this.
+        *original_logits = {logits, logits + llama_n_vocab(llama_get_model(ctx_main))};
    }

-    std::string result;
-    result.reserve(8*n); // 8 is the average length of a token [citation needed], TODO: compute this from the vocab
-
-    for (int i = n - 1; i >= 0; i--) {
-        const llama_token id = gsmpl->prev.rat(i);
-
-        GGML_ASSERT(id != LLAMA_TOKEN_NULL && "null token in the sampling history - should not happen");
-
-        result += llama_token_to_piece(ctx_main, id);
+    // apply params.logit_bias map
+    for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
+        logits[it->first] += it->second;
    }

-    return result;
-}
-
-char gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr) {
-    switch (cnstr) {
-        case GPT_SAMPLER_TYPE_TOP_K:       return 'k';
-        case GPT_SAMPLER_TYPE_TFS_Z:       return 'f';
-        case GPT_SAMPLER_TYPE_TYPICAL_P:   return 'y';
-        case GPT_SAMPLER_TYPE_TOP_P:       return 'p';
-        case GPT_SAMPLER_TYPE_MIN_P:       return 'm';
-        case GPT_SAMPLER_TYPE_TEMPERATURE: return 't';
-        default : return '?';
+    if (ctx_cfg) {
+        float * logits_guidance = llama_get_logits_ith(ctx_cfg, idx);
+        llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);
    }
-}

-std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr) {
-    switch (cnstr) {
-        case GPT_SAMPLER_TYPE_TOP_K:       return "top_k";
-        case GPT_SAMPLER_TYPE_TFS_Z:       return "tfs_z";
-        case GPT_SAMPLER_TYPE_TYPICAL_P:   return "typ_p";
-        case GPT_SAMPLER_TYPE_TOP_P:       return "top_p";
-        case GPT_SAMPLER_TYPE_MIN_P:       return "min_p";
-        case GPT_SAMPLER_TYPE_TEMPERATURE: return "temperature";
-        default : return "";
+    cur.clear();
+
+    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+        cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
    }
-}

-std::vector<gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
-    std::unordered_map<std::string, gpt_sampler_type> sampler_canonical_name_map {
-        { "top_k",       GPT_SAMPLER_TYPE_TOP_K },
-        { "top_p",       GPT_SAMPLER_TYPE_TOP_P },
-        { "typ_p",       GPT_SAMPLER_TYPE_TYPICAL_P },
-        { "min_p",       GPT_SAMPLER_TYPE_MIN_P },
-        { "tfs_z",       GPT_SAMPLER_TYPE_TFS_Z },
-        { "temperature", GPT_SAMPLER_TYPE_TEMPERATURE },
-    };
+    llama_token_data_array cur_p = { cur.data(), cur.size(), false };

-    // since samplers names are written multiple ways
-    // make it ready for both system names and input names
-    std::unordered_map<std::string, gpt_sampler_type> sampler_alt_name_map {
-        { "top-k",       GPT_SAMPLER_TYPE_TOP_K },
-        { "top-p",       GPT_SAMPLER_TYPE_TOP_P },
-        { "nucleus",     GPT_SAMPLER_TYPE_TOP_P },
-        { "typical-p",   GPT_SAMPLER_TYPE_TYPICAL_P },
-        { "typical",     GPT_SAMPLER_TYPE_TYPICAL_P },
-        { "typ-p",       GPT_SAMPLER_TYPE_TYPICAL_P },
-        { "typ",         GPT_SAMPLER_TYPE_TYPICAL_P },
-        { "min-p",       GPT_SAMPLER_TYPE_MIN_P },
-        { "tfs-z",       GPT_SAMPLER_TYPE_TFS_Z },
-        { "tfs",         GPT_SAMPLER_TYPE_TFS_Z },
-        { "temp",        GPT_SAMPLER_TYPE_TEMPERATURE },
-    };
+    // apply penalties
+    const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev;
+    const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n);
+    if (penalty_tokens_used_size) {
+        const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];

-    std::vector<gpt_sampler_type> samplers;
-    samplers.reserve(names.size());
+        llama_sample_repetition_penalties(ctx_main, &cur_p,
+                penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size,
+                penalty_tokens_used_size, penalty_repeat, penalty_freq, penalty_present);

-    for (const auto & name : names) {
-        auto sampler = sampler_canonical_name_map.find(name);
-        if (sampler != sampler_canonical_name_map.end()) {
-            samplers.push_back(sampler->second);
-        } else {
-            if (allow_alt_names) {
-                sampler = sampler_alt_name_map.find(name);
-                if (sampler != sampler_alt_name_map.end()) {
-                    samplers.push_back(sampler->second);
+        if (!penalize_nl) {
+            for (size_t idx = 0; idx < cur_p.size; idx++) {
+                if (cur_p.data[idx].id == llama_token_nl(llama_get_model(ctx_main))) {
+                    cur_p.data[idx].logit = nl_logit;
+                    break;
                }
            }
        }
    }

-    return samplers;
-}
-
-std::vector<gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars) {
-    std::unordered_map<char, gpt_sampler_type> sampler_name_map = {
-        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_K),       GPT_SAMPLER_TYPE_TOP_K },
-        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TFS_Z),       GPT_SAMPLER_TYPE_TFS_Z },
-        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TYPICAL_P),   GPT_SAMPLER_TYPE_TYPICAL_P },
-        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_P),       GPT_SAMPLER_TYPE_TOP_P },
-        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_MIN_P),       GPT_SAMPLER_TYPE_MIN_P },
-        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TEMPERATURE), GPT_SAMPLER_TYPE_TEMPERATURE }
-    };
-
-    std::vector<gpt_sampler_type> samplers;
-    samplers.reserve(chars.size());
-
-    for (const auto & c : chars) {
-        const auto sampler = sampler_name_map.find(c);
-        if (sampler != sampler_name_map.end()) {
-            samplers.push_back(sampler->second);
-        }
+    // apply grammar checks before sampling logic
+    if (apply_grammar && ctx_sampling->grammar != NULL) {
+        llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
    }

-    return samplers;
+    return cur_p;
+}
+
+llama_token llama_sampling_sample(
+                  struct llama_sampling_context * ctx_sampling,
+                  struct llama_context * ctx_main,
+                  struct llama_context * ctx_cfg,
+                  const int idx) {
+    // Call the implementation function with is_resampling set to false by default
+    return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, false);
+}
+
+llama_token_data_array llama_sampling_prepare(
+                  struct llama_sampling_context * ctx_sampling,
+                  struct llama_context * ctx_main,
+                  struct llama_context * ctx_cfg,
+                  const int idx,
+                  bool apply_grammar,
+                  std::vector<float> * original_logits) {
+    return llama_sampling_prepare_impl(ctx_sampling,ctx_main, ctx_cfg, idx, apply_grammar, original_logits);
+}
+
+void llama_sampling_accept(
+        struct llama_sampling_context * ctx_sampling,
+        struct llama_context * ctx_main,
+        llama_token id,
+        bool apply_grammar) {
+    ctx_sampling->prev.erase(ctx_sampling->prev.begin());
+    ctx_sampling->prev.push_back(id);
+
+    if (ctx_sampling->grammar != NULL && apply_grammar) {
+        llama_grammar_accept_token(ctx_main, ctx_sampling->grammar, id);
+    }
 }
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -2,82 +2,154 @@

 #include "llama.h"

-#include "common.h"
+#include "grammar-parser.h"

+#include <random>
 #include <string>
+#include <unordered_map>
 #include <vector>

-// gpt_sampler extends llama_sampler with additional functionality:
+// sampler types
+enum class llama_sampler_type : char {
+    TOP_K       = 'k',
+    TOP_P       = 'p',
+    MIN_P       = 'm',
+    TFS_Z       = 'f',
+    TYPICAL_P   = 'y',
+    TEMPERATURE = 't'
+};
+
+// sampling parameters
+typedef struct llama_sampling_params {
+    int32_t     n_prev                = 64;                 // number of previous tokens to remember
+    int32_t     n_probs               = 0;                  // if greater than 0, output the probabilities of top n_probs tokens.
+    int32_t     min_keep              = 0;                  // 0 = disabled, otherwise samplers should return at least min_keep tokens
+    int32_t     top_k                 = 40;                 // <= 0 to use vocab size
+    float       top_p                 = 0.95f;              // 1.0 = disabled
+    float       min_p                 = 0.05f;              // 0.0 = disabled
+    float       tfs_z                 = 1.00f;              // 1.0 = disabled
+    float       typical_p             = 1.00f;              // 1.0 = disabled
+    float       temp                  = 0.80f;              // <= 0.0 to sample greedily, 0.0 to not output probabilities
+    float       dynatemp_range        = 0.00f;              // 0.0 = disabled
+    float       dynatemp_exponent     = 1.00f;              // controls how entropy maps to temperature in dynamic temperature sampler
+    int32_t     penalty_last_n        = 64;                 // last n tokens to penalize (0 = disable penalty, -1 = context size)
+    float       penalty_repeat        = 1.00f;              // 1.0 = disabled
+    float       penalty_freq          = 0.00f;              // 0.0 = disabled
+    float       penalty_present       = 0.00f;              // 0.0 = disabled
+    int32_t     mirostat              = 0;                  // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    float       mirostat_tau          = 5.00f;              // target entropy
+    float       mirostat_eta          = 0.10f;              // learning rate
+    bool        penalize_nl           = false;              // consider newlines as a repeatable token
+    uint32_t    seed                  = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampling_context
+
+    std::vector<llama_sampler_type> samplers_sequence = {
+        llama_sampler_type::TOP_K,
+        llama_sampler_type::TFS_Z,
+        llama_sampler_type::TYPICAL_P,
+        llama_sampler_type::TOP_P,
+        llama_sampler_type::MIN_P,
+        llama_sampler_type::TEMPERATURE
+    };
+
+    std::string grammar;  // optional BNF-like grammar to constrain sampling
+
+    // Classifier-Free Guidance
+    // https://arxiv.org/abs/2306.17806
+    std::string cfg_negative_prompt; // string to help guidance
+    float       cfg_scale     = 1.f; // how strong is guidance
+
+    std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
+
+    std::vector<llama_token> penalty_prompt_tokens;
+    bool                     use_penalty_prompt_tokens = false;
+} llama_sampling_params;
+
+// general sampler context
+// TODO: move to llama.h
+struct llama_sampling_context {
+    // parameters that will be used for sampling
+    llama_sampling_params params;
+
+    // mirostat sampler state
+    float mirostat_mu;
+
+    llama_grammar * grammar;
+
+    // internal
+    grammar_parser::parse_state parsed_grammar;
+
+    // TODO: replace with ring-buffer
+    std::vector<llama_token>      prev;
+    std::vector<llama_token_data> cur;
+    size_t n_valid; // Number of correct top tokens with correct probabilities.
+
+    std::mt19937 rng;
+};
+
+#include "common.h"
+
+// Create a new sampling context instance.
+struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params);
+
+void llama_sampling_free(struct llama_sampling_context * ctx);
+
+// Reset the sampler context
+// - clear prev tokens
+// - reset grammar
+void llama_sampling_reset(llama_sampling_context * ctx);
+
+// Set the sampler seed
+void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed);
+
+// Copy the sampler context
+void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst);
+
+// Get the last sampled token
+llama_token llama_sampling_last(llama_sampling_context * ctx);
+
+// Get a string representation of the last sampled tokens
+std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n);
+
+// Print sampling parameters into a string
+std::string llama_sampling_print(const llama_sampling_params & params);
+
+// Print sampling order into a string
+std::string llama_sampling_order_print(const llama_sampling_params & params);
+
+// this is a common sampling function used across the examples for convenience
+// it can serve as a starting point for implementing your own sampling function
+// Note: When using multiple sequences, it is the caller's responsibility to call
+//       llama_sampling_reset when a sequence ends
 //
-//  - grammar support
-//  - custom sampler logic based on the parameters
-//  - history of the last accepted tokens
-//  - performance metrics
+// required:
+//  - ctx_main:     context to use for sampling
+//  - ctx_sampling: sampling-specific context
 //
-// This goal is to have a common implementation of the sampling logic shared across the examples.
-// For example, depending on the temperature, the sampling chain can be very simple (greedy) or more
-// complex (top-k, top-p, etc).
+// optional:
+//  - ctx_cfg:      context to use for classifier-free guidance
+//  - idx:          sample from llama_get_logits_ith(ctx, idx)
 //
-// Another example is related to the grammar. In general, the grammar constraints applied on the full
-// vocabulary can be very taxing. To improve performance, the grammar can be applied only to the sampled
-// token in order to verify if it fits the grammar. And only if the token doesn't fit the grammar, the
-// grammar constraints are applied to the full vocabulary and the token is resampled.
-//
-// The gpt_sampler also maintains a container with the last accepted tokens. In the future, this can
-// be moved into the core llama library.
-//
-// For convenience, the gpt_sampler also maintains a container with the current candidate tokens.
-// This can be used to access the probabilities of the rest of the non-sampled tokens.
-//
-// TODO: measure grammar performance
+// returns:
+//  - token:      sampled token
+//  - candidates: vector of candidate tokens
 //
+llama_token llama_sampling_sample(
+        struct llama_sampling_context * ctx_sampling,
+        struct llama_context * ctx_main,
+        struct llama_context * ctx_cfg,
+        int idx = -1);

-struct gpt_sampler;
+// Prepares and adjusts the set of token candidates for sampling based on penalties, biases, and sampling parameters.
+llama_token_data_array llama_sampling_prepare(
+        struct llama_sampling_context * ctx_sampling,
+        struct llama_context * ctx_main,
+        struct llama_context * ctx_cfg,
+        int idx = 0,
+        bool apply_grammar = true,
+        std::vector<float> * original_logits = nullptr);

-// llama_sampler API overloads
-
-struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params);
-
-void gpt_sampler_free(struct gpt_sampler * gsmpl);
-
-// if accept_grammar is true, the token is accepted both by the sampling chain and the grammar
-void                 gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool accept_grammar);
-void                 gpt_sampler_reset (struct gpt_sampler * gsmpl);
-struct gpt_sampler * gpt_sampler_clone (struct gpt_sampler * gsmpl);
-
-// arguments can be nullptr to skip printing
-void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl);
-
-// extended sampling implementation:
-//
-// - set logits
-// - apply the configured sampler chain
-// - check if the token fits the grammar (if any)
-// - if not: resample by first applying the grammar constraints and then sampling again (slower path)
-//
-// if grammar_first is true, the grammar is applied before the samplers (slower)
-// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
-//
-llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
-
-uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl);
-
-// helpers
-
-// access the internal list of current candidate tokens
-llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl);
-
-// get the last accepted token
-llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl);
-
-// print the sampler chain into a string
-std::string gpt_sampler_print(const struct gpt_sampler * gsmpl);
-
-// get a string representation of the last accepted tokens
-std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx, int n);
-
-char        gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr);
-std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr);
-
-std::vector<enum gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
-std::vector<enum gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars);
+void llama_sampling_accept(
+        struct llama_sampling_context * ctx_sampling,
+        struct llama_context * ctx_main,
+        llama_token id,
+        bool apply_grammar);
--- a/common/stb_image.h
+++ b/common/stb_image.h
--- a/common/train.cpp
+++ b/common/train.cpp
@@ -1,11 +1,9 @@
 #include "train.h"
 #include "common.h"

-#include <algorithm>
 #include <random>
 #include <sstream>
 #include <functional>
-#include <cstring>

 struct random_normal_distribution {
    std::mt19937 gen;
@@ -1054,7 +1052,7 @@ struct train_params_common get_default_train_params_common() {

    params.custom_n_ctx = false;

-    params.use_flash              = false;
+    params.use_flash              = true;
    params.use_checkpointing      = true;

    params.sample_start           = "";
@@ -1382,7 +1380,7 @@ bool consume_common_train_arg(

 void finish_processing_train_args(struct train_params_common * params) {
    if (params->escape) {
-        string_process_escapes(params->sample_start);
+        process_escapes(params->sample_start);
    }
 }

--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@@ -1,8 +1,7 @@
 #!/usr/bin/env python3
-# -*- coding: utf-8 -*-

 # This script downloads the tokenizer models of the specified models from Huggingface and
-# generates the get_vocab_base_pre() function for convert_hf_to_gguf.py
+# generates the get_vocab_base_pre() function for convert-hf-to-gguf.py
 #
 # This is necessary in order to analyze the type of pre-tokenizer used by the model and
 # provide the necessary information to llama.cpp via the GGUF header in order to implement
@@ -15,9 +14,9 @@
 # - Add a new model to the "models" list
 # - Run the script with your huggingface token:
 #
-#   python3 convert_hf_to_gguf_update.py <huggingface_token>
+#   python3 convert-hf-to-gguf-update.py <huggingface_token>
 #
-# - Copy-paste the generated get_vocab_base_pre() function into convert_hf_to_gguf.py
+# - Copy-paste the generated get_vocab_base_pre() function into convert-hf-to-gguf.py
 # - Update llama.cpp with the new pre-tokenizer if necessary
 #
 # TODO: generate tokenizer tests for llama.cpp
@@ -31,14 +30,13 @@ import re
 import requests
 import sys
 import json
-import shutil

 from hashlib import sha256
 from enum import IntEnum, auto
 from transformers import AutoTokenizer

 logging.basicConfig(level=logging.DEBUG)
-logger = logging.getLogger("convert_hf_to_gguf_update")
+logger = logging.getLogger("convert-hf-to-gguf-update")
 sess = requests.Session()


@@ -46,21 +44,20 @@ class TOKENIZER_TYPE(IntEnum):
    SPM = auto()
    BPE = auto()
    WPM = auto()
-    UGM = auto()


 # TODO: this string has to exercise as much pre-tokenizer functionality as possible
 #       will be updated with time - contributions welcome
-CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
+chktxt = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'

 if len(sys.argv) == 2:
    token = sys.argv[1]
    if not token.startswith("hf_"):
        logger.info("Huggingface token seems invalid")
-        logger.info("Usage: python convert_hf_to_gguf_update.py <huggingface_token>")
+        logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
        sys.exit(1)
 else:
-    logger.info("Usage: python convert_hf_to_gguf_update.py <huggingface_token>")
+    logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
    sys.exit(1)

 # TODO: add models here, base models preferred
@@ -75,7 +72,7 @@ models = [
    {"name": "mpt",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
    {"name": "starcoder",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
    {"name": "gpt-2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
-    {"name": "stablelm2",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", },
+    {"name": "stablelm",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", },
    {"name": "refact",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
    {"name": "command-r",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
    {"name": "qwen2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
@@ -84,21 +81,6 @@ models = [
    {"name": "jina-v2-en",     "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
    {"name": "jina-v2-es",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
    {"name": "jina-v2-de",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
-    {"name": "smaug-bpe",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
-    {"name": "poro-chat",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
-    {"name": "jina-v2-code",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
-    {"name": "viking",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B
-    {"name": "gemma",          "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2b", },
-    {"name": "gemma-2",        "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", },
-    {"name": "jais",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", },
-    {"name": "t5",             "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", },
-    {"name": "codeshell",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", },
-    {"name": "tekken",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },
-    {"name": "smollm",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", },
-    {'name': "bloom",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom", },
-    {'name': "gpt3-finnish",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", },
-    {"name": "exaone",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
-    {"name": "phi-2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
 ]


@@ -107,8 +89,8 @@ def download_file_with_auth(url, token, save_path):
    response = sess.get(url, headers=headers)
    response.raise_for_status()
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
-    with open(save_path, 'wb') as downloaded_file:
-        downloaded_file.write(response.content)
+    with open(save_path, 'wb') as f:
+        f.write(response.content)
    logger.info(f"File {save_path} downloaded successfully")


@@ -120,34 +102,15 @@ def download_model(model):
    os.makedirs(f"models/tokenizers/{name}", exist_ok=True)

    files = ["config.json", "tokenizer.json", "tokenizer_config.json"]
-
    if tokt == TOKENIZER_TYPE.SPM:
        files.append("tokenizer.model")

-    if tokt == TOKENIZER_TYPE.UGM:
-        files.append("spiece.model")
-
-    if os.path.isdir(repo):
-        # If repo is a path on the file system, copy the directory
-        for file in files:
-            src_path = os.path.join(repo, file)
-            dst_path = f"models/tokenizers/{name}/{file}"
-            if os.path.isfile(dst_path):
-                logger.info(f"{name}: File {dst_path} already exists - skipping")
-                continue
-            if os.path.isfile(src_path):
-                shutil.copy2(src_path, dst_path)
-                logger.info(f"{name}: Copied {src_path} to {dst_path}")
-            else:
-                logger.warning(f"{name}: Source file {src_path} does not exist")
-    else:
-        # If repo is a URL, download the files
-        for file in files:
-            save_path = f"models/tokenizers/{name}/{file}"
-            if os.path.isfile(save_path):
-                logger.info(f"{name}: File {save_path} already exists - skipping")
-                continue
-            download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)
+    for file in files:
+        save_path = f"models/tokenizers/{name}/{file}"
+        if os.path.isfile(save_path):
+            logger.info(f"{name}: File {save_path} already exists - skipping")
+            continue
+        download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)


 for model in models:
@@ -157,14 +120,14 @@ for model in models:
        logger.error(f"Failed to download model {model['name']}. Error: {e}")


-# generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:
+# generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function:

 src_ifs = ""
 for model in models:
    name = model["name"]
    tokt = model["tokt"]

-    if tokt == TOKENIZER_TYPE.SPM or tokt == TOKENIZER_TYPE.UGM:
+    if tokt == TOKENIZER_TYPE.SPM:
        continue

    # Skip if the tokenizer folder does not exist or there are other download issues previously
@@ -174,15 +137,12 @@ for model in models:

    # create the tokenizer
    try:
-        if name == "t5":
-            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
-        else:
-            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
+        tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
    except OSError as e:
        logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
        continue  # Skip to the next model if the tokenizer can't be loaded

-    chktok = tokenizer.encode(CHK_TXT)
+    chktok = tokenizer.encode(chktxt)
    chkhsh = sha256(str(chktok).encode()).hexdigest()

    logger.info(f"model: {name}")
@@ -214,7 +174,7 @@ src_func = f"""
        # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
        # use in llama.cpp to implement the same pre-tokenizer

-        chktxt = {repr(CHK_TXT)}
+        chktxt = {repr(chktxt)}

        chktok = tokenizer.encode(chktxt)
        chkhsh = sha256(str(chktok).encode()).hexdigest()
@@ -224,7 +184,7 @@ src_func = f"""

        res = None

-        # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script
+        # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script
        #       or pull the latest version of the model from Huggingface
        #       don't edit the hashes manually!
 {src_ifs}
@@ -233,9 +193,9 @@ src_func = f"""
            logger.warning("**************************************************************************************")
            logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
            logger.warning("**          There are 2 possible reasons for this:")
-            logger.warning("**          - the model has not been added to convert_hf_to_gguf_update.py yet")
+            logger.warning("**          - the model has not been added to convert-hf-to-gguf-update.py yet")
            logger.warning("**          - the pre-tokenization config has changed upstream")
-            logger.warning("**          Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
+            logger.warning("**          Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
            logger.warning("** ref:     https://github.com/ggerganov/llama.cpp/pull/6920")
            logger.warning("**")
            logger.warning(f"** chkhsh:  {{chkhsh}}")
@@ -249,8 +209,8 @@ src_func = f"""
        return res
 """

-convert_py_pth = pathlib.Path("convert_hf_to_gguf.py")
-convert_py = convert_py_pth.read_text(encoding="utf-8")
+convert_py_pth = pathlib.Path("convert-hf-to-gguf.py")
+convert_py = convert_py_pth.read_text()
 convert_py = re.sub(
    r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
    lambda m: m.group(1) + src_func + m.group(3),
@@ -258,9 +218,9 @@ convert_py = re.sub(
    flags=re.DOTALL | re.MULTILINE,
 )

-convert_py_pth.write_text(convert_py, encoding="utf-8")
+convert_py_pth.write_text(convert_py)

-logger.info("+++ convert_hf_to_gguf.py was updated")
+logger.info("+++ convert-hf-to-gguf.py was updated")

 # generate tests for each tokenizer model

@@ -298,7 +258,6 @@ tests = [
    "\n =",
    "' era",
    "Hello, y'all! How are you 😁 ?我想在apple工作1314151天～",
-    "!!!!!!",
    "3",
    "33",
    "333",
@@ -308,9 +267,8 @@ tests = [
    "3333333",
    "33333333",
    "333333333",
-    "Cửa Việt", # llama-bpe fails on this
-    " discards",
-    CHK_TXT,
+    # "Cửa Việt", # llama-bpe fails on this
+    chktxt,
 ]

 # write the tests to ./models/ggml-vocab-{name}.gguf.inp
@@ -337,10 +295,7 @@ for model in models:

    # create the tokenizer
    try:
-        if name == "t5":
-            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
-        else:
-            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
+        tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
    except OSError as e:
        logger.error(f"Failed to load tokenizer for model {name}. Error: {e}")
        continue  # Skip this model and continue with the next one in the loop
@@ -366,6 +321,6 @@ logger.info("\nRun the following commands to generate the vocab files for testin
 for model in models:
    name = model["name"]

-    print(f"python3 convert_hf_to_gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only") # noqa: NP100
+    print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only") # noqa: NP100

 logger.info("\n")
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
--- a/convert-llama-ggml-to-gguf.py
+++ b/convert-llama-ggml-to-gguf.py
@@ -116,7 +116,7 @@ class Tensor:
        assert quant is not None, 'Unknown tensor type'
        (blksize, tysize) = quant
        offset += 12
-        self.dtype= gguf.GGMLQuantizationType(dtype)
+        self.dtype= dtype
        self.dims = struct.unpack(f'<{n_dims}I', data[offset:offset + (4 * n_dims)])
        offset += 4 * n_dims
        self.name = bytes(data[offset:offset + name_len])
@@ -132,10 +132,6 @@ class Tensor:


 class GGMLModel:
-
-    file_format: GGMLFormat
-    format_version: int
-
    def __init__(self):
        self.hyperparameters = None
        self.vocab = None
@@ -294,7 +290,7 @@ class GGMLToGGUF:
        if self.vocab_override is not None:
            vo = self.vocab_override
            logger.info('* Adding vocab item(s)')
-            for (_, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
+            for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
                tokens.append(vbytes)
                scores.append(score)
                toktypes.append(ttype)
@@ -358,8 +354,7 @@ class GGMLToGGUF:


 def handle_metadata(cfg, hp):
-    import examples.convert_legacy_llama as convert
-
+    import convert
    assert cfg.model_metadata_dir.is_dir(), 'Metadata dir is not a directory'
    hf_config_path   = cfg.model_metadata_dir / "config.json"
    orig_config_path = cfg.model_metadata_dir / "params.json"
--- a/examples/convert_legacy_llama.py
+++ b/examples/convert_legacy_llama.py
@@ -24,16 +24,14 @@ from abc import ABC, abstractmethod
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
 from dataclasses import dataclass
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, IO, Iterable, Literal, TypeVar
+from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable, Optional

 import numpy as np
+from sentencepiece import SentencePieceProcessor

 if 'NO_LOCAL_GGUF' not in os.environ:
-    # use .parent.parent since we are in "examples" directory
-    sys.path.insert(1, str(Path(__file__).parent.parent / 'gguf-py'))
-
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf
-from gguf import BaseVocab, Vocab, NoVocab, BpeVocab, SentencePieceVocab, LlamaHfVocab

 if TYPE_CHECKING:
    from typing_extensions import Self, TypeAlias
@@ -176,7 +174,7 @@ class Params:
    rope_scaling_type: gguf.RopeScalingType | None = None
    f_rope_freq_base: float | None = None
    f_rope_scale: float | None = None
-    n_ctx_orig: int | None = None
+    n_orig_ctx: int | None = None
    rope_finetuned: bool | None = None

    ftype: GGMLFileType | None = None
@@ -226,7 +224,7 @@ class Params:
        with open(config_path) as f:
            config = json.load(f)

-        rope_scaling_type = f_rope_scale = n_ctx_orig = rope_finetuned = None
+        rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None
        rope_scaling = config.get("rope_scaling")

        if rope_scaling is not None and (typ := rope_scaling.get("type")):
@@ -236,7 +234,7 @@ class Params:
                rope_scaling_type = gguf.RopeScalingType.LINEAR
            elif typ == "yarn":
                rope_scaling_type = gguf.RopeScalingType.YARN
-                n_ctx_orig = rope_scaling['original_max_position_embeddings']
+                n_orig_ctx = rope_scaling['original_max_position_embeddings']
                rope_finetuned = rope_scaling['finetuned']
            else:
                raise NotImplementedError(f'Unknown rope scaling type: {typ}')
@@ -272,7 +270,7 @@ class Params:
            f_rope_freq_base  = config.get("rope_theta"),
            rope_scaling_type = rope_scaling_type,
            f_rope_scale      = f_rope_scale,
-            n_ctx_orig        = n_ctx_orig,
+            n_orig_ctx        = n_orig_ctx,
            rope_finetuned    = rope_finetuned,
        )

@@ -346,6 +344,342 @@ class Params:
        return params


+@dataclass
+class Metadata:
+    name: Optional[str] = None
+    author: Optional[str] = None
+    version: Optional[str] = None
+    url: Optional[str] = None
+    description: Optional[str] = None
+    licence: Optional[str] = None
+    source_url: Optional[str] = None
+    source_hf_repo: Optional[str] = None
+
+    @staticmethod
+    def load(metadata_path: Path) -> Metadata:
+        if metadata_path is None or not metadata_path.exists():
+            return Metadata()
+
+        with open(metadata_path, 'r') as file:
+            data = json.load(file)
+
+        # Create a new Metadata instance
+        metadata = Metadata()
+
+        # Assigning values to Metadata attributes if they exist in the JSON file
+        # This is based on LLM_KV_NAMES mapping in llama.cpp
+        metadata.name = data.get("general.name")
+        metadata.author = data.get("general.author")
+        metadata.version = data.get("general.version")
+        metadata.url = data.get("general.url")
+        metadata.description = data.get("general.description")
+        metadata.license = data.get("general.license")
+        metadata.source_url = data.get("general.source.url")
+        metadata.source_hf_repo = data.get("general.source.huggingface.repository")
+
+        return metadata
+
+
+#
+# vocab
+#
+
+
+@runtime_checkable
+class BaseVocab(Protocol):
+    tokenizer_model: ClassVar[str]
+    name: ClassVar[str]
+
+
+class NoVocab(BaseVocab):
+    tokenizer_model = "no_vocab"
+    name = "no_vocab"
+
+    def __repr__(self) -> str:
+        return "<NoVocab for a model without integrated vocabulary>"
+
+
+@runtime_checkable
+class Vocab(BaseVocab, Protocol):
+    vocab_size: int
+    added_tokens_dict: dict[str, int]
+    added_tokens_list: list[str]
+    fname_tokenizer: Path
+
+    def __init__(self, base_path: Path): ...
+    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: ...
+
+
+class BpeVocab(Vocab):
+    tokenizer_model = "gpt2"
+    name = "bpe"
+
+    def __init__(self, base_path: Path):
+        added_tokens: dict[str, int] = {}
+
+        if (fname_tokenizer := base_path / 'vocab.json').exists():
+            # "slow" tokenizer
+            with open(fname_tokenizer, encoding="utf-8") as f:
+                self.vocab = json.load(f)
+
+            try:
+                # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
+                with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
+                    added_tokens = json.load(f)
+            except FileNotFoundError:
+                pass
+        else:
+            # "fast" tokenizer
+            fname_tokenizer = base_path / FAST_TOKENIZER_FILE
+
+            # if this fails, FileNotFoundError propagates to caller
+            with open(fname_tokenizer, encoding="utf-8") as f:
+                tokenizer_json = json.load(f)
+
+            tokenizer_model: dict[str, Any] = tokenizer_json['model']
+            if (
+                tokenizer_model['type'] != 'BPE' or tokenizer_model.get('byte_fallback', False)
+                or tokenizer_json['decoder']['type'] != 'ByteLevel'
+            ):
+                raise FileNotFoundError('Cannot find GPT-2 BPE tokenizer')
+
+            self.vocab = tokenizer_model["vocab"]
+
+            if (added := tokenizer_json.get('added_tokens')) is not None:
+                # Added tokens here can be duplicates of the main vocabulary.
+                added_tokens = {item['content']: item['id']
+                                for item in added
+                                if item['content'] not in self.vocab}
+
+        vocab_size   = len(self.vocab)
+        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
+        actual_ids   = sorted(added_tokens.values())
+        if expected_ids != actual_ids:
+            expected_end_id = vocab_size + len(actual_ids) - 1
+            raise ValueError(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range "
+                             f"{vocab_size} - {expected_end_id}; got {actual_ids}")
+
+        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
+        self.added_tokens_dict    = added_tokens
+        self.added_tokens_list    = [text for (text, idx) in items]
+        self.vocab_size_base      = vocab_size
+        self.vocab_size           = self.vocab_size_base + len(self.added_tokens_list)
+        self.fname_tokenizer      = fname_tokenizer
+
+    def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}
+
+        for i, _ in enumerate(self.vocab):
+            yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL
+
+    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        for text in self.added_tokens_list:
+            score = -1000.0
+            yield text.encode("utf-8"), score, gguf.TokenType.CONTROL
+
+    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        yield from self.bpe_tokens()
+        yield from self.added_tokens()
+
+    def __repr__(self) -> str:
+        return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
+
+
+class SentencePieceVocab(Vocab):
+    tokenizer_model = "llama"
+    name = "spm"
+
+    def __init__(self, base_path: Path):
+        added_tokens: dict[str, int] = {}
+        if (fname_tokenizer := base_path / 'tokenizer.model').exists():
+            # normal location
+            try:
+                with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
+                    added_tokens = json.load(f)
+            except FileNotFoundError:
+                pass
+        elif not (fname_tokenizer := base_path.parent / 'tokenizer.model').exists():
+            # not found in alternate location either
+            raise FileNotFoundError('Cannot find tokenizer.model')
+
+        self.sentencepiece_tokenizer = SentencePieceProcessor()
+        self.sentencepiece_tokenizer.LoadFromFile(str(fname_tokenizer))
+        vocab_size = self.sentencepiece_tokenizer.vocab_size()
+
+        new_tokens       = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
+        expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
+        actual_new_ids   = sorted(new_tokens.keys())
+
+        if expected_new_ids != actual_new_ids:
+            raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
+
+        # Token pieces that were added to the base vocabulary.
+        self.added_tokens_dict  = added_tokens
+        self.added_tokens_list  = [new_tokens[id] for id in actual_new_ids]
+        self.vocab_size_base    = vocab_size
+        self.vocab_size         = self.vocab_size_base + len(self.added_tokens_list)
+        self.fname_tokenizer    = fname_tokenizer
+
+    def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        tokenizer = self.sentencepiece_tokenizer
+        for i in range(tokenizer.vocab_size()):
+            piece = tokenizer.IdToPiece(i)
+            text         = piece.encode("utf-8")
+            score: float = tokenizer.GetScore(i)
+
+            toktype = gguf.TokenType.NORMAL
+            if tokenizer.IsUnknown(i):
+                toktype = gguf.TokenType.UNKNOWN
+            if tokenizer.IsControl(i):
+                toktype = gguf.TokenType.CONTROL
+
+            # NOTE: I think added_tokens are user defined.
+            # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
+            # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED
+
+            if tokenizer.IsUnused(i):
+                toktype = gguf.TokenType.UNUSED
+            if tokenizer.IsByte(i):
+                toktype = gguf.TokenType.BYTE
+
+            yield text, score, toktype
+
+    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        for text in self.added_tokens_list:
+            score = -1000.0
+            yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
+
+    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        yield from self.sentencepiece_tokens()
+        yield from self.added_tokens()
+
+    def __repr__(self) -> str:
+        return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
+
+
+class LlamaHfVocab(Vocab):
+    tokenizer_model = "llama"
+    name = "hfft"
+
+    def __init__(self, base_path: Path):
+        fname_tokenizer = base_path / FAST_TOKENIZER_FILE
+        # if this fails, FileNotFoundError propagates to caller
+        with open(fname_tokenizer, encoding='utf-8') as f:
+            tokenizer_json = json.load(f)
+
+        # pre-check so we know if we need transformers
+        tokenizer_model: dict[str, Any] = tokenizer_json['model']
+        is_llama3 = (
+            tokenizer_model['type'] == 'BPE' and tokenizer_model.get('ignore_merges', False)
+            and not tokenizer_model.get('byte_fallback', True)
+        )
+        if is_llama3:
+            raise TypeError('Llama 3 must be converted with BpeVocab')
+
+        if not is_llama3 and (
+            tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
+            or tokenizer_json['decoder']['type'] != 'Sequence'
+        ):
+            raise FileNotFoundError('Cannot find Llama BPE tokenizer')
+
+        try:
+            from transformers import AutoTokenizer
+        except ImportError as e:
+            raise ImportError(
+                "To use LlamaHfVocab, please install the `transformers` package. "
+                "You can install it with `pip install transformers`."
+            ) from e
+
+        # Allow the tokenizer to default to slow or fast versions.
+        # Explicitly set tokenizer to use local paths.
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            base_path,
+            cache_dir=base_path,
+            local_files_only=True,
+        )
+        assert self.tokenizer.is_fast  # assume tokenizer.json is used
+
+        # Initialize lists and dictionaries for added tokens
+        self.added_tokens_list = []
+        self.added_tokens_dict = dict()
+        self.added_tokens_ids  = set()
+
+        # Process added tokens
+        for tok, tokidx in sorted(
+            self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]
+        ):
+            # Only consider added tokens that are not in the base vocabulary
+            if tokidx >= self.tokenizer.vocab_size:
+                self.added_tokens_list.append(tok)
+                self.added_tokens_dict[tok] = tokidx
+                self.added_tokens_ids.add(tokidx)
+
+        # Store special tokens and their IDs
+        self.specials = {
+            tok: self.tokenizer.get_vocab()[tok]
+            for tok in self.tokenizer.all_special_tokens
+        }
+        self.special_ids = set(self.tokenizer.all_special_ids)
+
+        # Set vocabulary sizes
+        self.vocab_size_base = self.tokenizer.vocab_size
+        self.vocab_size      = self.vocab_size_base + len(self.added_tokens_list)
+
+        self.fname_tokenizer = fname_tokenizer
+
+    def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        reverse_vocab = {
+            id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()
+        }
+
+        for token_id in range(self.vocab_size_base):
+            # Skip processing added tokens here
+            if token_id in self.added_tokens_ids:
+                continue
+
+            # Convert token text to bytes
+            token_text = reverse_vocab[token_id].encode("utf-8")
+
+            # Yield token text, score, and type
+            yield token_text, self.get_token_score(token_id), self.get_token_type(
+                token_id, token_text, self.special_ids  # Reuse already stored special IDs
+            )
+
+    def get_token_type(self, token_id: int, token_text: bytes, special_ids: set[int]) -> gguf.TokenType:
+        # Special case for byte tokens
+        if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
+            return gguf.TokenType.BYTE
+
+        # Determine token type based on whether it's a special token
+        return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL
+
+    def get_token_score(self, token_id: int) -> float:
+        # Placeholder for actual logic to determine the token's score
+        # This needs to be implemented based on specific requirements
+        return -1000.0  # Default score
+
+    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        for text in self.added_tokens_list:
+            if text in self.specials:
+                toktype = self.get_token_type(self.specials[text], b'', self.special_ids)
+                score = self.get_token_score(self.specials[text])
+            else:
+                toktype = gguf.TokenType.USER_DEFINED
+                score = -1000.0
+
+            yield text.encode("utf-8"), score, toktype
+
+    def has_newline_token(self):
+        return "<0x0A>" in self.tokenizer.vocab or "\n" in self.tokenizer.vocab
+
+    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        yield from self.hf_tokens()
+        yield from self.added_tokens()
+
+    def __repr__(self) -> str:
+        return f"<LlamaHfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
+
+
 #
 # data loading
 # TODO: reuse (probably move to gguf.py?)
@@ -456,13 +790,12 @@ class LazyTensor:

 LazyModel: TypeAlias = 'dict[str, LazyTensor]'

-ModelFormat: TypeAlias = Literal['ggml', 'torch', 'safetensors', 'none']

@dataclass
 class ModelPlus:
    model: LazyModel
    paths: list[Path]  # Where this was read from.
-    format: ModelFormat
+    format: Literal['ggml', 'torch', 'safetensors', 'none']
    vocab: BaseVocab | None  # For GGML models (which have vocab built in), the vocab.


@@ -501,7 +834,7 @@ def merge_sharded(models: list[LazyModel]) -> LazyModel:


 def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
-    formats: set[ModelFormat] = set(mp.format for mp in models_plus)
+    formats = set(mp.format for mp in models_plus)
    assert len(formats) == 1, "different formats?"
    format = formats.pop()
    paths = [path for mp in models_plus for path in mp.paths]
@@ -520,7 +853,7 @@ def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
    else:
        model = merge_sharded([mp.model for mp in models_plus])

-    return ModelPlus(model, paths, format, vocab)
+    return ModelPlus(model, paths, format, vocab)  # pytype: disable=wrong-arg-types


 def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor:
@@ -770,7 +1103,7 @@ class OutputFile:
    def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
        self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)

-    def add_meta_model(self, params: Params, metadata: gguf.Metadata | None) -> None:
+    def add_meta_model(self, params: Params, metadata: Metadata) -> None:
        # Metadata About The Model And Its Provenence
        name = "LLaMA"
        if metadata is not None and metadata.name is not None:
@@ -788,73 +1121,16 @@ class OutputFile:
                self.gguf.add_author(metadata.author)
            if metadata.version is not None:
                self.gguf.add_version(metadata.version)
-            if metadata.organization is not None:
-                self.gguf.add_organization(metadata.organization)
-
-            if metadata.finetune is not None:
-                self.gguf.add_finetune(metadata.finetune)
-            if metadata.basename is not None:
-                self.gguf.add_basename(metadata.basename)
-
-            if metadata.description is not None:
-                self.gguf.add_description(metadata.description)
-            if metadata.quantized_by is not None:
-                self.gguf.add_quantized_by(metadata.quantized_by)
-
-            if metadata.size_label is not None:
-                self.gguf.add_size_label(metadata.size_label)
-
-            if metadata.license is not None:
-                self.gguf.add_license(metadata.license)
-            if metadata.license_name is not None:
-                self.gguf.add_license_name(metadata.license_name)
-            if metadata.license_link is not None:
-                self.gguf.add_license_link(metadata.license_link)
-
            if metadata.url is not None:
                self.gguf.add_url(metadata.url)
-            if metadata.doi is not None:
-                self.gguf.add_doi(metadata.doi)
-            if metadata.uuid is not None:
-                self.gguf.add_uuid(metadata.uuid)
-            if metadata.repo_url is not None:
-                self.gguf.add_repo_url(metadata.repo_url)
-
+            if metadata.description is not None:
+                self.gguf.add_description(metadata.description)
+            if metadata.licence is not None:
+                self.gguf.add_licence(metadata.licence)
            if metadata.source_url is not None:
                self.gguf.add_source_url(metadata.source_url)
-            if metadata.source_doi is not None:
-                self.gguf.add_source_doi(metadata.source_doi)
-            if metadata.source_uuid is not None:
-                self.gguf.add_source_uuid(metadata.source_uuid)
-            if metadata.source_repo_url is not None:
-                self.gguf.add_source_repo_url(metadata.source_repo_url)
-
-            if metadata.base_models is not None:
-                self.gguf.add_base_model_count(len(metadata.base_models))
-                for key, base_model_entry in enumerate(metadata.base_models):
-                    if "name" in base_model_entry:
-                        self.gguf.add_base_model_name(key, base_model_entry["name"])
-                    if "author" in base_model_entry:
-                        self.gguf.add_base_model_author(key, base_model_entry["author"])
-                    if "version" in base_model_entry:
-                        self.gguf.add_base_model_version(key, base_model_entry["version"])
-                    if "organization" in base_model_entry:
-                        self.gguf.add_base_model_organization(key, base_model_entry["organization"])
-                    if "url" in base_model_entry:
-                        self.gguf.add_base_model_url(key, base_model_entry["url"])
-                    if "doi" in base_model_entry:
-                        self.gguf.add_base_model_doi(key, base_model_entry["doi"])
-                    if "uuid" in base_model_entry:
-                        self.gguf.add_base_model_uuid(key, base_model_entry["uuid"])
-                    if "repo_url" in base_model_entry:
-                        self.gguf.add_base_model_repo_url(key, base_model_entry["repo_url"])
-
-            if metadata.tags is not None:
-                self.gguf.add_tags(metadata.tags)
-            if metadata.languages is not None:
-                self.gguf.add_languages(metadata.languages)
-            if metadata.datasets is not None:
-                self.gguf.add_datasets(metadata.datasets)
+            if metadata.source_hf_repo is not None:
+                self.gguf.add_source_hf_repo(metadata.source_hf_repo)

    def add_meta_arch(self, params: Params) -> None:
        # Metadata About The Neural Architecture Itself
@@ -886,8 +1162,8 @@ class OutputFile:
            self.gguf.add_rope_scaling_type(params.rope_scaling_type)
            self.gguf.add_rope_scaling_factor(params.f_rope_scale)

-        if params.n_ctx_orig is not None:
-            self.gguf.add_rope_scaling_orig_ctx_len(params.n_ctx_orig)
+        if params.n_orig_ctx is not None:
+            self.gguf.add_rope_scaling_orig_ctx_len(params.n_orig_ctx)

        if params.rope_finetuned is not None:
            self.gguf.add_rope_scaling_finetuned(params.rope_finetuned)
@@ -965,7 +1241,7 @@ class OutputFile:
    @staticmethod
    def write_vocab_only(
        fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
-        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, metadata: gguf.Metadata | None = None,
+        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, metadata: Metadata = None,
    ) -> None:
        check_vocab_size(params, vocab, pad_vocab=pad_vocab)

@@ -999,7 +1275,7 @@ class OutputFile:
        fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
        concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
        pad_vocab: bool = False,
-        metadata: gguf.Metadata | None = None,
+        metadata: Metadata = None,
    ) -> None:
        check_vocab_size(params, vocab, pad_vocab=pad_vocab)

@@ -1042,32 +1318,35 @@ def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileT
    raise ValueError(f"Unexpected combination of types: {name_to_type}")


-def per_model_weight_count_estimation(tensors: Iterable[tuple[str, LazyTensor]]) -> tuple[int, int, int]:
-    total_params = 0
-    shared_params = 0
-    expert_params = 0
-
-    for name, lazy_tensor in tensors:
-        # We don't need these
-        if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
-            continue
-
-        # Got A Tensor
-        sum_weights_in_tensor: int = 1
-
-        # Tensor Volume
+def model_parameter_count(model: LazyModel) -> int:
+    total_model_parameters = 0
+    for i, (name, lazy_tensor) in enumerate(model.items()):
+        sum_weights_in_tensor = 1
        for dim in lazy_tensor.shape:
            sum_weights_in_tensor *= dim
+        total_model_parameters += sum_weights_in_tensor
+    return total_model_parameters

-        if ".experts." in name:
-            if ".experts.0." in name:
-                expert_params += sum_weights_in_tensor
-        else:
-            shared_params += sum_weights_in_tensor

-        total_params += sum_weights_in_tensor
+def model_parameter_count_rounded_notation(model_params_count: int) -> str:
+    if model_params_count > 1e12 :
+        # Trillions Of Parameters
+        scaled_model_params = model_params_count * 1e-12
+        scale_suffix = "T"
+    elif model_params_count > 1e9 :
+        # Billions Of Parameters
+        scaled_model_params = model_params_count * 1e-9
+        scale_suffix = "B"
+    elif model_params_count > 1e6 :
+        # Millions Of Parameters
+        scaled_model_params = model_params_count * 1e-6
+        scale_suffix = "M"
+    else:
+        # Thousands Of Parameters
+        scaled_model_params = model_params_count * 1e-3
+        scale_suffix = "K"

-    return total_params, shared_params, expert_params
+    return f"{round(scaled_model_params)}{scale_suffix}"


 def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
@@ -1249,24 +1528,34 @@ class VocabFactory:
        return vocab, special_vocab


-def default_convention_outfile(file_type: GGMLFileType, expert_count: int | None, model_params_count: tuple[int, int, int], metadata: gguf.Metadata) -> str:
-    name = metadata.name if metadata.name is not None else None
-    basename = metadata.basename if metadata.basename is not None else None
-    finetune = metadata.finetune if metadata.finetune is not None else None
-    version = metadata.version if metadata.version is not None else None
-    size_label = metadata.size_label if metadata.size_label is not None else gguf.size_label(*model_params_count, expert_count=expert_count or 0)
-
-    output_type = {
+def default_convention_outfile(file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> str:
+    quantization = {
        GGMLFileType.AllF32:    "F32",
        GGMLFileType.MostlyF16: "F16",
        GGMLFileType.MostlyQ8_0: "Q8_0",
    }[file_type]

-    return gguf.naming_convention(name, basename, finetune, version, size_label, output_type)
+    parameters = model_parameter_count_rounded_notation(model_params_count)
+
+    expert_count = ""
+    if params.n_experts is not None:
+        expert_count = f"{params.n_experts}x"
+
+    version = ""
+    if metadata is not None and metadata.version is not None:
+        version = f"-{metadata.version}"
+
+    name = "ggml-model"
+    if metadata is not None and metadata.name is not None:
+        name = metadata.name
+    elif params.path_model is not None:
+        name = params.path_model.name
+
+    return f"{name}{version}-{expert_count}{parameters}-{quantization}"


-def default_outfile(model_paths: list[Path], file_type: GGMLFileType, expert_count: int | None, model_params_count: tuple[int, int, int], metadata: gguf.Metadata) -> Path:
-    default_filename = default_convention_outfile(file_type, expert_count, model_params_count, metadata)
+def default_outfile(model_paths: list[Path], file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> Path:
+    default_filename = default_convention_outfile(file_type, params, model_params_count, metadata)
    ret = model_paths[0].parent / f"{default_filename}.gguf"
    if ret in model_paths:
        logger.error(
@@ -1305,9 +1594,8 @@ def main(args_in: list[str] | None = None) -> None:
    parser.add_argument("--pad-vocab",    action="store_true",    help="add pad tokens when model vocab expects more than tokenizer metadata provides")
    parser.add_argument("--skip-unknown", action="store_true",    help="skip unknown tensor names instead of failing")
    parser.add_argument("--verbose",      action="store_true",    help="increase output verbosity")
-    parser.add_argument("--metadata",     type=Path,              help="Specify the path for an authorship metadata override file")
+    parser.add_argument("--metadata",     type=Path,              help="Specify the path for a metadata file")
    parser.add_argument("--get-outfile",  action="store_true",    help="get calculated default outfile name")
-    parser.add_argument("--model-name",   type=str, default=None, help="name of the model")

    args = parser.parse_args(args_in)

@@ -1319,36 +1607,32 @@ def main(args_in: list[str] | None = None) -> None:
    else:
        logging.basicConfig(level=logging.INFO)

-    model_name = args.model_name
-    dir_model = args.model
-
-    metadata = gguf.Metadata.load(args.metadata, dir_model, model_name)
+    metadata = Metadata.load(args.metadata)

    if args.get_outfile:
-        model_plus = load_some_model(dir_model)
+        model_plus = load_some_model(args.model)
        params = Params.load(model_plus)
-        model = convert_model_names(model_plus.model, params, args.skip_unknown)
-        model_params_count = per_model_weight_count_estimation(model_plus.model.items())
-        ftype = pick_output_type(model, args.outtype)
-
-        if (metadata is None or metadata.name is None) and params.path_model is not None:
-            metadata.name = params.path_model.name
-
-        print(f"{default_convention_outfile(ftype, params.n_experts, model_params_count, metadata)}") # noqa: NP100
+        model   = convert_model_names(model_plus.model, params, args.skip_unknown)
+        model_params_count = model_parameter_count(model_plus.model)
+        ftype   = pick_output_type(model, args.outtype)
+        print(f"{default_convention_outfile(ftype, params, model_params_count, metadata)}") # noqa: NP100
        return

    if args.no_vocab and args.vocab_only:
        raise ValueError("--vocab-only does not make sense with --no-vocab")

    if args.dump_single:
-        model_plus = lazy_load_file(dir_model)
+        model_plus = lazy_load_file(args.model)
        do_dump_model(model_plus)
        return

    if not args.vocab_only:
-        model_plus = load_some_model(dir_model)
+        model_plus = load_some_model(args.model)
    else:
-        model_plus = ModelPlus(model = {}, paths = [dir_model / 'dummy'], format = 'none', vocab = None)
+        model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None)
+
+    model_params_count = model_parameter_count(model_plus.model)
+    logger.info(f"model parameters count : {model_params_count} ({model_parameter_count_rounded_notation(model_params_count)})")

    if args.dump:
        do_dump_model(model_plus)
@@ -1381,7 +1665,7 @@ def main(args_in: list[str] | None = None) -> None:
        logger.info(f"params = {params}")

    model_parent_path = model_plus.paths[0].parent
-    vocab_path = Path(args.vocab_dir or dir_model or model_parent_path)
+    vocab_path = Path(args.vocab_dir or args.model or model_parent_path)
    vocab_factory = VocabFactory(vocab_path)
    vocab_types = None if args.no_vocab else args.vocab_type.split(",")
    vocab, special_vocab = vocab_factory.load_vocab(vocab_types, model_parent_path)
@@ -1410,23 +1694,13 @@ def main(args_in: list[str] | None = None) -> None:
    if model_plus.vocab is not None and args.vocab_dir is None and not args.no_vocab:
        vocab = model_plus.vocab

-    assert params is not None
-
-    if metadata.name is None and params.path_model is not None:
-        metadata.name = params.path_model.name
-
-    model_params_count = per_model_weight_count_estimation(model_plus.model.items())
-    logger.info(f"model parameters count : {model_params_count} ({gguf.model_weight_count_rounded_notation(model_params_count[0])})")
-
    logger.info(f"Vocab info: {vocab}")
    logger.info(f"Special vocab info: {special_vocab}")
    model   = model_plus.model
    model   = convert_model_names(model, params, args.skip_unknown)
    ftype   = pick_output_type(model, args.outtype)
    model   = convert_to_output_type(model, ftype)
-    outfile = args.outfile or default_outfile(model_plus.paths, ftype, params.n_experts, model_params_count, metadata=metadata)
-
-    metadata.size_label = gguf.size_label(*model_params_count, expert_count=params.n_experts or 0)
+    outfile = args.outfile or default_outfile(model_plus.paths, ftype, params, model_params_count, metadata)

    params.ftype = ftype
    logger.info(f"Writing {outfile}, format {ftype}")
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
--- a/convert_lora_to_gguf.py
+++ b/convert_lora_to_gguf.py
@@ -1,400 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-
-from __future__ import annotations
-
-from dataclasses import dataclass
-import logging
-import argparse
-import os
-import sys
-import json
-from math import prod
-from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Sequence, SupportsIndex, cast
-
-import torch
-
-if TYPE_CHECKING:
-    from torch import Tensor
-
-if 'NO_LOCAL_GGUF' not in os.environ:
-    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
-import gguf
-
-# reuse model definitions from convert_hf_to_gguf.py
-from convert_hf_to_gguf import LazyTorchTensor, Model
-
-logger = logging.getLogger("lora-to-gguf")
-
-
-@dataclass
-class PartialLoraTensor:
-    A: Tensor | None = None
-    B: Tensor | None = None
-
-
-# magic to support tensor shape modifications and splitting
-class LoraTorchTensor:
-    _lora_A: Tensor  # (n_rank, row_size)
-    _lora_B: Tensor  # (col_size, n_rank)
-    _rank: int
-
-    def __init__(self, A: Tensor, B: Tensor):
-        assert len(A.shape) == len(B.shape)
-        assert A.shape[-2] == B.shape[-1]
-        if A.dtype != B.dtype:
-            A = A.to(torch.float32)
-            B = B.to(torch.float32)
-        self._lora_A = A
-        self._lora_B = B
-        self._rank = B.shape[-1]
-
-    def get_lora_A_B(self) -> tuple[Tensor, Tensor]:
-        return (self._lora_A, self._lora_B)
-
-    def __getitem__(
-        self,
-        indices: (
-            SupportsIndex
-            | slice
-            | tuple[SupportsIndex | slice | Tensor, ...]  # TODO: add ellipsis in the type signature
-        ),
-    ) -> LoraTorchTensor:
-        shape = self.shape
-        if isinstance(indices, SupportsIndex):
-            if len(shape) > 2:
-                return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices])
-            else:
-                raise NotImplementedError  # can't return a vector
-        elif isinstance(indices, slice):
-            if len(shape) > 2:
-                return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices])
-            else:
-                return LoraTorchTensor(self._lora_A, self._lora_B[indices])
-        elif isinstance(indices, tuple):
-            assert len(indices) > 0
-            if indices[-1] is Ellipsis:
-                return self[indices[:-1]]
-            # expand ellipsis
-            indices = tuple(
-                u
-                for v in (
-                    (
-                        (slice(None, None) for _ in range(len(indices) - 1))
-                        if i is Ellipsis
-                        else (i,)
-                    )
-                    for i in indices
-                )
-                for u in v
-            )
-
-            if len(indices) < len(shape):
-                indices = (*indices, *(slice(None, None) for _ in range(len(indices), len(shape))))
-
-            # TODO: make sure this is correct
-            indices_A = (
-                *(
-                    (
-                        j.__index__() % self._lora_A.shape[i]
-                        if isinstance(j, SupportsIndex)
-                        else slice(None, None)
-                    )
-                    for i, j in enumerate(indices[:-2])
-                ),
-                slice(None, None),
-                indices[-1],
-            )
-            indices_B = indices[:-1]
-            return LoraTorchTensor(self._lora_A[indices_A], self._lora_B[indices_B])
-        else:
-            raise NotImplementedError  # unknown indice type
-
-    @property
-    def dtype(self) -> torch.dtype:
-        assert self._lora_A.dtype == self._lora_B.dtype
-        return self._lora_A.dtype
-
-    @property
-    def shape(self) -> tuple[int, ...]:
-        assert len(self._lora_A.shape) == len(self._lora_B.shape)
-        return (*self._lora_B.shape[:-1], self._lora_A.shape[-1])
-
-    def size(self, dim=None):
-        assert dim is None
-        return self.shape
-
-    def reshape(self, *shape: int | tuple[int, ...]) -> LoraTorchTensor:
-        if isinstance(shape[0], tuple):
-            new_shape: tuple[int, ...] = shape[0]
-        else:
-            new_shape = cast(tuple[int, ...], shape)
-        orig_shape = self.shape
-        if len(new_shape) < 2:
-            raise NotImplementedError  # can't become a vector
-
-        # expand -1 in the shape
-        if any(dim == -1 for dim in new_shape):
-            n_elems = prod(orig_shape)
-            n_new_elems = prod(dim if dim != -1 else 1 for dim in new_shape)
-            assert n_elems % n_new_elems == 0
-            new_shape = (*(dim if dim != -1 else n_elems // n_new_elems for dim in new_shape),)
-
-        if new_shape[-1] != orig_shape[-1]:
-            raise NotImplementedError  # can't reshape the row size trivially
-
-        shape_A = (*(1 for _ in new_shape[:-2]), self._rank, orig_shape[-1])
-        shape_B = (*new_shape[:-1], self._rank)
-        return LoraTorchTensor(
-            self._lora_A.reshape(shape_A),
-            self._lora_B.reshape(shape_B),
-        )
-
-    def reshape_as(self, other: Tensor) -> LoraTorchTensor:
-        return self.reshape(*other.shape)
-
-    def view(self, *size: int) -> LoraTorchTensor:
-        return self.reshape(*size)
-
-    def permute(self, *dims: int) -> LoraTorchTensor:
-        shape = self.shape
-        dims = tuple(dim - len(shape) if dim >= 0 else dim for dim in dims)
-        if dims[-1] == -1:
-            # TODO: support higher dimensional A shapes bigger than 1
-            assert all(dim == 1 for dim in self._lora_A.shape[:-2])
-            return LoraTorchTensor(self._lora_A, self._lora_B.permute(*dims))
-        if len(shape) == 2 and dims[-1] == -2 and dims[-2] == -1:
-            return LoraTorchTensor(self._lora_B.permute(*dims), self._lora_A.permute(*dims))
-        else:
-            # TODO: compose the above two
-            raise NotImplementedError
-
-    def transpose(self, dim0: int, dim1: int) -> LoraTorchTensor:
-        shape = self.shape
-        dims = [i for i in range(len(shape))]
-        dims[dim0], dims[dim1] = dims[dim1], dims[dim0]
-        return self.permute(*dims)
-
-    def swapaxes(self, axis0: int, axis1: int) -> LoraTorchTensor:
-        return self.transpose(axis0, axis1)
-
-    def to(self, *args, **kwargs):
-        return LoraTorchTensor(self._lora_A.to(*args, **kwargs), self._lora_B.to(*args, **kwargs))
-
-    @classmethod
-    def __torch_function__(cls, func: Callable, types, args=(), kwargs=None):
-        del types  # unused
-
-        if kwargs is None:
-            kwargs = {}
-
-        if func is torch.permute:
-            return type(args[0]).permute(*args, **kwargs)
-        elif func is torch.reshape:
-            return type(args[0]).reshape(*args, **kwargs)
-        elif func is torch.stack:
-            assert isinstance(args[0], Sequence)
-            dim = kwargs.get("dim", 0)
-            assert dim == 0
-            return LoraTorchTensor(
-                torch.stack([a._lora_A for a in args[0]], dim),
-                torch.stack([b._lora_B for b in args[0]], dim),
-            )
-        elif func is torch.cat:
-            assert isinstance(args[0], Sequence)
-            dim = kwargs.get("dim", 0)
-            assert dim == 0
-            if len(args[0][0].shape) > 2:
-                return LoraTorchTensor(
-                    torch.cat([a._lora_A for a in args[0]], dim),
-                    torch.cat([b._lora_B for b in args[0]], dim),
-                )
-            elif all(torch.equal(args[0][0]._lora_A, t._lora_A) for t in args[0][1:]):
-                return LoraTorchTensor(
-                    args[0][0]._lora_A,
-                    torch.cat([b._lora_B for b in args[0]], dim),
-                )
-            else:
-                raise NotImplementedError
-        else:
-            raise NotImplementedError
-
-
-def get_base_tensor_name(lora_tensor_name: str) -> str:
-    base_name = lora_tensor_name.replace("base_model.model.", "")
-    base_name = base_name.replace(".lora_A.weight", ".weight")
-    base_name = base_name.replace(".lora_B.weight", ".weight")
-    return base_name
-
-
-def parse_args() -> argparse.Namespace:
-    parser = argparse.ArgumentParser(
-        description="Convert a huggingface PEFT LoRA adapter to a GGML compatible file")
-    parser.add_argument(
-        "--outfile", type=Path,
-        help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
-    )
-    parser.add_argument(
-        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
-        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
-    )
-    parser.add_argument(
-        "--bigendian", action="store_true",
-        help="model is executed on big endian machine",
-    )
-    parser.add_argument(
-        "--no-lazy", action="store_true",
-        help="use more RAM by computing all outputs before writing (use in case lazy evaluation is broken)",
-    )
-    parser.add_argument(
-        "--verbose", action="store_true",
-        help="increase output verbosity",
-    )
-    parser.add_argument(
-        "--dry-run", action="store_true",
-        help="only print out what will be done, without writing any new files",
-    )
-    parser.add_argument(
-        "--base", type=Path, required=True,
-        help="directory containing base model file",
-    )
-    parser.add_argument(
-        "lora_path", type=Path,
-        help="directory containing LoRA adapter file",
-    )
-
-    return parser.parse_args()
-
-
-if __name__ == '__main__':
-    args = parse_args()
-    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
-
-    ftype_map: dict[str, gguf.LlamaFileType] = {
-        "f32": gguf.LlamaFileType.ALL_F32,
-        "f16": gguf.LlamaFileType.MOSTLY_F16,
-        "bf16": gguf.LlamaFileType.MOSTLY_BF16,
-        "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
-        "auto": gguf.LlamaFileType.GUESSED,
-    }
-
-    ftype = ftype_map[args.outtype]
-
-    dir_base_model: Path = args.base
-    dir_lora: Path = args.lora_path
-    lora_config = dir_lora / "adapter_config.json"
-    input_model = dir_lora / "adapter_model.safetensors"
-
-    if args.outfile is not None:
-        fname_out = args.outfile
-    else:
-        # output in the same directory as the model by default
-        fname_out = dir_lora
-
-    if os.path.exists(input_model):
-        # lazy import load_file only if lora is in safetensors format.
-        from safetensors.torch import load_file
-
-        lora_model = load_file(input_model, device="cpu")
-    else:
-        input_model = os.path.join(dir_lora, "adapter_model.bin")
-        lora_model = torch.load(input_model, map_location="cpu", weights_only=True)
-
-    # load base model
-    logger.info(f"Loading base model: {dir_base_model.name}")
-    hparams = Model.load_hparams(dir_base_model)
-    with torch.inference_mode():
-        try:
-            model_class = Model.from_model_architecture(hparams["architectures"][0])
-        except NotImplementedError:
-            logger.error(f"Model {hparams['architectures'][0]} is not supported")
-            sys.exit(1)
-
-        class LoraModel(model_class):
-            model_arch = model_class.model_arch
-
-            lora_alpha: float
-
-            def __init__(self, *args, dir_lora_model: Path, lora_alpha: float, **kwargs):
-
-                super().__init__(*args, **kwargs)
-
-                self.dir_model_card = dir_lora_model
-                self.lora_alpha = float(lora_alpha)
-
-            def set_type(self):
-                self.gguf_writer.add_type(gguf.GGUFType.ADAPTER)
-                self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")
-
-            def set_gguf_parameters(self):
-                self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)
-                super().set_gguf_parameters()
-
-            def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
-                tensor_map: dict[str, PartialLoraTensor] = {}
-
-                for name, tensor in lora_model.items():
-                    if self.lazy:
-                        tensor = LazyTorchTensor.from_eager(tensor)
-                    base_name = get_base_tensor_name(name)
-                    is_lora_a = ".lora_A.weight" in name
-                    is_lora_b = ".lora_B.weight" in name
-                    if not is_lora_a and not is_lora_b:
-                        if ".base_layer.weight" in name:
-                            continue
-                        logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
-                        sys.exit(1)
-
-                    if base_name in tensor_map:
-                        if is_lora_a:
-                            tensor_map[base_name].A = tensor
-                        else:
-                            tensor_map[base_name].B = tensor
-                    else:
-                        if is_lora_a:
-                            tensor_map[base_name] = PartialLoraTensor(A=tensor)
-                        else:
-                            tensor_map[base_name] = PartialLoraTensor(B=tensor)
-
-                for name, tensor in tensor_map.items():
-                    assert tensor.A is not None
-                    assert tensor.B is not None
-                    yield (name, cast(torch.Tensor, LoraTorchTensor(tensor.A, tensor.B)))
-
-            def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-                dest = list(super().modify_tensors(data_torch, name, bid))
-                # some archs may have the same tensor for lm_head and output (tie word embeddings)
-                # in this case, adapters targeting lm_head will fail when using llama-export-lora
-                # therefore, we ignore them for now
-                # see: https://github.com/ggerganov/llama.cpp/issues/9065
-                if name == "lm_head.weight" and len(dest) == 0:
-                    raise ValueError("lm_head is present in adapter, but is ignored in base model")
-                for dest_name, dest_data in dest:
-                    assert isinstance(dest_data, LoraTorchTensor)
-                    lora_a, lora_b = dest_data.get_lora_A_B()
-
-                    yield (dest_name + ".lora_a", lora_a)
-                    yield (dest_name + ".lora_b", lora_b)
-
-        with open(lora_config, "r") as f:
-            lparams: dict[str, Any] = json.load(f)
-
-        alpha: float = lparams["lora_alpha"]
-
-        model_instance = LoraModel(
-            dir_base_model,
-            ftype,
-            fname_out,
-            is_big_endian=args.bigendian,
-            use_temp_file=False,
-            eager=args.no_lazy,
-            dry_run=args.dry_run,
-            dir_lora_model=dir_lora,
-            lora_alpha=alpha,
-            is_lora=True,
-        )
-
-        logger.info("Exporting model...")
-        model_instance.write()
-        logger.info(f"Model successfully exported to {model_instance.fname_out}")
--- a/docs/backend/BLIS.md
+++ b/docs/backend/BLIS.md
@@ -30,8 +30,8 @@ We recommend using openmp since it's easier to modify the cores being used.
 Makefile:

 ```bash
-make GGML_BLIS=1 -j
-# make GGML_BLIS=1 llama-benchmark-matmult
+make LLAMA_BLIS=1 -j
+# make LLAMA_BLIS=1 benchmark-matmult
 ```

 CMake:
@@ -39,7 +39,7 @@ CMake:
 ```bash
 mkdir build
 cd build
-cmake -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=FLAME ..
+cmake -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=FLAME ..
 make -j
 ```

--- a/docs/development/HOWTO-add-model.md
+++ b/docs/development/HOWTO-add-model.md
@@ -1,4 +1,4 @@
-# Add a new model architecture to `llama.cpp`
+## Add a new model architecture to `llama.cpp`

 Adding a model requires few steps:

@@ -9,15 +9,15 @@ Adding a model requires few steps:
 After following these steps, you can open PR.

 Also, it is important to check that the examples and main ggml backends (CUDA, METAL, CPU) are working with the new architecture, especially:
- [main](/examples/main/)
- [imatrix](/examples/imatrix/)
- [quantize](/examples/quantize/)
- [server](/examples/server/)
+- [main](../examples/main)
+- [imatrix](../examples/imatrix)
+- [quantize](../examples/quantize)
+- [server](../examples/server)

 ### 1. Convert the model to GGUF

 This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library.
-Depending on the model architecture, you can use either [convert_hf_to_gguf.py](/convert_hf_to_gguf.py) or [examples/convert_legacy_llama.py](/examples/convert_legacy_llama.py) (for `llama/llama2` models in `.pth` format).
+Depending on the model architecture, you can use either [convert.py](../convert.py) or [convert-hf-to-gguf.py](../convert-hf-to-gguf.py).

 The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors.

@@ -31,7 +31,7 @@ class MyModel(Model):
    model_arch = gguf.MODEL_ARCH.GROK
 ```

-2. Define the layout of the GGUF tensors in [constants.py](/gguf-py/gguf/constants.py)
+2. Define the layout of the GGUF tensors in [constants.py](../gguf-py/gguf/constants.py)

 Add an enum entry in `MODEL_ARCH`, the model human friendly name in `MODEL_ARCH_NAMES` and the GGUF tensor names in `MODEL_TENSORS`.

@@ -54,7 +54,7 @@ Example for `falcon` model:

 As a general rule, before adding a new tensor name to GGUF, be sure the equivalent naming does not already exist.

-Once you have found the GGUF tensor name equivalent, add it to the [tensor_mapping.py](/gguf-py/gguf/tensor_mapping.py) file.
+Once you have found the GGUF tensor name equivalent, add it to the [tensor_mapping.py](../gguf-py/gguf/tensor_mapping.py) file.

 If the tensor name is part of a repetitive layer/block, the key word `bid` substitutes it.

@@ -100,7 +100,7 @@ Have a look at existing implementation like `build_llama`, `build_dbrx` or `buil

 When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support for missing backend operations can be added in another PR.

-Note: to debug the inference graph: you can use [llama-eval-callback](/examples/eval-callback/).
+Note: to debug the inference graph: you can use [eval-callback](../examples/eval-callback).

 ## GGUF specification

--- a/docs/android.md
+++ b/docs/android.md
@@ -1,56 +0,0 @@
-
-# Android
-
-## Build on Android using Termux
-[Termux](https://github.com/termux/termux-app#installation) is a method to execute `llama.cpp` on an Android device (no root required).
-```
-apt update && apt upgrade -y
-apt install git make cmake
-```
-
-It's recommended to move your model inside the `~/` directory for best performance:
-```
-cd storage/downloads
-mv model.gguf ~/
-```
-
-[Get the code](https://github.com/ggerganov/llama.cpp#get-the-code) & [follow the Linux build instructions](https://github.com/ggerganov/llama.cpp#build) to build `llama.cpp`.
-
-## Building the Project using Android NDK
-Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.
-
-Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux:
-```
-$ mkdir build-android
-$ cd build-android
-$ export NDK=<your_ndk_directory>
-$ cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
-$ make
-```
-
-Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice).
-
-Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission:
-
-(Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`)
-```
-$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/
-$cd /data/data/com.termux/files/home/bin
-$chmod +x ./*
-```
-
-Download model [llama-2-7b-chat.Q4_K_M.gguf](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/blob/main/llama-2-7b-chat.Q4_K_M.gguf), and push it to `/sdcard/llama.cpp/`, then move it to `/data/data/com.termux/files/home/model/`
-
-```
-$mv /sdcard/llama.cpp/llama-2-7b-chat.Q4_K_M.gguf /data/data/com.termux/files/home/model/
-```
-
-Now, you can start chatting:
-```
-$cd /data/data/com.termux/files/home/bin
-$./llama-cli -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml
-```
-
-Here's a demo of an interactive session running on Pixel 5 phone:
-
-https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4
--- a/docs/backend/CANN.md
+++ b/docs/backend/CANN.md
@@ -1,259 +0,0 @@
-# llama.cpp for CANN
-
- - [Background](#background)
- - [News](#news)
- - [OS](#os)
- - [Hardware](#hardware)
- - [Model Supports](#model-supports)
- - [DataType Supports](#datatype-supports)
- - [Docker](#docker)
- - [Linux](#linux)
- - [TODO](#todo)
-
-
-## Background
-
-**Ascend NPU** is a range of AI processors using Neural Processing Unit. It will efficiently handle matrix-matrix multiplication, dot-product and scalars.
-
-**CANN** (Compute Architecture for Neural Networks) is a heterogeneous computing architecture for AI scenarios, providing support for multiple AI frameworks on the top and serving AI processors and programming at the bottom. It plays a crucial role in bridging the gap between upper and lower layers, and is a key platform for improving the computing efficiency of Ascend AI processors. Meanwhile, it offers a highly efficient and easy-to-use programming interface for diverse application scenarios, allowing users to rapidly build AI applications and services based on the Ascend platform.
-
-**Llama.cpp + CANN**
-
-The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the ability of AscendC and ACLNN which are intergrated to CANN Toolkit and kernels to using Ascend NPU directly.
-
-## News
-
- 2024.8
-  - Support `Q4_0` and `Q8_0` data type for Ascend NPU.
- 2024.7
-  - Create CANN backend for Ascend NPU.
-
-## OS
-
-| OS      | Status  | Verified                                       |
-|:-------:|:-------:|:----------------------------------------------:|
-| Linux   | Support | Ubuntu 22.04, OpenEuler22.03                   |
-
-
-## Hardware
-
-### Ascend NPU
-
-**Verified devices**
-| Ascend NPU                    | Status  |
-|:-----------------------------:|:-------:|
-| Atlas 300T A2                 | Support |
-
-*Notes:*
-
- If you have trouble with Ascend NPU device, please create a issue with **[CANN]** prefix/tag.
- If you run successfully with your Ascend NPU device, please help update the upper table.
-
-
-## Model Supports
-
-| Model Name                  | FP16  | Q8_0 | Q4_0 |
-|:----------------------------|:-----:|:----:|:----:|
-| AquilaChat2-7B              |   √   |   √  |   √  |
-| Baichuan-7b                 |   √   |   √  |   √  |
-| Baichuan2-7B-Chat           |   √   |   √  |   √  |
-| bitnet_b1_58-large          |   √   |   √  |   √  |
-| bloom-560m                  |   √   |   x  |   √  |
-| bloomz-alpaca-560m          |   √   |   x  |   √  |
-| c4ai-command-r-35B-v01      |   x   |   x  |   x  |
-| chatglm3-6B                 |   x   |   x  |   x  |
-| chinese-alpaca-2-1.3b       |   √   |   √  |   √  |
-| CodeShell-7B                |   √   |   √  |   √  |
-| deepseek-ai_deepseek-coder-1.3B-base | x |   x  |   x  |
-| deepseek-ai_DeepSeek-V2-Lite | x   |   x  |   x   |
-| deepseek-coder-6.7B-instruct | x   |   x  |   x   |
-| DeepSeek-V2-Lite-64x1.5B    |   x   |   x  |   x  |
-| falcon-7b-instruct          |   √   |   √  |   √  |
-| flan-t5-large               |   √   |   √  |   √  |
-| gemma-2-9b-it               |   √   |   √  |   √  |
-| glm-4-9B                    |   x   |   x  |   x  |
-| gpt2                        |   √   |   √  |   √  |
-| Gpt2-163M                   |   √   |   √  |   √  |
-| granite-3B-code-instruct    |   √   |   √  |   √  |
-| GritLM-7B                   |   √   |   √  |   √  |
-| internlm2_5-7b-chat         |   √   |   √  |   √  |
-| koala-7B-HF                 |   √   |   √  |   √  |
-| Llama-2-7b-chat-hf          |   √   |   √  |   √  |
-| Llama-3-Smaug-8B            |   √   |   √  |   √  |
-| Llama2-Chinese-7b-Chat      |   √   |   √  |   √  |
-| Llama3-8B                   |   √   |   √  |   √  |
-| Llama3-8b-chinese           |   √   |   √  |   √  |
-| mamba-130m-hf               |   √   |   √  |   √  |
-| Mistral-7B-Instruct-v0.2    |   √   |   √  |   √  |
-| Mixtral-8x7B-Instruct-v0.1  |   x   |   √  |   √  |
-| mpt-7B                      |   √   |   √  |   √  |
-| OLMo-1B-hf                  |   √   |   √  |   √  |
-| OpenELM-3B-Instruct         |   √   |   √  |   √  |
-| Orion-14b-base              |   √   |   √  |   √  |
-| phi1                        |   x   |   x  |   x  |
-| phi2                        |   x   |   x  |   x  |
-| Phi-3-mini-4k-instruct      |   √   |   √  |   √  |
-| plamo-13b                   |   √   |   √  |   √  |
-| pythia-70M                  |   x   |   x  |   x  |
-| Qwen-7B                     |   √   |   √  |   √  |
-| Qwen2-1.5B-Instruct         |   √   |   x  |   √  |
-| Refact-1_6B-fim             |   √   |   √  |   √  |
-| SmolLM-135M                 |   √   |   √  |   √  |
-| stablelm-zephyr             |   x   |   x  |   x  |
-| stablelm-2-zephyr-1_6b      |   x   |   x  |   x  |
-| starcoderbase-1b            |   √   |   √  |   √  |
-| starcoder2-3b               |   √   |   √  |   √  |
-| vigogne-7b-chat             |   √   |   √  |   √  |
-| xverse-7b-chat              |   √   |   √  |   √  |
-| Yi-6b-Chat                  |   √   |   √  |   √  |
-
-
-
-## DataType Supports
-
-| DataType               | Status  |
-|:----------------------:|:-------:|
-| FP16                   | Support |
-| Q8_0                   | Support |
-| Q4_0                   | Support |
-
-## Docker
-
-### Build Images
-You can get a image with llama.cpp in one command.
-```sh
-docker build -t llama-cpp-cann -f .devops/llama-cli-cann.Dockerfile .
-```
-
-### Run container
-
-```sh
-# Find all cards.
-npu-smi info
-
-# Select the cards that you want to use, make sure these cards are not used by someone.
-# Following using cards of device0.
-docker run --name llamacpp --device /dev/davinci0  --device /dev/davinci_manager --device /dev/devmm_svm --device /dev/hisi_hdc -v /usr/local/dcmi:/usr/local/dcmi -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info -v /PATH_TO_YOUR_MODELS/:/app/models -it llama-cpp-cann -m /app/models/MODEL_PATH -ngl 32 -p "Building a website can be done in 10 simple steps:"
-```
-
-*Notes:*
-
- You may need to install Ascend Driver and firmware on the **host** machine *(Please refer to the [Linux configuration](#linux) for details)*.
-
-## Linux
-
-### I. Setup Environment
-
-1. **Install Ascend Driver and firmware**
-
-    ```sh
-    # create driver running user.
-    sudo groupadd -g HwHiAiUser
-    sudo useradd -g HwHiAiUser -d /home/HwHiAiUser -m HwHiAiUser -s /bin/bash
-    sudo usermod -aG HwHiAiUser $USER
-
-    # download driver from https://www.hiascend.com/hardware/firmware-drivers/community according to your system
-    # and install driver.
-    sudo sh Ascend-hdk-910b-npu-driver_x.x.x_linux-{arch}.run --full --install-for-all
-    ```
-
-    Once installed, run `npu-smi info` to check whether driver is installed successfully.
-    ```sh
-    +-------------------------------------------------------------------------------------------+
-    | npu-smi 24.1.rc2               Version: 24.1.rc2                                          |
-    +----------------------+---------------+----------------------------------------------------+
-    | NPU   Name           | Health        | Power(W)    Temp(C)           Hugepages-Usage(page)|
-    | Chip                 | Bus-Id        | AICore(%)   Memory-Usage(MB)  HBM-Usage(MB)        |
-    +======================+===============+====================================================+
-    | 2     xxx            | OK            | 64.4        51                15   / 15            |
-    | 0                    | 0000:01:00.0  | 0           1873 / 15077      0    / 32768         |
-    +======================+===============+====================================================+
-    | 5     xxx            | OK            | 64.0        52                15   / 15            |
-    | 0                    | 0000:81:00.0  | 0           1874 / 15077      0    / 32768         |
-    +======================+===============+====================================================+
-    | No running processes found in NPU 2                                                       |
-    +======================+===============+====================================================+
-    | No running processes found in NPU 5                                                       |
-    +======================+===============+====================================================+
-    ```
-
-2. **Install Ascend Firmware**
-    ```sh
-    # download driver from https://www.hiascend.com/hardware/firmware-drivers/community according to your system
-    # and install driver.
-    sudo sh Ascend-hdk-910b-npu-firmware_x.x.x.x.X.run --full
-    ```
-    If the following messaage appers, firmware is installed successfully.
-    ```sh
-    Firmware package installed successfully!
-    ```
-
-
-3. **Install CANN toolkit and kernels**
-
-    CANN toolkit and kernels can be obtained from the official [CANN Toolkit](https://www.hiascend.com/zh/developer/download/community/result?module=cann) page.
-
-    Please download the corresponding version that satified your system. The minimum version required is 8.0.RC2.alpha002 and here is the install command.
-    ```sh
-    pip3 install attrs numpy decorator sympy cffi pyyaml pathlib2 psutil protobuf scipy requests absl-py wheel typing_extensions
-    sh Ascend-cann-toolkit_8.0.RC2.alpha002_linux-aarch64.run --install
-    sh Ascend-cann-kernels-910b_8.0.RC2.alpha002_linux.run --install
-    ```
-
-    Set Ascend Variables:
-    ```sh
-    echo "source ~/Ascend/ascend-toolkit/set_env.sh" >> ~/.bashrc
-    source ~/.bashrc
-    ```
-
-Upon a successful installation, CANN is enabled for the available ascend devices.
-
-### II. Build llama.cpp
-
-```sh
-cmake -B build -DGGML_CANN=on -DCMAKE_BUILD_TYPE=release
-cmake --build build --config release
-```
-
-### III. Run the inference
-
-1. **Retrieve and prepare model**
-
-    You can refer to the general [*Prepare and Quantize*](../../README.md#prepare-and-quantize) guide for model prepration.
-
-    **Notes**:
-
-      - CANN backend only supports FP16/Q4_0/Q8_0 models currently.
-
-2. **Launch inference**
-
-    There are two device selection modes:
-
-    - Single device: Use one device target specified by the user.
-    - Multiple devices: Automatically choose the devices with the same backend.
-
-    | Device selection | Parameter                              |
-    |:----------------:|:--------------------------------------:|
-    | Single device    | --split-mode none --main-gpu DEVICE_ID |
-    | Multiple devices | --split-mode layer (default)           |
-
-    Examples:
-
-    - Use device 0:
-
-    ```sh
-    ./build/bin/llama-cli -m path_to_model -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
-    ```
-
-    - Use multiple devices:
-
-    ```sh
-    ./build/bin/llama-cli -m path_to_model -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
-    ```
-
-### **GitHub contribution**:
-Please add the **[CANN]** prefix/tag in issues/PRs titles to help the CANN-team check/address them without delay.
-
-
-## TODO
- Support more models and data types.
--- a/docs/build.md
+++ b/docs/build.md
@@ -1,388 +0,0 @@
-# Build llama.cpp locally
-
-**To get the Code:**
-
-```bash
-git clone https://github.com/ggerganov/llama.cpp
-cd llama.cpp
-```
-
-In order to build llama.cpp you have four different options.
-
- Using `make`:
-  - On Linux or MacOS:
-
-      ```bash
-      make
-      ```
-
-  - On Windows (x86/x64 only, arm64 requires cmake):
-
-    1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
-    2. Extract `w64devkit` on your pc.
-    3. Run `w64devkit.exe`.
-    4. Use the `cd` command to reach the `llama.cpp` folder.
-    5. From here you can run:
-        ```bash
-        make
-        ```
-
-  - Notes:
-    - For `Q4_0_4_4` quantization type build, add the `GGML_NO_LLAMAFILE=1` flag. For example, use `make GGML_NO_LLAMAFILE=1`.
-    - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `make -j 8` will run 8 jobs in parallel.
-    - For faster repeated compilation, install [ccache](https://ccache.dev/).
-    - For debug builds, run `make LLAMA_DEBUG=1`
-
- Using `CMake`:
-
-  ```bash
-  cmake -B build
-  cmake --build build --config Release
-  ```
-
-  **Notes**:
-
-    - For `Q4_0_4_4` quantization type build, add the `-DGGML_LLAMAFILE=OFF` cmake option. For example, use `cmake -B build -DGGML_LLAMAFILE=OFF`.
-    - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
-    - For faster repeated compilation, install [ccache](https://ccache.dev/).
-    - For debug builds, there are two cases:
-
-      1. Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):
-
-      ```bash
-      cmake -B build -DCMAKE_BUILD_TYPE=Debug
-      cmake --build build
-      ```
-
-      2. Multi-config generators (`-G` param set to Visual Studio, XCode...):
-
-      ```bash
-      cmake -B build -G "Xcode"
-      cmake --build build --config Debug
-      ```
-    - Building for Windows (x86, x64 and arm64) with MSVC or clang as compilers:
-      - Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/de/vs/community/). In the installer, select at least the following options (this also automatically installs the required additional tools like CMake,...):
-        - Tab Workload: Desktop-development with C++
-        - Tab Components (select quickly via search): C++-_CMake_ Tools for Windows, _Git_ for Windows, C++-_Clang_ Compiler for Windows, MS-Build Support for LLVM-Toolset (clang)
-      - Please remember to always use a Developer Command Prompt / PowerShell for VS2022 for git, build, test
-      - For Windows on ARM (arm64, WoA) build with:
-        ```bash
-        cmake --preset arm64-windows-llvm-release -D GGML_OPENMP=OFF
-        cmake --build build-arm64-windows-llvm-release
-        ```
-        Note: Building for arm64 could also be done just with MSVC (with the build-arm64-windows-MSVC preset, or the standard CMake build instructions). But MSVC does not support inline ARM assembly-code, used e.g. for the accelerated Q4_0_4_8 CPU kernels.
-
-   Using `gmake` (FreeBSD):
-
-    1. Install and activate [DRM in FreeBSD](https://wiki.freebsd.org/Graphics)
-    2. Add your user to **video** group
-    3. Install compilation dependencies.
-
-        ```bash
-        sudo pkg install gmake automake autoconf pkgconf llvm15 openblas
-
-        gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j4
-        ```
-
-## Metal Build
-
-On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
-To disable the Metal build at compile time use the `GGML_NO_METAL=1` flag or the `GGML_METAL=OFF` cmake option.
-
-When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers|-ngl 0` command-line
-argument.
-
-## BLAS Build
-
-Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS. There are currently several different BLAS implementations available for build and use:
-
-### Accelerate Framework:
-
-This is only available on Mac PCs and it's enabled by default. You can just build using the normal instructions.
-
-### OpenBLAS:
-
-This provides BLAS acceleration using only the CPU. Make sure to have OpenBLAS installed on your machine.
-
- Using `make`:
-  - On Linux:
-    ```bash
-    make GGML_OPENBLAS=1
-    ```
-
-  - On Windows:
-
-    1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
-    2. Download the latest version of [OpenBLAS for Windows](https://github.com/xianyi/OpenBLAS/releases).
-    3. Extract `w64devkit` on your pc.
-    4. From the OpenBLAS zip that you just downloaded copy `libopenblas.a`, located inside the `lib` folder, inside `w64devkit\x86_64-w64-mingw32\lib`.
-    5. From the same OpenBLAS zip copy the content of the `include` folder inside `w64devkit\x86_64-w64-mingw32\include`.
-    6. Run `w64devkit.exe`.
-    7. Use the `cd` command to reach the `llama.cpp` folder.
-    8. From here you can run:
-
-        ```bash
-        make GGML_OPENBLAS=1
-        ```
-
- Using `CMake` on Linux:
-
-    ```bash
-    cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
-    cmake --build build --config Release
-    ```
-
-### BLIS
-
-Check [BLIS.md](./backend/BLIS.md) for more information.
-
-### SYCL
-
-SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
-
-llama.cpp based on SYCL is used to **support Intel GPU** (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU).
-
-For detailed info, please refer to [llama.cpp for SYCL](./backend/SYCL.md).
-
-### Intel oneMKL
-
-Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni. Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [llama.cpp for SYCL](./backend/SYCL.md).
-
- Using manual oneAPI installation:
-  By default, `GGML_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DGGML_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. Otherwise please install oneAPI and follow the below steps:
-    ```bash
-    source /opt/intel/oneapi/setvars.sh # You can skip this step if  in oneapi-basekit docker image, only required for manual installation
-    cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_NATIVE=ON
-    cmake --build build --config Release
-    ```
-
- Using oneAPI docker image:
-  If you do not want to source the environment vars and install oneAPI manually, you can also build the code using intel docker container: [oneAPI-basekit](https://hub.docker.com/r/intel/oneapi-basekit). Then, you can use the commands given above.
-
-Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.
-
-### CUDA
-
-This provides GPU acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
-
-For Jetson user, if you have Jetson Orin, you can try this: [Offical Support](https://www.jetson-ai-lab.com/tutorial_text-generation.html). If you are using an old model(nano/TX2), need some additional operations before compiling.
-
- Using `make`:
-  ```bash
-  make GGML_CUDA=1
-  ```
- Using `CMake`:
-
-  ```bash
-  cmake -B build -DGGML_CUDA=ON
-  cmake --build build --config Release
-  ```
-
-The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used.
-
-The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted. In Windows this setting is available in the NVIDIA control panel as `System Memory Fallback`.
-
-The following compilation options are also available to tweak performance:
-
-| Option                        | Legal values           | Default | Description                                                                                                                                                                                                                                                                             |
-|-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| GGML_CUDA_FORCE_DMMV          | Boolean                | false   | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
-| GGML_CUDA_DMMV_X              | Positive integer >= 32 | 32      | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants.                                         |
-| GGML_CUDA_MMV_Y               | Positive integer       | 1       | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended.                                                                                                                                         |
-| GGML_CUDA_FORCE_MMQ           | Boolean                | false   | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower.                       |
-| GGML_CUDA_FORCE_CUBLAS        | Boolean                | false   | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models                                                                                                                                                                                       |
-| GGML_CUDA_F16                 | Boolean                | false   | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs.                                                           |
-| GGML_CUDA_KQUANTS_ITER        | 1 or 2                 | 2       | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs.                                                                                                                     |
-| GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer       | 128     | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial.                                                                         |
-| GGML_CUDA_FA_ALL_QUANTS       | Boolean                | false   | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer.                                                                                                  |
-
-### MUSA
-
- Using `make`:
-  ```bash
-  make GGML_MUSA=1
-  ```
- Using `CMake`:
-
-  ```bash
-  cmake -B build -DGGML_MUSA=ON
-  cmake --build build --config Release
-  ```
-
-### hipBLAS
-
-This provides BLAS acceleration on HIP-supported AMD GPUs.
-Make sure to have ROCm installed.
-You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html#rocm-install-quick).
-
- Using `make`:
-  ```bash
-  make GGML_HIPBLAS=1
-  ```
- Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
-  ```bash
-  HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
-      cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
-      && cmake --build build --config Release -- -j 16
-  ```
-  On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`.
-  However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
-
-  Note that if you get the following error:
-  ```
-  clang: error: cannot find ROCm device library; provide its path via '--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build without ROCm device library
-  ```
-  Try searching for a directory under `HIP_PATH` that contains the file
-  `oclc_abi_version_400.bc`. Then, add the following to the start of the
-  command: `HIP_DEVICE_LIB_PATH=<directory-you-just-found>`, so something
-  like:
-  ```bash
-  HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \
-  HIP_DEVICE_LIB_PATH=<directory-you-just-found> \
-      cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
-      && cmake --build build -- -j 16
-  ```
-
- Using `make` (example for target gfx1030, build with 16 CPU threads):
-  ```bash
-  make -j16 GGML_HIPBLAS=1 GGML_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
-  ```
-
- Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
-  ```bash
-  set PATH=%HIP_PATH%\bin;%PATH%
-  cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DGGML_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
-  cmake --build build
-  ```
-  Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
-  Find your gpu version string by matching the most significant version information from `rocminfo | grep gfx | head -1 | awk '{print $2}'` with the list of processors, e.g. `gfx1035` maps to `gfx1030`.
-
-
-The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
-If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
-The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):
-
-| Option                 | Legal values           | Default | Description                                                                                                                                                                                                                                    |
-|------------------------|------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| GGML_CUDA_DMMV_X       | Positive integer >= 32 | 32      | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
-| GGML_CUDA_MMV_Y        | Positive integer       | 1       | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants.                                                                       |
-| GGML_CUDA_KQUANTS_ITER | 1 or 2                 | 2       | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs.                                                                             |
-
-### Vulkan
-
-**Windows**
-
-#### w64devkit
-
-Download and extract [w64devkit](https://github.com/skeeto/w64devkit/releases).
-
-Download and install the [Vulkan SDK](https://vulkan.lunarg.com/sdk/home#windows). When selecting components, only the Vulkan SDK Core is required.
-
-Launch `w64devkit.exe` and run the following commands to copy Vulkan dependencies:
-```sh
-SDK_VERSION=1.3.283.0
-cp /VulkanSDK/$SDK_VERSION/Bin/glslc.exe $W64DEVKIT_HOME/bin/
-cp /VulkanSDK/$SDK_VERSION/Lib/vulkan-1.lib $W64DEVKIT_HOME/x86_64-w64-mingw32/lib/
-cp -r /VulkanSDK/$SDK_VERSION/Include/* $W64DEVKIT_HOME/x86_64-w64-mingw32/include/
-cat > $W64DEVKIT_HOME/x86_64-w64-mingw32/lib/pkgconfig/vulkan.pc <<EOF
-Name: Vulkan-Loader
-Description: Vulkan Loader
-Version: $SDK_VERSION
-Libs: -lvulkan-1
-EOF
-
-```
-Switch into the `llama.cpp` directory and run `make GGML_VULKAN=1`.
-
-#### MSYS2
-Install [MSYS2](https://www.msys2.org/) and then run the following commands in a UCRT terminal to install dependencies.
-  ```sh
-  pacman -S git \
-      mingw-w64-ucrt-x86_64-gcc \
-      mingw-w64-ucrt-x86_64-cmake \
-      mingw-w64-ucrt-x86_64-vulkan-devel \
-      mingw-w64-ucrt-x86_64-shaderc
-  ```
-Switch into `llama.cpp` directory and build using CMake.
-```sh
-cmake -B build -DGGML_VULKAN=ON
-cmake --build build --config Release
-```
-
-**With docker**:
-
-You don't need to install Vulkan SDK. It will be installed inside the container.
-
-```sh
-# Build the image
-docker build -t llama-cpp-vulkan -f .devops/llama-cli-vulkan.Dockerfile .
-
-# Then, use it:
-docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
-```
-
-**Without docker**:
-
-Firstly, you need to make sure you have installed [Vulkan SDK](https://vulkan.lunarg.com/doc/view/latest/linux/getting_started_ubuntu.html)
-
-For example, on Ubuntu 22.04 (jammy), use the command below:
-
-```bash
-wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add -
-wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
-apt update -y
-apt-get install -y vulkan-sdk
-# To verify the installation, use the command below:
-vulkaninfo
-```
-
-Alternatively your package manager might be able to provide the appropriate libraries.
-For example for Ubuntu 22.04 you can install `libvulkan-dev` instead.
-For Fedora 40, you can install `vulkan-devel`, `glslc` and `glslang` packages.
-
-Then, build llama.cpp using the cmake command below:
-
-```bash
-cmake -B build -DGGML_VULKAN=1
-cmake --build build --config Release
-# Test the output binary (with "-ngl 33" to offload all layers to GPU)
-./bin/llama-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
-
-# You should see in the output, ggml_vulkan detected your GPU. For example:
-# ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
-```
-
-### CANN
-This provides NPU acceleration using the AI cores of your Ascend NPU. And [CANN](https://www.hiascend.com/en/software/cann) is a hierarchical APIs to help you to quickly build AI applications and service based on Ascend NPU.
-
-For more information about Ascend NPU in [Ascend Community](https://www.hiascend.com/en/).
-
-Make sure to have the CANN toolkit installed. You can download it from here: [CANN Toolkit](https://www.hiascend.com/developer/download/community/result?module=cann)
-
-Go to `llama.cpp` directory and build using CMake.
-```bash
-cmake -B build -DGGML_CANN=on -DCMAKE_BUILD_TYPE=release
-cmake --build build --config release
-```
-
-You can test with:
-
-`./build/llama-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -ngl 32`
-
-If the fllowing info is output on screen, you are using `llama.cpp by CANN backend`:
-```bash
-llm_load_tensors:       CANN buffer size = 13313.00 MiB
-llama_new_context_with_model:       CANN compute buffer size =  1260.81 MiB
-```
-
-For detailed info, such as model/device supports, CANN install, please refer to [llama.cpp for CANN](./backend/CANN.md).
-
-### Android
-
-To read documentation for how to build on Android, [click here](./android.md)
-
-### Arm CPU optimized mulmat kernels
-
-Llama.cpp includes a set of optimized mulmat kernels for the Arm architecture, leveraging Arm® Neon™, int8mm and SVE instructions. These kernels are enabled at build time through the appropriate compiler cpu-type flags, such as `-DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve`. Note that these optimized kernels require the model to be quantized into one of the formats: `Q4_0_4_4` (Arm Neon), `Q4_0_4_8` (int8mm) or `Q4_0_8_8` (SVE). The SVE mulmat kernel specifically requires a vector width of 256 bits. When running on devices with a different vector width, it is recommended to use the `Q4_0_4_8` (int8mm) or `Q4_0_4_4` (Arm Neon) formats for better performance. Refer to [examples/quantize/README.md](../examples/quantize/README.md) for more information on the quantization formats.
-
-To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_LLAMAFILE=OFF` (`cmake`).
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
slaren	e9095e6098	async direct io per tensor test	2024-05-22 01:08:52 +02:00
Pavel Fatin	46db3506aa	address review comments	2024-05-21 20:05:26 +02:00
Pavel Fatin	1b17ed7ab6	Direct I/O and Transparent HugePages --direct-io for bypassing page cache (and using THP on Linux) Up to 3-6x faster uncached loading, fewer pageouts, no page cache pollution.	2024-05-21 01:35:23 +02:00