llama : add OLMo November 2024 support (#10394 )

* Add OLMo November 2024 constants * Add OLMo November 2024 converter * Add loading of OLMo November 2024 tensors and hyper parameters * Add building of OLMo November 2024 model
sycl : Add option to set the SYCL architecture for all targets (#10266 )
2026-02-05 13:53:23 +02:00 · 2024-11-19 11:04:08 +02:00 · 2024-11-19 08:02:23 +00:00 · 2024-11-19 08:25:17 +01:00 · 2024-11-19 08:50:04 +08:00 · 2024-11-18 18:43:40 +01:00
476 changed files with 110254 additions and 65498 deletions
--- a/.devops/full-cuda.Dockerfile
+++ b/.devops/full-cuda.Dockerfile
@@ -1,18 +1,16 @@
 ARG UBUNTU_VERSION=22.04
-
 # This needs to generally match the container host's environment.
-ARG CUDA_VERSION=11.7.1
-
+ARG CUDA_VERSION=12.6.0
 # Target the CUDA build image
 ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

 FROM ${BASE_CUDA_DEV_CONTAINER} AS build

-# Unless otherwise specified, we make a fat build.
-ARG CUDA_DOCKER_ARCH=all
+# CUDA architecture to build for (defaults to all supported archs)
+ARG CUDA_DOCKER_ARCH=default

 RUN apt-get update && \
-    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
+    apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1

 COPY requirements.txt   requirements.txt
 COPY requirements       requirements
@@ -24,13 +22,12 @@ WORKDIR /app

 COPY . .

-# Set nvcc architecture
-ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable CUDA
-ENV GGML_CUDA=1
-# Enable cURL
-ENV LLAMA_CURL=1
-
-RUN make -j$(nproc)
+# Use the default CUDA archs if not specified
+RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
+        export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
+    fi && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release -j$(nproc) && \
+    cp build/bin/* .

 ENTRYPOINT ["/app/.devops/tools.sh"]
--- a/.devops/full-musa.Dockerfile
+++ b/.devops/full-musa.Dockerfile
@@ -0,0 +1,26 @@
+ARG UBUNTU_VERSION=22.04
+# This needs to generally match the container host's environment.
+ARG MUSA_VERSION=rc3.1.0
+# Target the MUSA build image
+ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_MUSA_DEV_CONTAINER} AS build
+
+RUN apt-get update && \
+    apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
+
+COPY requirements.txt   requirements.txt
+COPY requirements       requirements
+
+RUN pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt
+
+WORKDIR /app
+
+COPY . .
+
+RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release -j$(nproc) && \
+    cp build/bin/* .
+
+ENTRYPOINT ["/app/.devops/tools.sh"]
--- a/.devops/full-rocm.Dockerfile
+++ b/.devops/full-rocm.Dockerfile
@@ -11,7 +11,7 @@ FROM ${BASE_ROCM_DEV_CONTAINER} AS build
 # Unless otherwise specified, we make a fat build.
 # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
 # This is mostly tied to rocBLAS supported archs.
-ARG ROCM_DOCKER_ARCH=\
+ARG ROCM_DOCKER_ARCH="\
    gfx803 \
    gfx900 \
    gfx906 \
@@ -21,7 +21,7 @@ ARG ROCM_DOCKER_ARCH=\
    gfx1030 \
    gfx1100 \
    gfx1101 \
-    gfx1102
+    gfx1102"

 COPY requirements.txt   requirements.txt
 COPY requirements       requirements
@@ -34,7 +34,7 @@ WORKDIR /app
 COPY . .

 # Set nvcc architecture
-ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
+ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
 # Enable ROCm
 ENV GGML_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
--- a/.devops/llama-cli-cann.Dockerfile
+++ b/.devops/llama-cli-cann.Dockerfile
@@ -1,6 +1,6 @@
 ARG ASCEND_VERSION=8.0.rc2.alpha003-910b-openeuler22.03-py3.8

-FROM cosdt/cann:$ASCEND_VERSION AS build
+FROM ascendai/cann:$ASCEND_VERSION AS build

 WORKDIR /app

@@ -22,11 +22,11 @@ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH

 RUN echo "Building with static libs" && \
    source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
-    cmake -B build -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF  && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF  && \
    cmake --build build --config Release --target llama-cli

 # TODO: use image with NNRT
-FROM cosdt/cann:$ASCEND_VERSION AS runtime
+FROM ascendai/cann:$ASCEND_VERSION AS runtime
 COPY --from=build /app/build/bin/llama-cli /llama-cli

 ENV LC_ALL=C.utf8
--- a/.devops/llama-cli-cuda.Dockerfile
+++ b/.devops/llama-cli-cuda.Dockerfile
@@ -1,6 +1,6 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
-ARG CUDA_VERSION=11.7.1
+ARG CUDA_VERSION=12.6.0
 # Target the CUDA build image
 ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
 # Target the CUDA runtime image
@@ -8,28 +8,31 @@ ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_V

 FROM ${BASE_CUDA_DEV_CONTAINER} AS build

-# Unless otherwise specified, we make a fat build.
-ARG CUDA_DOCKER_ARCH=all
+# CUDA architecture to build for (defaults to all supported archs)
+ARG CUDA_DOCKER_ARCH=default

 RUN apt-get update && \
-    apt-get install -y build-essential git
+    apt-get install -y build-essential git cmake

 WORKDIR /app

 COPY . .

-# Set nvcc architecture
-ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable CUDA
-ENV GGML_CUDA=1
-
-RUN make -j$(nproc) llama-cli
+# Use the default CUDA archs if not specified
+RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
+        export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
+    fi && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release --target llama-cli -j$(nproc) && \
+    mkdir -p /app/lib && \
+    find build -name "*.so" -exec cp {} /app/lib \;

 FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime

 RUN apt-get update && \
    apt-get install -y libgomp1

-COPY --from=build /app/llama-cli /llama-cli
+COPY --from=build /app/lib/ /
+COPY --from=build /app/build/bin/llama-cli /

 ENTRYPOINT [ "/llama-cli" ]
--- a/.devops/llama-cli-intel.Dockerfile
+++ b/.devops/llama-cli-intel.Dockerfile
@@ -1,4 +1,4 @@
-ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
+ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04

 FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build

@@ -15,7 +15,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
        export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
    fi && \
    echo "Building with static libs" && \
-    cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \
    ${OPT_SYCL_F16} -DBUILD_SHARED_LIBS=OFF && \
    cmake --build build --config Release --target llama-cli

--- a/.devops/llama-cli-musa.Dockerfile
+++ b/.devops/llama-cli-musa.Dockerfile
@@ -0,0 +1,31 @@
+ARG UBUNTU_VERSION=22.04
+# This needs to generally match the container host's environment.
+ARG MUSA_VERSION=rc3.1.0
+# Target the MUSA build image
+ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+# Target the MUSA runtime image
+ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_MUSA_DEV_CONTAINER} AS build
+
+RUN apt-get update && \
+    apt-get install -y build-essential git cmake
+
+WORKDIR /app
+
+COPY . .
+
+RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release --target llama-cli -j$(nproc) && \
+    mkdir -p /app/lib && \
+    find build -name "*.so" -exec cp {} /app/lib \;
+
+FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime
+
+RUN apt-get update && \
+    apt-get install -y libgomp1
+
+COPY --from=build /app/lib/ /
+COPY --from=build /app/build/bin/llama-cli /llama-cli
+
+ENTRYPOINT [ "/llama-cli" ]
--- a/.devops/llama-cli-rocm.Dockerfile
+++ b/.devops/llama-cli-rocm.Dockerfile
@@ -11,7 +11,7 @@ FROM ${BASE_ROCM_DEV_CONTAINER} AS build
 # Unless otherwise specified, we make a fat build.
 # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
 # This is mostly tied to rocBLAS supported archs.
-ARG ROCM_DOCKER_ARCH=\
+ARG ROCM_DOCKER_ARCH="\
    gfx803 \
    gfx900 \
    gfx906 \
@@ -21,7 +21,7 @@ ARG ROCM_DOCKER_ARCH=\
    gfx1030 \
    gfx1100 \
    gfx1101 \
-    gfx1102
+    gfx1102"

 COPY requirements.txt   requirements.txt
 COPY requirements       requirements
@@ -34,7 +34,7 @@ WORKDIR /app
 COPY . .

 # Set nvcc architecture
-ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
+ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
 # Enable ROCm
 ENV GGML_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
--- a/.devops/llama-cli-vulkan.Dockerfile
+++ b/.devops/llama-cli-vulkan.Dockerfile
@@ -14,7 +14,7 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key
 # Build it
 WORKDIR /app
 COPY . .
-RUN cmake -B build -DGGML_VULKAN=1 && \
+RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 && \
    cmake --build build --config Release --target llama-cli

 # Clean up
--- a/.devops/llama-server-cuda.Dockerfile
+++ b/.devops/llama-server-cuda.Dockerfile
@@ -1,6 +1,6 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
-ARG CUDA_VERSION=11.7.1
+ARG CUDA_VERSION=12.6.0
 # Target the CUDA build image
 ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
 # Target the CUDA runtime image
@@ -8,33 +8,35 @@ ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_V

 FROM ${BASE_CUDA_DEV_CONTAINER} AS build

-# Unless otherwise specified, we make a fat build.
-ARG CUDA_DOCKER_ARCH=all
+# CUDA architecture to build for (defaults to all supported archs)
+ARG CUDA_DOCKER_ARCH=default

 RUN apt-get update && \
-    apt-get install -y build-essential git libcurl4-openssl-dev
+    apt-get install -y build-essential git cmake libcurl4-openssl-dev

 WORKDIR /app

 COPY . .

-# Set nvcc architecture
-ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable CUDA
-ENV GGML_CUDA=1
-# Enable cURL
-ENV LLAMA_CURL=1
-# Must be set to 0.0.0.0 so it can listen to requests from host machine
-ENV LLAMA_ARG_HOST=0.0.0.0
-
-RUN make -j$(nproc) llama-server
+# Use the default CUDA archs if not specified
+RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
+        export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
+    fi && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release --target llama-server -j$(nproc) && \
+    mkdir -p /app/lib && \
+    find build -name "*.so" -exec cp {} /app/lib \;

 FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime

 RUN apt-get update && \
    apt-get install -y libcurl4-openssl-dev libgomp1 curl

-COPY --from=build /app/llama-server /llama-server
+COPY --from=build /app/lib/ /
+COPY --from=build /app/build/bin/llama-server /llama-server
+
+# Must be set to 0.0.0.0 so it can listen to requests from host machine
+ENV LLAMA_ARG_HOST=0.0.0.0

 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

--- a/.devops/llama-server-intel.Dockerfile
+++ b/.devops/llama-server-intel.Dockerfile
@@ -1,4 +1,4 @@
-ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
+ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04

 FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build

@@ -15,7 +15,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
        export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
    fi && \
    echo "Building with dynamic libs" && \
-    cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
    cmake --build build --config Release --target llama-server

 FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
--- a/.devops/llama-server-musa.Dockerfile
+++ b/.devops/llama-server-musa.Dockerfile
@@ -0,0 +1,36 @@
+ARG UBUNTU_VERSION=22.04
+# This needs to generally match the container host's environment.
+ARG MUSA_VERSION=rc3.1.0
+# Target the MUSA build image
+ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+# Target the MUSA runtime image
+ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_MUSA_DEV_CONTAINER} AS build
+
+RUN apt-get update && \
+    apt-get install -y build-essential git cmake libcurl4-openssl-dev
+
+WORKDIR /app
+
+COPY . .
+
+RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release --target llama-server -j$(nproc) && \
+    mkdir -p /app/lib && \
+    find build -name "*.so" -exec cp {} /app/lib \;
+
+FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime
+
+RUN apt-get update && \
+    apt-get install -y libcurl4-openssl-dev libgomp1 curl
+
+COPY --from=build /app/lib/ /
+COPY --from=build /app/build/bin/llama-server /llama-server
+
+# Must be set to 0.0.0.0 so it can listen to requests from host machine
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/llama-server" ]
--- a/.devops/llama-server-rocm.Dockerfile
+++ b/.devops/llama-server-rocm.Dockerfile
@@ -11,7 +11,7 @@ FROM ${BASE_ROCM_DEV_CONTAINER} AS build
 # Unless otherwise specified, we make a fat build.
 # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
 # This is mostly tied to rocBLAS supported archs.
-ARG ROCM_DOCKER_ARCH=\
+ARG ROCM_DOCKER_ARCH="\
    gfx803 \
    gfx900 \
    gfx906 \
@@ -21,7 +21,7 @@ ARG ROCM_DOCKER_ARCH=\
    gfx1030 \
    gfx1100 \
    gfx1101 \
-    gfx1102
+    gfx1102"

 COPY requirements.txt   requirements.txt
 COPY requirements       requirements
@@ -34,7 +34,7 @@ WORKDIR /app
 COPY . .

 # Set nvcc architecture
-ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
+ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
 # Enable ROCm
 ENV GGML_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
--- a/.devops/llama-server-vulkan.Dockerfile
+++ b/.devops/llama-server-vulkan.Dockerfile
@@ -14,7 +14,7 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key
 # Build it
 WORKDIR /app
 COPY . .
-RUN cmake -B build -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
+RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
    cmake --build build --config Release --target llama-server

 # Clean up
--- a/.devops/nix/devshells.nix
+++ b/.devops/nix/devshells.nix
@@ -1,13 +1,52 @@
+{ inputs, ... }:
+
 {
  perSystem =
-    { config, lib, ... }:
+    {
+      config,
+      lib,
+      system,
+      ...
+    }:
    {
      devShells =
-        lib.concatMapAttrs
-          (name: package: {
-            ${name} = package.passthru.shell;
-            ${name + "-extra"} = package.passthru.shell-extra;
-          })
-          config.packages;
+        let
+          pkgs = import inputs.nixpkgs { inherit system; };
+          stdenv = pkgs.stdenv;
+          scripts = config.packages.python-scripts;
+        in
+        lib.pipe (config.packages) [
+          (lib.concatMapAttrs (
+            name: package: {
+              ${name} = pkgs.mkShell {
+                name = "${name}";
+                inputsFrom = [ package ];
+                shellHook = ''
+                  echo "Entering ${name} devShell"
+                '';
+              };
+              "${name}-extra" =
+                if (name == "python-scripts") then
+                  null
+                else
+                  pkgs.mkShell {
+                    name = "${name}-extra";
+                    inputsFrom = [
+                      package
+                      scripts
+                    ];
+                    # Extra packages that *may* be used by some scripts
+                    packages = [
+                        pkgs.python3Packages.tiktoken
+                    ];
+                    shellHook = ''
+                      echo "Entering ${name} devShell"
+                      addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib stdenv.cc.cc}/lib"
+                    '';
+                  };
+            }
+          ))
+          (lib.filterAttrs (name: value: value != null))
+        ];
    };
 }
--- a/.devops/nix/nixpkgs-instances.nix
+++ b/.devops/nix/nixpkgs-instances.nix
@@ -26,16 +26,14 @@
          config.cudaSupport = true;
          config.allowUnfreePredicate =
            p:
-            builtins.all
-              (
-                license:
-                license.free
-                || builtins.elem license.shortName [
-                  "CUDA EULA"
-                  "cuDNN EULA"
-                ]
-              )
-              (p.meta.licenses or [ p.meta.license ]);
+            builtins.all (
+              license:
+              license.free
+              || builtins.elem license.shortName [
+                "CUDA EULA"
+                "cuDNN EULA"
+              ]
+            ) (p.meta.licenses or [ p.meta.license ]);
        };
        # Ensure dependencies use ROCm consistently
        pkgsRocm = import inputs.nixpkgs {
--- a/.devops/nix/package-gguf-py.nix
+++ b/.devops/nix/package-gguf-py.nix
@@ -0,0 +1,36 @@
+{
+  lib,
+  llamaVersion,
+  numpy,
+  tqdm,
+  sentencepiece,
+  pyyaml,
+  poetry-core,
+  buildPythonPackage,
+  pytestCheckHook,
+}:
+
+buildPythonPackage {
+  pname = "gguf";
+  version = llamaVersion;
+  pyproject = true;
+  nativeBuildInputs = [ poetry-core ];
+  propagatedBuildInputs = [
+    numpy
+    tqdm
+    sentencepiece
+    pyyaml
+  ];
+  src = lib.cleanSource ../../gguf-py;
+  pythonImportsCheck = [
+    "numpy"
+    "gguf"
+  ];
+  nativeCheckInputs = [ pytestCheckHook ];
+  doCheck = true;
+  meta = with lib; {
+    description = "Python package for writing binary files in the GGUF format";
+    license = licenses.mit;
+    maintainers = [ maintainers.ditsuke ];
+  };
+}
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@@ -3,31 +3,33 @@
  glibc,
  config,
  stdenv,
-  mkShell,
  runCommand,
  cmake,
  ninja,
  pkg-config,
  git,
-  python3,
  mpi,
  blas,
  cudaPackages,
+  autoAddDriverRunpath,
  darwin,
  rocmPackages,
  vulkan-headers,
  vulkan-loader,
  curl,
  shaderc,
-  useBlas ? builtins.all (x: !x) [
-    useCuda
-    useMetalKit
-    useRocm
-    useVulkan
-  ] && blas.meta.available,
+  useBlas ?
+    builtins.all (x: !x) [
+      useCuda
+      useMetalKit
+      useRocm
+      useVulkan
+    ]
+    && blas.meta.available,
  useCuda ? config.cudaSupport,
  useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin,
-  useMpi ? false, # Increases the runtime closure size by ~700M
+  # Increases the runtime closure size by ~700M
+  useMpi ? false,
  useRocm ? config.rocmSupport,
  enableCurl ? true,
  useVulkan ? false,
@@ -37,8 +39,8 @@
  # otherwise we get libstdc++ errors downstream.
  effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv,
  enableStatic ? effectiveStdenv.hostPlatform.isStatic,
-  precompileMetalShaders ? false
-}@inputs:
+  precompileMetalShaders ? false,
+}:

 let
  inherit (lib)
@@ -46,7 +48,6 @@ let
    cmakeFeature
    optionals
    strings
-    versionOlder
    ;

  stdenv = throw "Use effectiveStdenv instead";
@@ -62,54 +63,11 @@ let
  pnameSuffix =
    strings.optionalString (suffices != [ ])
      "-${strings.concatMapStringsSep "-" strings.toLower suffices}";
-  descriptionSuffix =
-    strings.optionalString (suffices != [ ])
-      ", accelerated with ${strings.concatStringsSep ", " suffices}";
+  descriptionSuffix = strings.optionalString (
+    suffices != [ ]
+  ) ", accelerated with ${strings.concatStringsSep ", " suffices}";

-  executableSuffix = effectiveStdenv.hostPlatform.extensions.executable;
-
-  # TODO: package the Python in this repository in a Nix-like way.
-  # It'd be nice to migrate to buildPythonPackage, as well as ensure this repo
-  # is PEP 517-compatible, and ensure the correct .dist-info is generated.
-  # https://peps.python.org/pep-0517/
-  #
-  # TODO: Package up each Python script or service appropriately, by making
-  # them into "entrypoints"
-  llama-python = python3.withPackages (
-    ps: [
-      ps.numpy
-      ps.sentencepiece
-    ]
-  );
-
-  # TODO(Green-Sky): find a better way to opt-into the heavy ml python runtime
-  llama-python-extra = python3.withPackages (
-    ps: [
-      ps.numpy
-      ps.sentencepiece
-      ps.tiktoken
-      ps.torchWithoutCuda
-      ps.transformers
-
-      # server bench
-      ps.matplotlib
-
-      # server tests
-      ps.openai
-      ps.behave
-      ps.prometheus-client
-
-      # for examples/pydantic-models-to-grammar-examples.py
-      ps.docstring-parser
-      ps.pydantic
-
-      # for scripts/compare-llama-bench.py
-      ps.gitpython
-      ps.tabulate
-    ]
-  );
-
-  xcrunHost = runCommand "xcrunHost" {} ''
+  xcrunHost = runCommand "xcrunHost" { } ''
    mkdir -p $out/bin
    ln -s /usr/bin/xcrun $out/bin
  '';
@@ -144,181 +102,145 @@ let
  ];
 in

-effectiveStdenv.mkDerivation (
-  finalAttrs: {
-    pname = "llama-cpp${pnameSuffix}";
-    version = llamaVersion;
+effectiveStdenv.mkDerivation (finalAttrs: {
+  pname = "llama-cpp${pnameSuffix}";
+  version = llamaVersion;

-    # Note: none of the files discarded here are visible in the sandbox or
-    # affect the output hash. This also means they can be modified without
-    # triggering a rebuild.
-    src = lib.cleanSourceWith {
-      filter =
-        name: type:
-        let
-          noneOf = builtins.all (x: !x);
-          baseName = baseNameOf name;
-        in
-        noneOf [
-          (lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths
-          (lib.hasSuffix ".md" name) # Ignore *.md changes whe computing outPaths
-          (lib.hasPrefix "." baseName) # Skip hidden files and directories
-          (baseName == "flake.lock")
-        ];
-      src = lib.cleanSource ../../.;
-    };
-
-    postPatch = ''
-      substituteInPlace ./ggml/src/ggml-metal.m \
-        --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
-      substituteInPlace ./ggml/src/ggml-metal.m \
-        --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
-    '';
-
-    # With PR#6015 https://github.com/ggerganov/llama.cpp/pull/6015,
-    # `default.metallib` may be compiled with Metal compiler from XCode
-    # and we need to escape sandbox on MacOS to access Metal compiler.
-    # `xcrun` is used find the path of the Metal compiler, which is varible
-    # and not on $PATH
-    # see https://github.com/ggerganov/llama.cpp/pull/6118 for discussion
-    __noChroot = effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders;
-
-    nativeBuildInputs =
-      [
-        cmake
-        ninja
-        pkg-config
-        git
-      ]
-      ++ optionals useCuda [
-        cudaPackages.cuda_nvcc
-
-        # TODO: Replace with autoAddDriverRunpath
-        # once https://github.com/NixOS/nixpkgs/pull/275241 has been merged
-        cudaPackages.autoAddOpenGLRunpathHook
-      ]
-      ++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [
-        glibc.static
-      ] ++ optionals (effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders) [
-        xcrunHost
+  # Note: none of the files discarded here are visible in the sandbox or
+  # affect the output hash. This also means they can be modified without
+  # triggering a rebuild.
+  src = lib.cleanSourceWith {
+    filter =
+      name: type:
+      let
+        noneOf = builtins.all (x: !x);
+        baseName = baseNameOf name;
+      in
+      noneOf [
+        (lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths
+        (lib.hasSuffix ".md" name) # Ignore *.md changes whe computing outPaths
+        (lib.hasPrefix "." baseName) # Skip hidden files and directories
+        (baseName == "flake.lock")
      ];
+    src = lib.cleanSource ../../.;
+  };

-    buildInputs =
-      optionals effectiveStdenv.isDarwin darwinBuildInputs
-      ++ optionals useCuda cudaBuildInputs
-      ++ optionals useMpi [ mpi ]
-      ++ optionals useRocm rocmBuildInputs
-      ++ optionals useBlas [ blas ]
-      ++ optionals useVulkan vulkanBuildInputs
-      ++ optionals enableCurl [ curl ];
+  postPatch = ''
+    substituteInPlace ./ggml/src/ggml-metal/ggml-metal.m \
+      --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
+    substituteInPlace ./ggml/src/ggml-metal/ggml-metal.m \
+      --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
+  '';

-    cmakeFlags =
-      [
-        (cmakeBool "LLAMA_BUILD_SERVER" true)
-        (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
-        (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
-        (cmakeBool "LLAMA_CURL" enableCurl)
-        (cmakeBool "GGML_NATIVE" false)
-        (cmakeBool "GGML_BLAS" useBlas)
-        (cmakeBool "GGML_CUDA" useCuda)
-        (cmakeBool "GGML_HIPBLAS" useRocm)
-        (cmakeBool "GGML_METAL" useMetalKit)
-        (cmakeBool "GGML_VULKAN" useVulkan)
-        (cmakeBool "GGML_STATIC" enableStatic)
-      ]
-      ++ optionals useCuda [
-        (
-          with cudaPackages.flags;
-          cmakeFeature "CMAKE_CUDA_ARCHITECTURES" (
-            builtins.concatStringsSep ";" (map dropDot cudaCapabilities)
-          )
+  # With PR#6015 https://github.com/ggerganov/llama.cpp/pull/6015,
+  # `default.metallib` may be compiled with Metal compiler from XCode
+  # and we need to escape sandbox on MacOS to access Metal compiler.
+  # `xcrun` is used find the path of the Metal compiler, which is varible
+  # and not on $PATH
+  # see https://github.com/ggerganov/llama.cpp/pull/6118 for discussion
+  __noChroot = effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders;
+
+  nativeBuildInputs =
+    [
+      cmake
+      ninja
+      pkg-config
+      git
+    ]
+    ++ optionals useCuda [
+      cudaPackages.cuda_nvcc
+
+      autoAddDriverRunpath
+    ]
+    ++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [ glibc.static ]
+    ++ optionals (effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders) [ xcrunHost ];
+
+  buildInputs =
+    optionals effectiveStdenv.isDarwin darwinBuildInputs
+    ++ optionals useCuda cudaBuildInputs
+    ++ optionals useMpi [ mpi ]
+    ++ optionals useRocm rocmBuildInputs
+    ++ optionals useBlas [ blas ]
+    ++ optionals useVulkan vulkanBuildInputs
+    ++ optionals enableCurl [ curl ];
+
+  cmakeFlags =
+    [
+      (cmakeBool "LLAMA_BUILD_SERVER" true)
+      (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
+      (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
+      (cmakeBool "LLAMA_CURL" enableCurl)
+      (cmakeBool "GGML_NATIVE" false)
+      (cmakeBool "GGML_BLAS" useBlas)
+      (cmakeBool "GGML_CUDA" useCuda)
+      (cmakeBool "GGML_HIP" useRocm)
+      (cmakeBool "GGML_METAL" useMetalKit)
+      (cmakeBool "GGML_VULKAN" useVulkan)
+      (cmakeBool "GGML_STATIC" enableStatic)
+    ]
+    ++ optionals useCuda [
+      (
+        with cudaPackages.flags;
+        cmakeFeature "CMAKE_CUDA_ARCHITECTURES" (
+          builtins.concatStringsSep ";" (map dropDot cudaCapabilities)
        )
-      ]
-      ++ optionals useRocm [
-        (cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
-        (cmakeFeature "CMAKE_HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets))
-      ]
-      ++ optionals useMetalKit [
-        (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
-        (cmakeBool "GGML_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
-      ];
+      )
+    ]
+    ++ optionals useRocm [
+      (cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
+      (cmakeFeature "CMAKE_HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets))
+    ]
+    ++ optionals useMetalKit [
+      (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
+      (cmakeBool "GGML_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
+    ];

-    # Environment variables needed for ROCm
-    env = optionals useRocm {
-      ROCM_PATH = "${rocmPackages.clr}";
-      HIP_DEVICE_LIB_PATH = "${rocmPackages.rocm-device-libs}/amdgcn/bitcode";
-    };
+  # Environment variables needed for ROCm
+  env = optionals useRocm {
+    ROCM_PATH = "${rocmPackages.clr}";
+    HIP_DEVICE_LIB_PATH = "${rocmPackages.rocm-device-libs}/amdgcn/bitcode";
+  };

-    # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
-    # if they haven't been added yet.
-    postInstall = ''
-      mkdir -p $out/include
-      cp $src/include/llama.h $out/include/
-    '';
+  # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
+  # if they haven't been added yet.
+  postInstall = ''
+    mkdir -p $out/include
+    cp $src/include/llama.h $out/include/
+  '';

-    # Define the shells here, but don't add in the inputsFrom to avoid recursion.
-    passthru = {
-      inherit
-        useBlas
-        useCuda
-        useMetalKit
-        useMpi
-        useRocm
-        useVulkan
-        ;
+  meta = {
+    # Configurations we don't want even the CI to evaluate. Results in the
+    # "unsupported platform" messages. This is mostly a no-op, because
+    # cudaPackages would've refused to evaluate anyway.
+    badPlatforms = optionals useCuda lib.platforms.darwin;

-      shell = mkShell {
-        name = "shell-${finalAttrs.finalPackage.name}";
-        description = "contains numpy and sentencepiece";
-        buildInputs = [ llama-python ];
-        inputsFrom = [ finalAttrs.finalPackage ];
-        shellHook = ''
-          addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib effectiveStdenv.cc.cc}/lib"
-        '';
-      };
+    # Configurations that are known to result in build failures. Can be
+    # overridden by importing Nixpkgs with `allowBroken = true`.
+    broken = (useMetalKit && !effectiveStdenv.isDarwin);

-      shell-extra = mkShell {
-        name = "shell-extra-${finalAttrs.finalPackage.name}";
-        description = "contains numpy, sentencepiece, torchWithoutCuda, and transformers";
-        buildInputs = [ llama-python-extra ];
-        inputsFrom = [ finalAttrs.finalPackage ];
-      };
-    };
+    description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
+    homepage = "https://github.com/ggerganov/llama.cpp/";
+    license = lib.licenses.mit;

-    meta = {
-      # Configurations we don't want even the CI to evaluate. Results in the
-      # "unsupported platform" messages. This is mostly a no-op, because
-      # cudaPackages would've refused to evaluate anyway.
-      badPlatforms = optionals useCuda lib.platforms.darwin;
+    # Accommodates `nix run` and `lib.getExe`
+    mainProgram = "llama-cli";

-      # Configurations that are known to result in build failures. Can be
-      # overridden by importing Nixpkgs with `allowBroken = true`.
-      broken = (useMetalKit && !effectiveStdenv.isDarwin);
+    # These people might respond, on the best effort basis, if you ping them
+    # in case of Nix-specific regressions or for reviewing Nix-specific PRs.
+    # Consider adding yourself to this list if you want to ensure this flake
+    # stays maintained and you're willing to invest your time. Do not add
+    # other people without their consent. Consider removing people after
+    # they've been unreachable for long periods of time.

-      description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
-      homepage = "https://github.com/ggerganov/llama.cpp/";
-      license = lib.licenses.mit;
+    # Note that lib.maintainers is defined in Nixpkgs, but you may just add
+    # an attrset following the same format as in
+    # https://github.com/NixOS/nixpkgs/blob/f36a80e54da29775c78d7eff0e628c2b4e34d1d7/maintainers/maintainer-list.nix
+    maintainers = with lib.maintainers; [
+      philiptaron
+      SomeoneSerge
+    ];

-      # Accommodates `nix run` and `lib.getExe`
-      mainProgram = "llama-cli";
-
-      # These people might respond, on the best effort basis, if you ping them
-      # in case of Nix-specific regressions or for reviewing Nix-specific PRs.
-      # Consider adding yourself to this list if you want to ensure this flake
-      # stays maintained and you're willing to invest your time. Do not add
-      # other people without their consent. Consider removing people after
-      # they've been unreachable for long periods of time.
-
-      # Note that lib.maintainers is defined in Nixpkgs, but you may just add
-      # an attrset following the same format as in
-      # https://github.com/NixOS/nixpkgs/blob/f36a80e54da29775c78d7eff0e628c2b4e34d1d7/maintainers/maintainer-list.nix
-      maintainers = with lib.maintainers; [
-        philiptaron
-        SomeoneSerge
-      ];
-
-      # Extend `badPlatforms` instead
-      platforms = lib.platforms.all;
-    };
-  }
-)
+    # Extend `badPlatforms` instead
+    platforms = lib.platforms.all;
+  };
+})
--- a/.devops/nix/python-scripts.nix
+++ b/.devops/nix/python-scripts.nix
@@ -0,0 +1,66 @@
+{
+  lib,
+  stdenv,
+  buildPythonPackage,
+  poetry-core,
+  mkShell,
+  python3Packages,
+  gguf-py,
+}@inputs:
+
+let
+  llama-python-deps = with python3Packages; [
+    numpy
+    sentencepiece
+    transformers
+    protobuf
+    torchWithoutCuda
+    gguf-py
+    tqdm
+
+    # for scripts/compare-llama-bench.py
+    gitpython
+    tabulate
+
+    # for examples/pydantic-models-to-grammar-examples.py
+    docstring-parser
+    pydantic
+
+  ];
+
+  llama-python-test-deps = with python3Packages; [
+    # Server bench
+    matplotlib
+
+    # server tests
+    openai
+    behave
+    prometheus-client
+  ];
+in
+
+buildPythonPackage ({
+  pname = "llama-scripts";
+  version = "0.0.0";
+  pyproject = true;
+
+  # NOTE: The files filtered out here are not visible in the build sandbox, neither
+  # do they affect the output hash. They can be modified without triggering a rebuild.
+  src = lib.cleanSourceWith {
+    filter =
+      name: type:
+      let
+        any = builtins.any (x: x);
+        baseName = builtins.baseNameOf name;
+      in
+      any [
+        (lib.hasSuffix ".py" name)
+        (baseName == "README.md")
+        (baseName == "pyproject.toml")
+      ];
+    src = lib.cleanSource ../../.;
+  };
+  nativeBuildInputs = [ poetry-core ];
+  nativeCheckInputs = llama-python-test-deps;
+  dependencies = llama-python-deps;
+})
--- a/.devops/nix/scope.nix
+++ b/.devops/nix/scope.nix
@@ -1,19 +1,41 @@
 {
  lib,
  newScope,
+  python3,
  llamaVersion ? "0.0.0",
 }:

+let
+  pythonPackages = python3.pkgs;
+  buildPythonPackage = pythonPackages.buildPythonPackage;
+  numpy = pythonPackages.numpy;
+  tqdm = pythonPackages.tqdm;
+  sentencepiece = pythonPackages.sentencepiece;
+  pyyaml = pythonPackages.pyyaml;
+  poetry-core = pythonPackages.poetry-core;
+  pytestCheckHook = pythonPackages.pytestCheckHook;
+in
+
 # We're using `makeScope` instead of just writing out an attrset
 # because it allows users to apply overlays later using `overrideScope'`.
 # Cf. https://noogle.dev/f/lib/makeScope

-lib.makeScope newScope (
-  self: {
-    inherit llamaVersion;
-    llama-cpp = self.callPackage ./package.nix { };
-    docker = self.callPackage ./docker.nix { };
-    docker-min = self.callPackage ./docker.nix { interactive = false; };
-    sif = self.callPackage ./sif.nix { };
-  }
-)
+lib.makeScope newScope (self: {
+  inherit llamaVersion;
+  gguf-py = self.callPackage ./package-gguf-py.nix {
+    inherit
+      buildPythonPackage
+      numpy
+      tqdm
+      sentencepiece
+      poetry-core
+      pyyaml
+      pytestCheckHook
+      ;
+  };
+  python-scripts = self.callPackage ./python-scripts.nix { inherit buildPythonPackage poetry-core; };
+  llama-cpp = self.callPackage ./package.nix { };
+  docker = self.callPackage ./docker.nix { };
+  docker-min = self.callPackage ./docker.nix { interactive = false; };
+  sif = self.callPackage ./sif.nix { };
+})
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,7 +1,7 @@
 *.o
 *.a
 .cache/
-.git/
+# Do not ignore .git directory, otherwise the reported build number will always be 0
 .github/
 .gitignore
 .vs/
--- a/.editorconfig
+++ b/.editorconfig
@@ -24,6 +24,16 @@ insert_final_newline = unset
 [examples/server/public/*]
 indent_size = 2

+[examples/server/public/deps_*]
+trim_trailing_whitespace = unset
+indent_style = unset
+indent_size = unset
+
+[examples/server/deps_*]
+trim_trailing_whitespace = unset
+indent_style = unset
+indent_size = unset
+
 [examples/llama.swiftui/llama.swiftui.xcodeproj/*]
 indent_style = tab

--- a/.github/workflows/bench.yml.disabled
+++ b/.github/workflows/bench.yml.disabled
@@ -27,10 +27,10 @@ on:
  push:
    branches:
      - master
-    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
+    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
  pull_request_target:
    types: [opened, synchronize, reopened]
-    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
+    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
  schedule:
    -  cron: '04 2 * * *'

--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -19,10 +19,18 @@ concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

+# Fine-grant permission
+# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
+permissions:
+  contents: write # for creating release
+
 env:
  BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
  GGML_NLOOP: 3
  GGML_N_THREADS: 1
+  LLAMA_LOG_COLORS: 1
+  LLAMA_LOG_PREFIX: 1
+  LLAMA_LOG_TIMESTAMPS: 1

 jobs:
  macOS-latest-cmake-arm64:
@@ -47,7 +55,13 @@ jobs:
          sysctl -a
          mkdir build
          cd build
-          cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF ..
+          cmake .. \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DLLAMA_CURL=ON \
+            -DGGML_METAL_USE_BF16=ON \
+            -DGGML_METAL_EMBED_LIBRARY=ON \
+            -DGGML_RPC=ON \
+            -DBUILD_SHARED_LIBS=OFF
          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

      - name: Test
@@ -84,7 +98,7 @@ jobs:
          name: llama-bin-macos-arm64.zip

  macOS-latest-cmake-x64:
-    runs-on: macos-12
+    runs-on: macos-13

    steps:
      - name: Clone
@@ -105,7 +119,12 @@ jobs:
          sysctl -a
          # Metal is disabled due to intermittent failures with Github runners not having a GPU:
          # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
-          cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF
+          cmake -B build \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DLLAMA_CURL=ON \
+            -DGGML_METAL=OFF \
+            -DGGML_RPC=ON \
+            -DBUILD_SHARED_LIBS=OFF
          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)

      - name: Test
@@ -375,7 +394,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

      - name: Dependencies
        id: depends
@@ -386,22 +405,43 @@ jobs:
      - name: Build with native CMake HIP support
        id: cmake_build
        run: |
-          cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DGGML_HIPBLAS=ON
+          cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DGGML_HIP=ON
          cmake --build build --config Release -j $(nproc)

      - name: Build with legacy HIP support
        id: cmake_build_legacy_hip
        run: |
-          cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIPBLAS=ON
+          cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIP=ON
          cmake --build build2 --config Release -j $(nproc)

+  ubuntu-22-cmake-musa:
+    runs-on: ubuntu-22.04
+    container: mthreads/musa:rc3.1.0-devel-ubuntu22.04
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Dependencies
+        id: depends
+        run: |
+          apt-get update
+          apt-get install -y build-essential git cmake libcurl4-openssl-dev
+
+      - name: Build with native CMake MUSA support
+        id: cmake_build
+        run: |
+          cmake -B build -S . -DGGML_MUSA=ON
+          cmake --build build --config Release -j $(nproc)
+
  ubuntu-22-cmake-sycl:
    runs-on: ubuntu-22.04

    continue-on-error: true

    steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4

      - name: add oneAPI to apt
        shell: bash
@@ -442,7 +482,7 @@ jobs:
    continue-on-error: true

    steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4

      - name: add oneAPI to apt
        shell: bash
@@ -546,7 +586,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v1
+        uses: actions/checkout@v4

      - name: Dependencies
        id: depends
@@ -561,6 +601,7 @@ jobs:
          mkdir build
          cd build
          cmake -G Xcode .. \
+            -DGGML_METAL_USE_BF16=ON \
            -DGGML_METAL_EMBED_LIBRARY=ON \
            -DLLAMA_BUILD_EXAMPLES=OFF \
            -DLLAMA_BUILD_TESTS=OFF \
@@ -576,7 +617,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v1
+        uses: actions/checkout@v4

      - name: Dependencies
        id: depends
@@ -591,6 +632,7 @@ jobs:
          mkdir build
          cd build
          cmake -G Xcode .. \
+            -DGGML_METAL_USE_BF16=ON \
            -DGGML_METAL_EMBED_LIBRARY=ON \
            -DLLAMA_BUILD_EXAMPLES=OFF \
            -DLLAMA_BUILD_TESTS=OFF \
@@ -610,7 +652,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v1
+        uses: actions/checkout@v4

      - name: Dependencies
        id: depends
@@ -726,7 +768,7 @@ jobs:
        id: clone_kompute
        if: ${{ matrix.build == 'kompute-x64' }}
        run: |
-          git submodule update --init ggml/src/kompute
+          git submodule update --init ggml/src/ggml-kompute/kompute

      - name: Download OpenBLAS
        id: get_openblas
@@ -857,7 +899,7 @@ jobs:
        run: |
          mkdir build
          cd build
-          cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON
+          cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON
          cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1)) -t ggml
          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}

@@ -909,7 +951,7 @@ jobs:
        shell: bash

    env:
-      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7dff44ba-e3af-4448-841c-0d616c8da6e7/w_BaseKit_p_2024.1.0.595_offline.exe
+      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b380d914-366b-4b77-a74a-05e3c38b3514/intel-oneapi-base-toolkit-2025.0.0.882_offline.exe
      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel
      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
    steps:
@@ -953,6 +995,7 @@ jobs:
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl7.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
          echo "cp oneAPI running time dll files to ./build/bin done"
          7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*

@@ -964,19 +1007,20 @@ jobs:
          name: llama-bin-win-sycl-x64.zip

  windows-latest-cmake-hip:
+    if: ${{ github.event.inputs.create_release != 'true' }}
    runs-on: windows-latest

    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

      - name: Install
        id: depends
        run: |
          $ErrorActionPreference = "Stop"
          write-host "Downloading AMD HIP SDK Installer"
-          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-23.Q4-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
+          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
          write-host "Installing AMD HIP SDK"
          Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
          write-host "Completed AMD HIP SDK installation"
@@ -991,8 +1035,72 @@ jobs:
        run: |
          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
          $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
-          cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON
-          cmake --build build --config Release
+          cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIP=ON -DCMAKE_BUILD_TYPE=Release -DGGML_RPC=ON
+          cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
+
+  windows-latest-cmake-hip-release:
+    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+    runs-on: windows-latest
+
+    strategy:
+      matrix:
+        gpu_target: [gfx1100, gfx1101, gfx1030]
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Install
+        id: depends
+        run: |
+          $ErrorActionPreference = "Stop"
+          write-host "Downloading AMD HIP SDK Installer"
+          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
+          write-host "Installing AMD HIP SDK"
+          Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
+          write-host "Completed AMD HIP SDK installation"
+
+      - name: Verify ROCm
+        id: verify
+        run: |
+          & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
+
+      - name: Build
+        id: cmake_build
+        run: |
+          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
+          $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
+          cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIP=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=${{ matrix.gpu_target }} -DGGML_RPC=ON
+          cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
+          md "build\bin\rocblas\library\"
+          cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
+          cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
+          cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
+
+      - name: Determine tag name
+        id: tag
+        shell: bash
+        run: |
+          BUILD_NUMBER="$(git rev-list --count HEAD)"
+          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
+            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
+          else
+            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
+            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        run: |
+          7z a llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\*
+
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
+          name: llama-bin-win-hip-x64-${{ matrix.gpu_target }}.zip

  ios-xcode-build:
    runs-on: macos-latest
@@ -1057,6 +1165,7 @@ jobs:
      - macOS-latest-cmake
      - windows-latest-cmake
      - windows-latest-cmake-cuda
+      - windows-latest-cmake-hip-release
      - macOS-latest-cmake-arm64
      - macOS-latest-cmake-x64

--- a/.github/workflows/close-issue.yml
+++ b/.github/workflows/close-issue.yml
@@ -3,6 +3,11 @@ on:
  schedule:
    - cron: "42 0 * * *"

+# Fine-grant permission
+# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
+permissions:
+  issues: write
+
 jobs:
  close-issues:
    runs-on: ubuntu-latest
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -15,11 +15,17 @@ on:
    branches:
      - master
    paths: ['.github/workflows/docker.yml', '.devops/*.Dockerfile', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
+  workflow_dispatch: # allows manual triggering, useful for debugging

 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

+# Fine-grant permission
+# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
+permissions:
+  packages: write
+
 jobs:
  push_to_registry:
    name: Push Docker image to Docker Hub
@@ -37,15 +43,20 @@ jobs:
          - { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" }
          - { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" }
          - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
-          - { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          - { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          # Note: the full-rocm image is failing due to a "no space left on device" error. It is disabled for now to allow the workflow to complete.
+          - { tag: "light-musa", dockerfile: ".devops/llama-cli-musa.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "server-musa", dockerfile: ".devops/llama-server-musa.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "full-musa", dockerfile: ".devops/full-musa.Dockerfile", platforms: "linux/amd64" }
+          # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
+          #- { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          #- { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          #- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          - { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" }
          - { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" }
    steps:
      - name: Check out the repo
        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0 # preserve git history, so we can determine the build number

      - name: Set up QEMU
        uses: docker/setup-qemu-action@v2
@@ -60,6 +71,34 @@ jobs:
          username: ${{ github.repository_owner }}
          password: ${{ secrets.GITHUB_TOKEN }}

+      - name: Determine tag name
+        id: tag
+        shell: bash
+        run: |
+          BUILD_NUMBER="$(git rev-list --count HEAD)"
+          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+          REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}"  # to lower case
+          REPO_NAME="${{ github.event.repository.name }}"
+
+          # determine tag name postfix (build number, commit hash)
+          if [[ "${{ env.GITHUB_BRANCH_NAME }}" == "master" ]]; then
+            TAG_POSTFIX="b${BUILD_NUMBER}"
+          else
+            SAFE_NAME=$(echo "${{ env.GITHUB_BRANCH_NAME }}" | tr '/' '-')
+            TAG_POSTFIX="${SAFE_NAME}-${SHORT_HASH}"
+          fi
+
+          # list all tags possible
+          TAGS=""
+          TAGS="${TAGS}ghcr.io/${REPO_OWNER}/${REPO_NAME}:${{ matrix.config.tag }},"
+          TAGS="${TAGS}ghcr.io/${REPO_OWNER}/${REPO_NAME}:${{ matrix.config.tag }}-${TAG_POSTFIX}"
+
+          echo "output_tags=$TAGS" >> $GITHUB_OUTPUT
+          echo "output_tags=$TAGS"  # print out for debugging
+        env:
+          GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
+          GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
+
      # https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example
      - name: Free Disk Space (Ubuntu)
        uses: jlumbroso/free-disk-space@main
@@ -77,40 +116,13 @@ jobs:
          docker-images: true
          swap-storage: true

-      - name: Determine tag name
-        id: tag
-        shell: bash
-        run: |
-          BUILD_NUMBER="$(git rev-list --count HEAD)"
-          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
-          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
-            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
-          else
-            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
-            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
-          fi
-
-      - name: Downcase github.repository_owner
-        run: |
-          echo "repository_owner_lowercase=${GITHUB_REPOSITORY_OWNER@L}" >> $GITHUB_ENV
-        env:
-          GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
-
-      - name: Build and push Docker image (versioned)
+      - name: Build and push Docker image (tagged + versioned)
        if: github.event_name == 'push'
-        uses: docker/build-push-action@v4
+        uses: docker/build-push-action@v6
        with:
          context: .
          push: true
          platforms: ${{ matrix.config.platforms }}
-          tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
-          file: ${{ matrix.config.dockerfile }}
-
-      - name: Build and push Docker image (tagged)
-        uses: docker/build-push-action@v4
-        with:
-          context: .
-          push: ${{ github.event_name == 'push' }}
-          platforms: ${{ matrix.config.platforms }}
-          tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
+          # tag list is generated from step above
+          tags: ${{ steps.tag.outputs.output_tags }}
          file: ${{ matrix.config.dockerfile }}
--- a/.github/workflows/nix-ci-aarch64.yml
+++ b/.github/workflows/nix-ci-aarch64.yml
@@ -21,6 +21,13 @@ concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

+# Fine-grant permission
+# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
+permissions:
+  # https://github.com/DeterminateSystems/nix-installer-action?tab=readme-ov-file#with-flakehub
+  id-token: write
+  contents: read
+
 jobs:
  nix-build-aarch64:
    runs-on: ubuntu-latest
--- a/.github/workflows/nix-ci.yml
+++ b/.github/workflows/nix-ci.yml
@@ -12,6 +12,13 @@ concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

+# Fine-grant permission
+# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
+permissions:
+  # https://github.com/DeterminateSystems/nix-installer-action?tab=readme-ov-file#with-flakehub
+  id-token: write
+  contents: read
+
 jobs:
  nix-eval:
    strategy:
--- a/.github/workflows/python-type-check.yml
+++ b/.github/workflows/python-type-check.yml
@@ -4,11 +4,13 @@ on:
  push:
    paths:
      - '.github/workflows/python-type-check.yml'
+      - 'pyrightconfig.json'
      - '**.py'
      - '**/requirements*.txt'
  pull_request:
    paths:
      - '.github/workflows/python-type-check.yml'
+      - 'pyrightconfig.json'
      - '**.py'
      - '**/requirements*.txt'

@@ -33,6 +35,6 @@ jobs:
      - name: Type-check with Pyright
        uses: jakebailey/pyright-action@v2
        with:
-          version: 1.1.370
+          version: 1.1.382
          level: warning
          warnings: true
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -20,6 +20,12 @@ on:
    types: [opened, synchronize, reopened]
    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']

+env:
+  LLAMA_LOG_COLORS: 1
+  LLAMA_LOG_PREFIX: 1
+  LLAMA_LOG_TIMESTAMPS: 1
+  LLAMA_LOG_VERBOSITY: 10
+
 concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true
@@ -173,6 +179,7 @@ jobs:
        if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
        run: |
          cd examples/server/tests
+          $env:PYTHONIOENCODING = ":replace"
          behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp

      - name: Slow tests
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,7 @@
 *.a
 *.bat
 *.bin
+*.d
 *.dll
 *.dot
 *.etag
@@ -61,6 +62,7 @@ llama-batched-swift
 /rpc-server
 out/
 tmp/
+autogen-*.md

 # Deprecated

@@ -132,3 +134,7 @@ poetry.toml

 # Test models for lora adapters
 /lora-tests
+
+# Local scripts
+/run-vim.sh
+/run-chat.sh
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,3 @@
 [submodule "kompute"]
-	path = ggml/src/kompute
+	path = ggml/src/ggml-kompute/kompute
 	url = https://github.com/nomic-ai/kompute.git
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -62,6 +62,9 @@ option(LLAMA_SANITIZE_THREAD    "llama: enable thread sanitizer"    OFF)
 option(LLAMA_SANITIZE_ADDRESS   "llama: enable address sanitizer"   OFF)
 option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)

+# utils
+option(LLAMA_BUILD_COMMON "llama: build common utils library" ${LLAMA_STANDALONE})
+
 # extra artifacts
 option(LLAMA_BUILD_TESTS    "llama: build tests"          ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples"       ${LLAMA_STANDALONE})
@@ -82,11 +85,15 @@ set(GGML_FATAL_WARNINGS     ${LLAMA_FATAL_WARNINGS})

 # change the default for these ggml options
 if (NOT DEFINED GGML_LLAMAFILE)
-    set(GGML_LLAMAFILE ON)
+    set(GGML_LLAMAFILE_DEFAULT ON)
 endif()

-if (NOT DEFINED GGML_CUDA_USE_GRAPHS)
-    set(GGML_CUDA_USE_GRAPHS ON)
+if (NOT DEFINED GGML_AMX)
+    set(GGML_AMX ON)
+endif()
+
+if (NOT DEFINED GGML_CUDA_GRAPHS)
+    set(GGML_CUDA_GRAPHS_DEFAULT ON)
 endif()

 # transition helpers
@@ -133,16 +140,21 @@ set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location o
 set(LLAMA_LIB_INSTALL_DIR     ${CMAKE_INSTALL_LIBDIR}     CACHE PATH "Location of library files")
 set(LLAMA_BIN_INSTALL_DIR     ${CMAKE_INSTALL_BINDIR}     CACHE PATH "Location of binary  files")

-
 # At the moment some compile definitions are placed within the ggml/src
 # directory but not exported on the `ggml` target. This could be improved by
 # determining _precisely_ which defines are necessary for the llama-config
 # package.
 #
+set(GGML_TRANSIENT_DEFINES)
 get_target_property(GGML_DIRECTORY ggml SOURCE_DIR)
 get_directory_property(GGML_DIR_DEFINES DIRECTORY ${GGML_DIRECTORY} COMPILE_DEFINITIONS)
+if (GGML_DIR_DEFINES)
+    list(APPEND GGML_TRANSIENT_DEFINES ${GGML_DIR_DEFINES})
+endif()
 get_target_property(GGML_TARGET_DEFINES ggml COMPILE_DEFINITIONS)
-set(GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES} ${GGML_DIR_DEFINES})
+if (GGML_TARGET_DEFINES)
+    list(APPEND GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES})
+endif()
 get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)

 set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h)
@@ -185,17 +197,19 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
        DESTINATION lib/pkgconfig)

 #
-# programs, examples and tests
+# utils, programs, examples and tests
 #

-add_subdirectory(common)
+if (LLAMA_BUILD_COMMON)
+    add_subdirectory(common)
+endif()

-if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
+if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
    include(CTest)
    add_subdirectory(tests)
-endif ()
+endif()

-if (LLAMA_BUILD_EXAMPLES)
+if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES)
    add_subdirectory(examples)
    add_subdirectory(pocs)
 endif()
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -24,16 +24,17 @@
            "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
        }
    },
-    { "name": "debug",   "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
-    { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
-    { "name": "reldbg",  "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
-    { "name": "static",  "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } },
-    { "name": "sycl_f16",  "hidden": true, "cacheVariables": { "GGML_SYCL_F16": "ON" } },
+    { "name": "debug",    "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
+    { "name": "release",  "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
+    { "name": "reldbg",   "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
+    { "name": "static",   "hidden": true, "cacheVariables": { "GGML_STATIC":      "ON" } },
+    { "name": "sycl_f16", "hidden": true, "cacheVariables": { "GGML_SYCL_F16":    "ON" } },
+    { "name": "vulkan",   "hidden": true, "cacheVariables": { "GGML_VULKAN":      "ON" } },

    {
        "name": "arm64-windows-msvc", "hidden": true,
-        "architecture": { "value": "arm64",       "strategy": "external" },
-        "toolset":      { "value": "host=x86_64", "strategy": "external" },
+        "architecture": { "value": "arm64",    "strategy": "external" },
+        "toolset":      { "value": "host=x64", "strategy": "external" },
        "cacheVariables": {
            "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake"
        }
@@ -41,28 +42,44 @@

    {
        "name": "arm64-windows-llvm", "hidden": true,
-        "architecture": { "value": "arm64",       "strategy": "external" },
-        "toolset":      { "value": "host=x86_64", "strategy": "external" },
+        "architecture": { "value": "arm64",    "strategy": "external" },
+        "toolset":      { "value": "host=x64", "strategy": "external" },
        "cacheVariables": {
            "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-llvm.cmake"
        }
    },

-    { "name": "arm64-windows-llvm-debug"  , "inherits": [ "base", "arm64-windows-llvm",  "debug"   ] },
-    { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm",  "reldbg" ] },
-    { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm",  "reldbg", "static" ] },
+    {
+        "name": "arm64-apple-clang", "hidden": true,
+        "architecture": { "value": "arm64",    "strategy": "external" },
+        "toolset":      { "value": "host=x64", "strategy": "external" },
+        "cacheVariables": {
+            "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-apple-clang.cmake"
+        }
+    },

-    { "name": "arm64-windows-msvc-debug"  , "inherits": [ "base", "arm64-windows-msvc",  "debug"   ] },
+    { "name": "arm64-windows-llvm-debug", "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
+    { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] },
+    { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg", "static" ] },
+
+    { "name": "arm64-apple-clang-debug", "inherits": [ "base", "arm64-apple-clang", "debug" ] },
+    { "name": "arm64-apple-clang-release", "inherits": [ "base", "arm64-apple-clang", "reldbg" ] },
+    { "name": "arm64-apple-clang+static-release", "inherits": [ "base", "arm64-apple-clang",  "reldbg", "static" ] },
+
+    { "name": "arm64-windows-msvc-debug", "inherits": [ "base", "arm64-windows-msvc",  "debug"   ] },
    { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc",  "reldbg" ] },
    { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc",  "reldbg", "static" ] },

-    { "name": "x64-windows-msvc-debug"  , "inherits": [ "base", "debug"   ] },
+    { "name": "x64-windows-msvc-debug", "inherits": [ "base", "debug" ] },
    { "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] },
    { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },

-    { "name": "x64-windows-sycl-debug"  , "inherits": [ "sycl-base", "debug"   ] },
+    { "name": "x64-windows-sycl-debug", "inherits": [ "sycl-base", "debug" ] },
    { "name": "x64-windows-sycl-debug-f16", "inherits": [ "sycl-base", "debug", "sycl_f16" ] },
    { "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] },
-    { "name": "x64-windows-sycl-release-f16", "inherits": [ "sycl-base", "release", "sycl_f16" ] }
+    { "name": "x64-windows-sycl-release-f16", "inherits": [ "sycl-base", "release", "sycl_f16" ] },
+
+    { "name": "x64-windows-vulkan-debug", "inherits": [ "base", "vulkan", "debug" ] },
+    { "name": "x64-windows-vulkan-release", "inherits": [ "base", "vulkan", "release" ] }
  ]
 }
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,24 +1,23 @@
 # Pull requests (for contributors)

 - Test your changes:
-  - Using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the GGML library
+  - Using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the `ggml` library
  - Execute [the full CI locally on your machine](ci/README.md) before publishing
- Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs.
-  - The PR template has a series of review complexity checkboxes `[ ]` that [you can mark as](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) `[X]` for your convenience
- Consider allowing write access to your branch for faster review
+- Optionally rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs
+- Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
 - If your PR becomes stale, don't hesitate to ping the maintainers in the comments

 # Pull requests (for collaborators)

 - Squash-merge PRs
 - Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
- Optionally, pick a `<module>` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules
+- Optionally pick a `<module>` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules

 # Coding guidelines

 - Avoid adding third-party dependencies, extra files, extra headers, etc.
 - Always consider cross-compatibility with other operating systems and architectures
- Avoid fancy looking modern STL constructs, use basic `for` loops, avoid templates, keep it simple
+- Avoid fancy-looking modern STL constructs, use basic `for` loops, avoid templates, keep it simple
 - There are no strict rules for the code style, but try to follow the patterns in the code (indentation, spaces, etc.). Vertical alignment makes things more readable and easier to batch edit
 - Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
 - Naming usually optimizes for common prefix (see https://github.com/ggerganov/ggml/pull/302#discussion_r1243240963)
@@ -27,3 +26,8 @@

 ![matmul](media/matmul.png)

+# Resources
+
+The Github issues, PRs and discussions contain a lot of information that can be useful to get familiar with the codebase. For convenience, some of the more important information is referenced from Github projects:
+
+https://github.com/ggerganov/llama.cpp/projects
--- a/667
+++ b/667
@@ -1,11 +1,9 @@
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \
 	libllava.a \
-	llama-baby-llama \
 	llama-batched \
 	llama-batched-bench \
 	llama-bench \
-	llama-benchmark-matmult \
 	llama-cli \
 	llama-convert-llama2c-to-ggml \
 	llama-embedding \
@@ -35,25 +33,27 @@ BUILD_TARGETS = \
 	llama-save-load-state \
 	llama-server \
 	llama-simple \
+	llama-simple-chat \
 	llama-speculative \
 	llama-tokenize \
 	llama-vdot \
 	llama-cvector-generator \
+	llama-gen-docs \
 	tests/test-c.o

 # Binaries only useful for tests
 TEST_TARGETS = \
+	tests/test-arg-parser \
 	tests/test-autorelease \
 	tests/test-backend-ops \
 	tests/test-chat-template \
 	tests/test-double-float \
-	tests/test-grad0 \
 	tests/test-grammar-integration \
 	tests/test-grammar-parser \
 	tests/test-json-schema-to-grammar \
 	tests/test-llama-grammar \
+	tests/test-log \
 	tests/test-model-load-cancel \
-	tests/test-opt \
 	tests/test-quantize-fns \
 	tests/test-quantize-perf \
 	tests/test-rope \
@@ -61,11 +61,12 @@ TEST_TARGETS = \
 	tests/test-tokenizer-0 \
 	tests/test-tokenizer-1-bpe \
 	tests/test-tokenizer-1-spm
+#	tests/test-opt \

 # Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned
 LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot convert-llama2c-to-ggml \
 	simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
-	retrieval speculative infill tokenize benchmark-matmult parallel export-lora lookahead lookup passkey gritlm
+	retrieval speculative infill tokenize parallel export-lora lookahead lookup passkey gritlm

 # Legacy build targets that were renamed in #7809, but we want to build binaries that for them that output a deprecation warning if people try to use them.
 #  We don't want to clutter things too much, so we only build replacements for the most commonly used binaries.
@@ -91,11 +92,6 @@ GGML_METAL := 1
 DEPRECATE_WARNING := 1
 endif

-ifdef LLAMA_OPENMP
-GGML_OPENMP := 1
-DEPRECATE_WARNING := 1
-endif
-
 ifdef LLAMA_RPC
 GGML_RPC := 1
 DEPRECATE_WARNING := 1
@@ -146,6 +142,14 @@ GGML_NO_METAL := 1
 DEPRECATE_WARNING := 1
 endif

+ifdef LLAMA_DISABLE_LOGS
+REMOVE_WARNING := 1
+endif
+
+ifdef LLAMA_SERVER_VERBOSE
+REMOVE_WARNING := 1
+endif
+
 ifndef UNAME_S
 UNAME_S := $(shell uname -s)
 endif
@@ -349,18 +353,14 @@ ifdef LLAMA_SANITIZE_UNDEFINED
 	MK_LDFLAGS  += -fsanitize=undefined -g
 endif

-ifdef LLAMA_SERVER_VERBOSE
-	MK_CPPFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
-endif
-
 ifdef LLAMA_SERVER_SSL
 	MK_CPPFLAGS += -DCPPHTTPLIB_OPENSSL_SUPPORT
 	MK_LDFLAGS += -lssl -lcrypto
 endif

-ifdef LLAMA_DISABLE_LOGS
-	MK_CPPFLAGS += -DLOG_DISABLE_LOGS
-endif # LLAMA_DISABLE_LOGS
+ifndef GGML_NO_CPU_AARCH64
+	MK_CPPFLAGS += -DGGML_USE_CPU_AARCH64
+endif

 # warnings
 WARN_FLAGS = \
@@ -432,7 +432,7 @@ endif
 # TODO: probably these flags need to be tweaked on some architectures
 #       feel free to update the Makefile for your architecture and send a pull request or issue

-ifndef RISCV
+ifndef RISCV_CROSS_COMPILE

 ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
 	# Use all CPU extensions that are available:
@@ -512,7 +512,12 @@ ifneq ($(filter loongarch64%,$(UNAME_M)),)
 	MK_CXXFLAGS += -mlasx
 endif

-else
+ifneq ($(filter riscv64%,$(UNAME_M)),)
+	MK_CFLAGS   += -march=rv64gcv -mabi=lp64d
+	MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
+endif
+
+else # RISC-V CROSS COMPILATION
 	MK_CFLAGS   += -march=rv64gcv -mabi=lp64d
 	MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
 endif
@@ -521,65 +526,59 @@ ifndef GGML_NO_ACCELERATE
 	# Mac OS - include Accelerate framework.
 	# `-framework Accelerate` works both with Apple Silicon and Mac Intel
 	ifeq ($(UNAME_S),Darwin)
-		MK_CPPFLAGS += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS
-		MK_CPPFLAGS += -DACCELERATE_NEW_LAPACK
-		MK_CPPFLAGS += -DACCELERATE_LAPACK_ILP64
-		MK_LDFLAGS  += -framework Accelerate
-		OBJ_GGML    += ggml/src/ggml-blas.o
+		MK_CPPFLAGS  += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS -DGGML_BLAS_USE_ACCELERATE
+		MK_CPPFLAGS  += -DACCELERATE_NEW_LAPACK
+		MK_CPPFLAGS  += -DACCELERATE_LAPACK_ILP64
+		MK_LDFLAGS   += -framework Accelerate
+		OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o
 	endif
 endif # GGML_NO_ACCELERATE

-ifdef GGML_MUSA
-	CC := clang
-	CXX := clang++
-	GGML_CUDA := 1
-	MK_CPPFLAGS += -DGGML_USE_MUSA
-endif
-
 ifndef GGML_NO_OPENMP
 	MK_CPPFLAGS += -DGGML_USE_OPENMP
 	MK_CFLAGS   += -fopenmp
 	MK_CXXFLAGS += -fopenmp
-	ifdef GGML_MUSA
-		MK_CPPFLAGS += -I/usr/lib/llvm-10/include/openmp
-		MK_LDFLAGS  += -L/usr/lib/llvm-10/lib
-	endif # GGML_MUSA
 endif # GGML_NO_OPENMP

 ifdef GGML_OPENBLAS
-	MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas)
-	MK_CFLAGS   += $(shell pkg-config --cflags-only-other openblas)
-	MK_LDFLAGS  += $(shell pkg-config --libs openblas)
-	OBJ_GGML    += ggml/src/ggml-blas.o
+	MK_CPPFLAGS  += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas)
+	MK_CFLAGS    += $(shell pkg-config --cflags-only-other openblas)
+	MK_LDFLAGS   += $(shell pkg-config --libs openblas)
+	OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o
 endif # GGML_OPENBLAS

 ifdef GGML_OPENBLAS64
-	MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas64)
-	MK_CFLAGS   += $(shell pkg-config --cflags-only-other openblas64)
-	MK_LDFLAGS  += $(shell pkg-config --libs openblas64)
-	OBJ_GGML    += ggml/src/ggml-blas.o
+	MK_CPPFLAGS  += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas64)
+	MK_CFLAGS    += $(shell pkg-config --cflags-only-other openblas64)
+	MK_LDFLAGS   += $(shell pkg-config --libs openblas64)
+	OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o
 endif # GGML_OPENBLAS64

 ifdef GGML_BLIS
-	MK_CPPFLAGS += -DGGML_USE_BLAS -DGGML_BLAS_USE_BLIS -I/usr/local/include/blis -I/usr/include/blis
-	MK_LDFLAGS  += -lblis -L/usr/local/lib
-	OBJ_GGML    += ggml/src/ggml-blas.o
+	MK_CPPFLAGS  += -DGGML_USE_BLAS -DGGML_BLAS_USE_BLIS -I/usr/local/include/blis -I/usr/include/blis
+	MK_LDFLAGS   += -lblis -L/usr/local/lib
+	OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o
 endif # GGML_BLIS

 ifdef GGML_NVPL
-	MK_CPPFLAGS += -DGGML_USE_BLAS -DGGML_BLAS_USE_NVPL -DNVPL_ILP64 -I/usr/local/include/nvpl_blas -I/usr/include/nvpl_blas
-	MK_LDFLAGS  += -L/usr/local/lib -lnvpl_blas_core -lnvpl_blas_ilp64_gomp
-	OBJ_GGML    += ggml/src/ggml-blas.o
+	MK_CPPFLAGS  += -DGGML_USE_BLAS -DGGML_BLAS_USE_NVPL -DNVPL_ILP64 -I/usr/local/include/nvpl_blas -I/usr/include/nvpl_blas
+	MK_LDFLAGS   += -L/usr/local/lib -lnvpl_blas_core -lnvpl_blas_ilp64_gomp
+	OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o
 endif # GGML_NVPL

 ifndef GGML_NO_LLAMAFILE
-	MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
-	OBJ_GGML    += ggml/src/llamafile/sgemm.o
+	MK_CPPFLAGS  += -DGGML_USE_LLAMAFILE
+	OBJ_GGML_EXT += ggml/src/ggml-cpu/llamafile/sgemm.o
+endif
+
+ifndef GGML_NO_AMX
+	MK_CPPFLAGS += -DGGML_USE_AMX
+	OBJ_GGML_EXT += ggml/src/ggml-amx/ggml-amx.o ggml/src/ggml-amx/mmq.o
 endif

 ifdef GGML_RPC
-	MK_CPPFLAGS += -DGGML_USE_RPC
-	OBJ_GGML    += ggml/src/ggml-rpc.o
+	MK_CPPFLAGS  += -DGGML_USE_RPC
+	OBJ_GGML_EXT += ggml/src/ggml-rpc.o
 endif # GGML_RPC

 OBJ_CUDA_TMPL      = $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-wmma*.cu))
@@ -594,41 +593,27 @@ else
 endif # GGML_CUDA_FA_ALL_QUANTS

 ifdef GGML_CUDA
-	ifdef GGML_MUSA
-		ifneq ('', '$(wildcard /opt/musa)')
-			CUDA_PATH ?= /opt/musa
-		else
-			CUDA_PATH ?= /usr/local/musa
-		endif
-
-		MK_CPPFLAGS  += -DGGML_USE_CUDA -I$(CUDA_PATH)/include
-		MK_LDFLAGS   += -lmusa -lmublas -lmusart -lpthread -ldl -lrt -L$(CUDA_PATH)/lib -L/usr/lib64
-		MK_NVCCFLAGS += -x musa -mtgpu --cuda-gpu-arch=mp_22
+	ifneq ('', '$(wildcard /opt/cuda)')
+		CUDA_PATH ?= /opt/cuda
 	else
-		ifneq ('', '$(wildcard /opt/cuda)')
-			CUDA_PATH ?= /opt/cuda
-		else
-			CUDA_PATH ?= /usr/local/cuda
-		endif
+		CUDA_PATH ?= /usr/local/cuda
+	endif

-		MK_CPPFLAGS  += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
-		MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
-		MK_NVCCFLAGS += -use_fast_math
-	endif # GGML_MUSA
+	MK_CPPFLAGS  += -DGGML_USE_CUDA -DGGML_CUDA_USE_GRAPHS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
+	MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
+	MK_NVCCFLAGS += -use_fast_math

-	OBJ_GGML += ggml/src/ggml-cuda.o
-	OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
-	OBJ_GGML += $(OBJ_CUDA_TMPL)
+	OBJ_GGML_EXT += ggml/src/ggml-cuda/ggml-cuda.o
+	OBJ_GGML_EXT += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
+	OBJ_GGML_EXT += $(OBJ_CUDA_TMPL)

 ifdef LLAMA_FATAL_WARNINGS
 	MK_NVCCFLAGS += -Werror all-warnings
 endif # LLAMA_FATAL_WARNINGS

-ifndef GGML_MUSA
 ifndef JETSON_EOL_MODULE_DETECT
 	MK_NVCCFLAGS += --forward-unknown-to-host-compiler
 endif # JETSON_EOL_MODULE_DETECT
-endif # GGML_MUSA

 ifdef LLAMA_DEBUG
 	MK_NVCCFLAGS += -lineinfo
@@ -641,11 +626,7 @@ endif # GGML_CUDA_DEBUG
 ifdef GGML_CUDA_NVCC
 	NVCC = $(CCACHE) $(GGML_CUDA_NVCC)
 else
-	ifdef GGML_MUSA
-		NVCC = $(CCACHE) mcc
-	else
-		NVCC = $(CCACHE) nvcc
-	endif # GGML_MUSA
+	NVCC = $(CCACHE) nvcc
 endif # GGML_CUDA_NVCC

 ifdef CUDA_DOCKER_ARCH
@@ -654,10 +635,6 @@ else ifndef CUDA_POWER_ARCH
 	MK_NVCCFLAGS += -arch=native
 endif # CUDA_DOCKER_ARCH

-ifdef GGML_CUDA_FORCE_DMMV
-	MK_NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
-endif # GGML_CUDA_FORCE_DMMV
-
 ifdef GGML_CUDA_FORCE_MMQ
 	MK_NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ
 endif # GGML_CUDA_FORCE_MMQ
@@ -666,20 +643,6 @@ ifdef GGML_CUDA_FORCE_CUBLAS
 	MK_NVCCFLAGS += -DGGML_CUDA_FORCE_CUBLAS
 endif # GGML_CUDA_FORCE_CUBLAS

-ifdef GGML_CUDA_DMMV_X
-	MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(GGML_CUDA_DMMV_X)
-else
-	MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
-endif # GGML_CUDA_DMMV_X
-
-ifdef GGML_CUDA_MMV_Y
-	MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_MMV_Y)
-else ifdef GGML_CUDA_DMMV_Y
-	MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_DMMV_Y) # for backwards compatibility
-else
-	MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
-endif # GGML_CUDA_MMV_Y
-
 ifdef GGML_CUDA_F16
 	MK_NVCCFLAGS += -DGGML_CUDA_F16
 endif # GGML_CUDA_F16
@@ -688,12 +651,6 @@ ifdef GGML_CUDA_DMMV_F16
 	MK_NVCCFLAGS += -DGGML_CUDA_F16
 endif # GGML_CUDA_DMMV_F16

-ifdef GGML_CUDA_KQUANTS_ITER
-	MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(GGML_CUDA_KQUANTS_ITER)
-else
-	MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
-endif
-
 ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE
 	MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(GGML_CUDA_PEER_MAX_BATCH_SIZE)
 else
@@ -717,15 +674,9 @@ define NVCC_COMPILE
 	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
 endef # NVCC_COMPILE
 else
-	ifdef GGML_MUSA
-define NVCC_COMPILE
-	$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -c $< -o $@
-endef # NVCC_COMPILE
-	else
 define NVCC_COMPILE
 	$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
 endef # NVCC_COMPILE
-	endif # GGML_MUSA
 endif # JETSON_EOL_MODULE_DETECT

 ggml/src/ggml-cuda/%.o: \
@@ -735,8 +686,8 @@ ggml/src/ggml-cuda/%.o: \
 	ggml/src/ggml-cuda/common.cuh
 	$(NVCC_COMPILE)

-ggml/src/ggml-cuda.o: \
-	ggml/src/ggml-cuda.cu \
+ggml/src/ggml-cuda/ggml-cuda.o: \
+	ggml/src/ggml-cuda/ggml-cuda.cu \
 	ggml/include/ggml-cuda.h \
 	ggml/include/ggml.h \
 	ggml/include/ggml-backend.h \
@@ -747,9 +698,9 @@ ggml/src/ggml-cuda.o: \
 endif # GGML_CUDA

 ifdef GGML_VULKAN
-	MK_CPPFLAGS += -DGGML_USE_VULKAN
-	MK_LDFLAGS  += $(shell pkg-config --libs vulkan)
-	OBJ_GGML    += ggml/src/ggml-vulkan.o ggml/src/ggml-vulkan-shaders.o
+	MK_CPPFLAGS  += -DGGML_USE_VULKAN
+	MK_LDFLAGS   += $(shell pkg-config --libs vulkan)
+	OBJ_GGML_EXT += ggml/src/ggml-vulkan.o ggml/src/ggml-vulkan-shaders.o

 ifdef GGML_VULKAN_CHECK_RESULTS
 	MK_CPPFLAGS  += -DGGML_VULKAN_CHECK_RESULTS
@@ -808,11 +759,7 @@ ifdef GGML_HIPBLAS
 		AMDGPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
 	endif

-	GGML_CUDA_DMMV_X       ?= 32
-	GGML_CUDA_MMV_Y        ?= 1
-	GGML_CUDA_KQUANTS_ITER ?= 2
-
-	MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA
+	MK_CPPFLAGS += -DGGML_USE_HIP -DGGML_USE_CUDA

 ifdef GGML_HIP_UMA
 	MK_CPPFLAGS += -DGGML_HIP_UMA
@@ -825,13 +772,6 @@ endif # GGML_HIP_UMA
 	HIPCC ?= $(CCACHE) $(ROCM_PATH)/bin/hipcc

 	HIPFLAGS += $(addprefix --offload-arch=,$(AMDGPU_TARGETS))
-	HIPFLAGS += -DGGML_CUDA_DMMV_X=$(GGML_CUDA_DMMV_X)
-	HIPFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_MMV_Y)
-	HIPFLAGS += -DK_QUANTS_PER_ITERATION=$(GGML_CUDA_KQUANTS_ITER)
-
-ifdef GGML_CUDA_FORCE_DMMV
-	HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
-endif # GGML_CUDA_FORCE_DMMV

 ifdef GGML_CUDA_FORCE_MMQ
 	HIPFLAGS += -DGGML_CUDA_FORCE_MMQ
@@ -845,12 +785,12 @@ ifdef GGML_CUDA_NO_PEER_COPY
 	HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY
 endif # GGML_CUDA_NO_PEER_COPY

-	OBJ_GGML += ggml/src/ggml-cuda.o
-	OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
-	OBJ_GGML += $(OBJ_CUDA_TMPL)
+	OBJ_GGML_EXT += ggml/src/ggml-cuda/ggml-cuda.o
+	OBJ_GGML_EXT += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
+	OBJ_GGML_EXT += $(OBJ_CUDA_TMPL)

-ggml/src/ggml-cuda.o: \
-	ggml/src/ggml-cuda.cu \
+ggml/src/ggml-cuda/ggml-cuda.o: \
+	ggml/src/ggml-cuda/ggml-cuda.cu \
 	ggml/include/ggml-cuda.h \
 	ggml/include/ggml.h \
 	ggml/include/ggml-backend.h \
@@ -867,69 +807,167 @@ ggml/src/ggml-cuda/%.o: \
 	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
 endif # GGML_HIPBLAS

+ifdef GGML_MUSA
+	ifeq ($(wildcard /opt/musa),)
+		MUSA_PATH ?= /usr/local/musa
+	else
+		MUSA_PATH ?= /opt/musa
+	endif
+	MTGPU_TARGETS ?= mp_21 mp_22
+
+	MK_CPPFLAGS += -DGGML_USE_MUSA -DGGML_USE_CUDA
+	MK_LDFLAGS += -L$(MUSA_PATH)/lib -Wl,-rpath=$(MUSA_PATH)/lib
+	MK_LDFLAGS += -lmusa -lmusart -lmublas
+
+	ifndef GGML_NO_OPENMP
+		# For Ubuntu Focal
+		MK_CPPFLAGS += -I/usr/lib/llvm-10/include/openmp
+		MK_LDFLAGS  += -L/usr/lib/llvm-10/lib
+		# For Ubuntu Jammy
+		MK_CPPFLAGS += -I/usr/lib/llvm-14/lib/clang/14.0.0/include
+		MK_LDFLAGS  += -L/usr/lib/llvm-14/lib
+	endif # GGML_NO_OPENMP
+
+	CC  := $(MUSA_PATH)/bin/clang
+	CXX := $(MUSA_PATH)/bin/clang++
+	MCC := $(CCACHE) $(MUSA_PATH)/bin/mcc
+
+	MUSAFLAGS += $(addprefix --cuda-gpu-arch=, $(MTGPU_TARGETS))
+
+ifdef GGML_CUDA_FORCE_MMQ
+	MUSAFLAGS += -DGGML_CUDA_FORCE_MMQ
+endif # GGML_CUDA_FORCE_MMQ
+
+ifdef GGML_CUDA_FORCE_CUBLAS
+	MUSAFLAGS += -DGGML_CUDA_FORCE_CUBLAS
+endif # GGML_CUDA_FORCE_CUBLAS
+
+ifdef GGML_CUDA_F16
+	MUSAFLAGS += -DGGML_CUDA_F16
+endif # GGML_CUDA_F16
+
+ifdef GGML_CUDA_DMMV_F16
+	MUSAFLAGS += -DGGML_CUDA_F16
+endif # GGML_CUDA_DMMV_F16
+
+ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE
+	MUSAFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(GGML_CUDA_PEER_MAX_BATCH_SIZE)
+else
+	MUSAFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
+endif # GGML_CUDA_PEER_MAX_BATCH_SIZE
+
+ifdef GGML_CUDA_NO_PEER_COPY
+	MUSAFLAGS += -DGGML_CUDA_NO_PEER_COPY
+endif # GGML_CUDA_NO_PEER_COPY
+
+ifdef GGML_CUDA_FA_ALL_QUANTS
+	MUSAFLAGS += -DGGML_CUDA_FA_ALL_QUANTS
+endif # GGML_CUDA_FA_ALL_QUANTS
+
+	OBJ_GGML_EXT += ggml/src/ggml-cuda/ggml-cuda.o
+	OBJ_GGML_EXT += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
+	OBJ_GGML_EXT += $(OBJ_CUDA_TMPL)
+
+ggml/src/ggml-cuda/ggml-cuda.o: \
+	ggml/src/ggml-cuda/ggml-cuda.cu \
+	ggml/include/ggml-cuda.h \
+	ggml/include/ggml.h \
+	ggml/include/ggml-backend.h \
+	ggml/src/ggml-backend-impl.h \
+	ggml/src/ggml-common.h \
+	$(wildcard ggml/src/ggml-cuda/*.cuh)
+	$(MCC) $(CXXFLAGS) $(MUSAFLAGS) -x musa -mtgpu -c -o $@ $<
+
+ggml/src/ggml-cuda/%.o: \
+	ggml/src/ggml-cuda/%.cu \
+	ggml/include/ggml.h \
+	ggml/src/ggml-common.h \
+	ggml/src/ggml-cuda/common.cuh
+	$(MCC) $(CXXFLAGS) $(MUSAFLAGS) -x musa -mtgpu -c -o $@ $<
+endif # GGML_MUSA
+
 ifdef GGML_METAL
-	MK_CPPFLAGS += -DGGML_USE_METAL
-	MK_LDFLAGS  += -framework Foundation -framework Metal -framework MetalKit
-	OBJ_GGML	+= ggml/src/ggml-metal.o
+	MK_CPPFLAGS  += -DGGML_USE_METAL
+	MK_LDFLAGS   += -framework Foundation -framework Metal -framework MetalKit
+	OBJ_GGML_EXT += ggml/src/ggml-metal/ggml-metal.o
+
+ifdef GGML_METAL_USE_BF16
+	MK_CPPFLAGS += -DGGML_METAL_USE_BF16
+endif # GGML_METAL_USE_BF16
 ifdef GGML_METAL_NDEBUG
 	MK_CPPFLAGS += -DGGML_METAL_NDEBUG
 endif
 ifdef GGML_METAL_EMBED_LIBRARY
-	MK_CPPFLAGS += -DGGML_METAL_EMBED_LIBRARY
-	OBJ_GGML   += ggml/src/ggml-metal-embed.o
+	MK_CPPFLAGS  += -DGGML_METAL_EMBED_LIBRARY
+	OBJ_GGML_EXT += ggml/src/ggml-metal-embed.o
 endif
 endif # GGML_METAL

 ifdef GGML_METAL
-ggml/src/ggml-metal.o: \
-	ggml/src/ggml-metal.m \
+ggml/src/ggml-metal/ggml-metal.o: \
+	ggml/src/ggml-metal/ggml-metal.m \
+	ggml/src/ggml-metal/ggml-metal-impl.h \
 	ggml/include/ggml-metal.h \
 	ggml/include/ggml.h
 	$(CC) $(CFLAGS) -c $< -o $@

 ifdef GGML_METAL_EMBED_LIBRARY
 ggml/src/ggml-metal-embed.o: \
-	ggml/src/ggml-metal.metal \
+	ggml/src/ggml-metal/ggml-metal.metal \
+	ggml/src/ggml-metal/ggml-metal-impl.h \
 	ggml/src/ggml-common.h
 	@echo "Embedding Metal library"
-	@sed -e '/#include "ggml-common.h"/r ggml/src/ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml/src/ggml-metal.metal > ggml/src/ggml-metal-embed.metal
+	@sed -e '/__embed_ggml-common.h__/r      ggml/src/ggml-common.h'                -e '/__embed_ggml-common.h__/d'      < ggml/src/ggml-metal/ggml-metal.metal           > ggml/src/ggml-metal/ggml-metal-embed.metal.tmp
+	@sed -e '/#include "ggml-metal-impl.h"/r ggml/src/ggml-metal/ggml-metal-impl.h' -e '/#include "ggml-metal-impl.h"/d' < ggml/src/ggml-metal/ggml-metal-embed.metal.tmp > ggml/src/ggml-metal/ggml-metal-embed.metal
 	$(eval TEMP_ASSEMBLY=$(shell mktemp -d))
-	@echo ".section __DATA, __ggml_metallib"            >  $(TEMP_ASSEMBLY)/ggml-metal-embed.s
-	@echo ".globl _ggml_metallib_start"                 >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
-	@echo "_ggml_metallib_start:"                       >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
-	@echo ".incbin \"ggml/src/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
-	@echo ".globl _ggml_metallib_end"                   >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
-	@echo "_ggml_metallib_end:"                         >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
+	@echo ".section __DATA, __ggml_metallib"                       >  $(TEMP_ASSEMBLY)/ggml-metal-embed.s
+	@echo ".globl _ggml_metallib_start"                            >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
+	@echo "_ggml_metallib_start:"                                  >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
+	@echo ".incbin \"ggml/src/ggml-metal/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
+	@echo ".globl _ggml_metallib_end"                              >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
+	@echo "_ggml_metallib_end:"                                    >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
 	$(CC) $(CFLAGS) -c $(TEMP_ASSEMBLY)/ggml-metal-embed.s -o $@
 	@rm -f ${TEMP_ASSEMBLY}/ggml-metal-embed.s
 	@rmdir ${TEMP_ASSEMBLY}
 endif
 endif # GGML_METAL

-OBJ_GGML += \
-	ggml/src/ggml.o \
-	ggml/src/ggml-alloc.o \
-	ggml/src/ggml-backend.o \
-	ggml/src/ggml-quants.o \
-	ggml/src/ggml-aarch64.o
+DIR_GGML = ggml
+DIR_LLAMA = src
+DIR_COMMON = common
+
+OBJ_GGML = \
+	$(DIR_GGML)/src/ggml.o \
+	$(DIR_GGML)/src/ggml-aarch64.o \
+	$(DIR_GGML)/src/ggml-alloc.o \
+	$(DIR_GGML)/src/ggml-backend.o \
+	$(DIR_GGML)/src/ggml-backend-reg.o \
+	$(DIR_GGML)/src/ggml-opt.o \
+	$(DIR_GGML)/src/ggml-quants.o \
+	$(DIR_GGML)/src/ggml-threading.o \
+	$(DIR_GGML)/src/ggml-cpu/ggml-cpu.o \
+	$(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o \
+	$(DIR_GGML)/src/ggml-cpu/ggml-cpu-aarch64.o \
+	$(DIR_GGML)/src/ggml-cpu/ggml-cpu-quants.o \
+	$(OBJ_GGML_EXT)

 OBJ_LLAMA = \
-	src/llama.o \
-	src/llama-vocab.o \
-	src/llama-grammar.o \
-	src/llama-sampling.o \
-	src/unicode.o \
-	src/unicode-data.o
+	$(DIR_LLAMA)/llama.o \
+	$(DIR_LLAMA)/llama-vocab.o \
+	$(DIR_LLAMA)/llama-grammar.o \
+	$(DIR_LLAMA)/llama-sampling.o \
+	$(DIR_LLAMA)/unicode.o \
+	$(DIR_LLAMA)/unicode-data.o

 OBJ_COMMON = \
-	common/common.o \
-	common/console.o \
-	common/ngram-cache.o \
-	common/sampling.o \
-	common/train.o \
-	common/grammar-parser.o \
-	common/build-info.o \
-	common/json-schema-to-grammar.o
+	$(DIR_COMMON)/common.o \
+	$(DIR_COMMON)/arg.o \
+	$(DIR_COMMON)/log.o \
+	$(DIR_COMMON)/console.o \
+	$(DIR_COMMON)/ngram-cache.o \
+	$(DIR_COMMON)/sampling.o \
+	$(DIR_COMMON)/build-info.o \
+	$(DIR_COMMON)/json-schema-to-grammar.o

 OBJ_ALL = $(OBJ_GGML) $(OBJ_LLAMA) $(OBJ_COMMON)

@@ -985,7 +1023,6 @@ $(info I CXX:       $(shell $(CXX)  --version | head -n 1))
 ifdef GGML_CUDA
 $(info I NVCC:      $(shell $(NVCC) --version | tail -n 1))
 CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
-ifndef GGML_MUSA
 ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)

 ifndef CUDA_DOCKER_ARCH
@@ -995,7 +1032,6 @@ endif # CUDA_POWER_ARCH
 endif # CUDA_DOCKER_ARCH

 endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
-endif # GGML_MUSA
 endif # GGML_CUDA
 $(info )

@@ -1020,207 +1056,90 @@ $(info   - LLAMA_NO_CCACHE)
 $(info )
 endif

+ifdef REMOVE_WARNING
+$(info !!! REMOVAL WARNING !!!)
+$(info The following LLAMA_ options have been removed and are no longer supported)
+$(info   - LLAMA_DISABLE_LOGS   (https://github.com/ggerganov/llama.cpp/pull/9418))
+$(info   - LLAMA_SERVER_VERBOSE (https://github.com/ggerganov/llama.cpp/pull/9418))
+$(info )
+endif
+
 #
 # Build libraries
 #

-# ggml
+# Libraries
+LIB_GGML   = libggml.so
+LIB_GGML_S = libggml.a

-ggml/src/ggml.o: \
-	ggml/src/ggml.c \
-	ggml/include/ggml.h
-	$(CC)  $(CFLAGS)   -c $< -o $@
+LIB_LLAMA   = libllama.so
+LIB_LLAMA_S = libllama.a

-ggml/src/ggml-alloc.o: \
-	ggml/src/ggml-alloc.c \
-	ggml/include/ggml.h \
-	ggml/include/ggml-alloc.h
-	$(CC)  $(CFLAGS)   -c $< -o $@
+LIB_COMMON   = libcommon.so
+LIB_COMMON_S = libcommon.a

-ggml/src/ggml-backend.o: \
-	ggml/src/ggml-backend.c \
-	ggml/include/ggml.h \
-	ggml/include/ggml-backend.h
-	$(CC)  $(CFLAGS)   -c $< -o $@
+# Targets
+BUILD_TARGETS += $(LIB_GGML) $(LIB_GGML_S) $(LIB_LLAMA) $(LIB_LLAMA_S) $(LIB_COMMON) $(LIB_COMMON_S)

-ggml/src/ggml-quants.o: \
-	ggml/src/ggml-quants.c \
-	ggml/include/ggml.h \
-	ggml/src/ggml-quants.h \
-	ggml/src/ggml-common.h
-	$(CC) $(CFLAGS)    -c $< -o $@
+# Dependency files
+DEP_FILES = $(OBJ_GGML:.o=.d) $(OBJ_LLAMA:.o=.d) $(OBJ_COMMON:.o=.d)

-ggml/src/ggml-aarch64.o: \
-	ggml/src/ggml-aarch64.c \
-	ggml/include/ggml.h \
-	ggml/src/ggml-aarch64.h \
-	ggml/src/ggml-common.h
-	$(CC) $(CFLAGS)    -c $< -o $@
+# Default target
+all: $(BUILD_TARGETS)

-ggml/src/ggml-blas.o: \
-	ggml/src/ggml-blas.cpp \
-	ggml/include/ggml-blas.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-ifndef GGML_NO_LLAMAFILE
-ggml/src/llamafile/sgemm.o: \
-	ggml/src/llamafile/sgemm.cpp \
-	ggml/src/llamafile/sgemm.h \
-	ggml/include/ggml.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-endif # GGML_NO_LLAMAFILE
-
-ifdef GGML_RPC
-ggml/src/ggml-rpc.o: \
-	ggml/src/ggml-rpc.cpp \
-	ggml/include/ggml-rpc.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-endif # GGML_RPC
-
-$(LIB_GGML): \
-	$(OBJ_GGML)
-	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
-
-$(LIB_GGML_S): \
-	$(OBJ_GGML)
-	ar rcs $(LIB_GGML_S) $^
-
-# llama
-
-src/unicode.o: \
-	src/unicode.cpp \
-	src/unicode.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-src/unicode-data.o: \
-	src/unicode-data.cpp \
-	src/unicode-data.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-src/llama.o: \
-	src/llama.cpp \
-	src/llama-impl.h \
-	src/llama-vocab.h \
-	src/llama-grammar.h \
-	src/llama-sampling.h \
-	src/unicode.h \
-	include/llama.h \
-	ggml/include/ggml-cuda.h \
-	ggml/include/ggml-metal.h \
+# Note: need this exception because `ggml-cpu.c` and `ggml-cpu.cpp` both produce the same obj/dep files
+#       g++ -M -I ./ggml/include/ -I ./ggml/src ggml/src/ggml-cpu/ggml-cpu.cpp | grep ggml
+$(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o: \
+	ggml/src/ggml-cpu/ggml-cpu.cpp \
+	ggml/include/ggml-backend.h \
 	ggml/include/ggml.h \
 	ggml/include/ggml-alloc.h \
-	ggml/include/ggml-backend.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
+	ggml/src/ggml-backend-impl.h \
+	ggml/include/ggml-cpu.h \
+	ggml/src/ggml-impl.h
+	$(CXX) $(CXXFLAGS)   -c $< -o $@

-src/llama-vocab.o: \
-	src/llama-vocab.cpp \
-	src/llama-vocab.h \
-	src/llama-impl.h \
-	include/llama.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
+# Rules for building object files
+$(DIR_GGML)/%.o: $(DIR_GGML)/%.c
+	$(CC) $(CFLAGS) -MMD -c $< -o $@

-src/llama-grammar.o: \
-	src/llama-grammar.cpp \
-	src/llama-grammar.h \
-	src/llama-impl.h \
-	src/llama-vocab.h \
-	src/llama-sampling.h \
-	include/llama.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
+$(DIR_GGML)/%.o: $(DIR_GGML)/%.cpp
+	$(CXX) $(CXXFLAGS) -MMD -c $< -o $@

-src/llama-sampling.o: \
-	src/llama-sampling.cpp \
-	src/llama-sampling.h \
-	src/llama-impl.h \
-	include/llama.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
+$(DIR_LLAMA)/%.o: $(DIR_LLAMA)/%.cpp
+	$(CXX) $(CXXFLAGS) -MMD -c $< -o $@

-$(LIB_LLAMA): \
-	$(OBJ_LLAMA) \
-	$(LIB_GGML)
+$(DIR_COMMON)/%.o: $(DIR_COMMON)/%.cpp
+	$(CXX) $(CXXFLAGS) -MMD -c $< -o $@
+
+# Rules for building libraries
+$(LIB_GGML): $(OBJ_GGML)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

-$(LIB_LLAMA_S): \
-	$(OBJ_LLAMA)
+$(LIB_GGML_S): $(OBJ_GGML)
+	ar rcs $(LIB_GGML_S) $^
+
+$(LIB_LLAMA): $(OBJ_LLAMA) $(LIB_GGML)
+	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
+
+$(LIB_LLAMA_S): $(OBJ_LLAMA)
 	ar rcs $(LIB_LLAMA_S) $^

-# common
-
-common/common.o: \
-	common/common.cpp \
-	common/common.h \
-	common/console.h \
-	common/sampling.h \
-	common/json.hpp \
-	common/json-schema-to-grammar.h \
-	include/llama.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-common/sampling.o: \
-	common/sampling.cpp \
-	common/sampling.h \
-	include/llama.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-common/console.o: \
-	common/console.cpp \
-	common/console.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-common/grammar-parser.o: \
-	common/grammar-parser.cpp \
-	common/grammar-parser.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-common/json-schema-to-grammar.o: \
-	common/json-schema-to-grammar.cpp \
-	common/json-schema-to-grammar.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-common/train.o: \
-	common/train.cpp \
-	common/train.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-common/ngram-cache.o: \
-	common/ngram-cache.cpp \
-	common/ngram-cache.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-$(LIB_COMMON): \
-	$(OBJ_COMMON) \
-	$(LIB_LLAMA) \
-	$(LIB_GGML)
+$(LIB_COMMON): $(OBJ_COMMON) $(LIB_LLAMA) $(LIB_GGML)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

-$(LIB_COMMON_S): \
-	$(OBJ_COMMON)
+$(LIB_COMMON_S): $(OBJ_COMMON)
 	ar rcs $(LIB_COMMON_S) $^

+# Include dependency files
+-include $(DEP_FILES)
+
+# Clean rule
 clean:
-	rm -vrf *.dot $(BUILD_TARGETS) $(TEST_TARGETS)
-	rm -rvf src/*.o
-	rm -rvf tests/*.o
-	rm -rvf examples/*.o
-	rm -rvf common/*.o
-	rm -rvf *.a
-	rm -rvf *.dll
-	rm -rvf *.so
-	rm -rvf *.dot
-	rm -rvf ggml/*.a
-	rm -rvf ggml/*.dll
-	rm -rvf ggml/*.so
-	rm -vrf ggml/src/*.o
-	rm -rvf ggml/src/llamafile/*.o
-	rm -rvf common/build-info.cpp
-	rm -vrf ggml/src/ggml-metal-embed.metal
-	rm -vrf ggml/src/ggml-cuda/*.o
-	rm -vrf ggml/src/ggml-cuda/template-instances/*.o
-	rm -rvf $(BUILD_TARGETS)
-	rm -rvf $(TEST_TARGETS)
-	rm -f vulkan-shaders-gen ggml/src/ggml-vulkan-shaders.hpp ggml/src/ggml-vulkan-shaders.cpp
-	rm -rvf $(LEGACY_TARGETS_CLEAN)
-	find examples pocs -type f -name "*.o" -delete
+	rm -vrf $(BUILD_TARGETS) $(TEST_TARGETS)
+	rm -rvf *.a *.dll *.so *.dot
+	find ggml src common tests examples pocs -type f -name "*.o" -delete
+	find ggml src common tests examples pocs -type f -name "*.d" -delete

 #
 # Examples
@@ -1251,6 +1170,11 @@ llama-simple: examples/simple/simple.cpp \
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+llama-simple-chat: examples/simple-chat/simple-chat.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 llama-tokenize: examples/tokenize/tokenize.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
@@ -1339,16 +1263,11 @@ llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp \
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

 llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp \
-	$(OBJ_GGML) $(OBJ_LLAMA)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-bench: examples/llama-bench/llama-bench.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-llama-baby-llama: examples/baby-llama/baby-llama.cpp \
+llama-bench: examples/llama-bench/llama-bench.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -1418,21 +1337,13 @@ llama-server: \
 	examples/server/server.cpp \
 	examples/server/utils.hpp \
 	examples/server/httplib.h \
-	examples/server/colorthemes.css.hpp \
-	examples/server/style.css.hpp \
-	examples/server/theme-beeninorder.css.hpp \
-	examples/server/theme-ketivah.css.hpp \
-	examples/server/theme-mangotango.css.hpp \
-	examples/server/theme-playground.css.hpp \
-	examples/server/theme-polarnight.css.hpp \
-	examples/server/theme-snowstorm.css.hpp \
 	examples/server/index.html.hpp \
-	examples/server/index-new.html.hpp \
-	examples/server/index.js.hpp \
 	examples/server/completion.js.hpp \
-	examples/server/system-prompts.js.hpp \
-	examples/server/prompt-formats.js.hpp \
-	examples/server/json-schema-to-grammar.mjs.hpp \
+	examples/server/loading.html.hpp \
+	examples/server/deps_daisyui.min.css.hpp \
+	examples/server/deps_markdown-it.js.hpp \
+	examples/server/deps_tailwindcss.js.hpp \
+	examples/server/deps_vue.esm-browser.js.hpp \
 	common/json.hpp \
 	common/stb_image.h \
 	$(OBJ_ALL)
@@ -1448,6 +1359,11 @@ examples/server/%.hpp: examples/server/public/% Makefile
 		echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
 	) > $@

+llama-gen-docs: examples/gen-docs/gen-docs.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 libllava.a: examples/llava/llava.cpp \
 	examples/llava/llava.h \
 	examples/llava/clip.cpp \
@@ -1495,21 +1411,21 @@ common/build-info.o: common/build-info.cpp

 tests: $(TEST_TARGETS)

-llama-benchmark-matmult: examples/benchmark/benchmark-matmult.cpp \
-	$(OBJ_GGML) common/build-info.o
+tests/test-arg-parser: tests/test-arg-parser.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-run-benchmark-matmult: llama-benchmark-matmult
-	./$@
-
-.PHONY: run-benchmark-matmult swift
-
 tests/test-llama-grammar: tests/test-llama-grammar.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+tests/test-log: tests/test-log.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 tests/test-grammar-parser: tests/test-grammar-parser.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
@@ -1529,11 +1445,6 @@ tests/test-json-schema-to-grammar: tests/test-json-schema-to-grammar.cpp \
 	$(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-tests/test-grad0: tests/test-grad0.cpp \
-	$(OBJ_GGML)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
 tests/test-opt: tests/test-opt.cpp \
 	$(OBJ_GGML)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
--- a/Package.swift
+++ b/Package.swift
@@ -10,10 +10,16 @@ var sources = [
    "src/unicode.cpp",
    "src/unicode-data.cpp",
    "ggml/src/ggml.c",
-    "ggml/src/ggml-alloc.c",
-    "ggml/src/ggml-backend.c",
-    "ggml/src/ggml-quants.c",
    "ggml/src/ggml-aarch64.c",
+    "ggml/src/ggml-alloc.c",
+    "ggml/src/ggml-backend.cpp",
+    "ggml/src/ggml-backend-reg.cpp",
+    "ggml/src/ggml-cpu/ggml-cpu.c",
+    "ggml/src/ggml-cpu/ggml-cpu.cpp",
+    "ggml/src/ggml-cpu/ggml-cpu-aarch64.c",
+    "ggml/src/ggml-cpu/ggml-cpu-quants.c",
+    "ggml/src/ggml-threading.cpp",
+    "ggml/src/ggml-quants.c",
 ]

 var resources: [Resource] = []
@@ -21,6 +27,7 @@ var linkerSettings: [LinkerSetting] = []
 var cSettings: [CSetting] =  [
    .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
    .unsafeFlags(["-fno-objc-arc"]),
+    .headerSearchPath("ggml/src"),
    // NOTE: NEW_LAPACK will required iOS version 16.4+
    // We should consider add this in the future when we drop support for iOS 14
    // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
@@ -29,8 +36,9 @@ var cSettings: [CSetting] =  [
 ]

 #if canImport(Darwin)
-sources.append("ggml/src/ggml-metal.m")
-resources.append(.process("ggml/src/ggml-metal.metal"))
+sources.append("ggml/src/ggml-common.h")
+sources.append("ggml/src/ggml-metal/ggml-metal.m")
+resources.append(.process("ggml/src/ggml-metal/ggml-metal.metal"))
 linkerSettings.append(.linkedFramework("Accelerate"))
 cSettings.append(
    contentsOf: [
@@ -60,13 +68,15 @@ let package = Package(
            name: "llama",
            path: ".",
            exclude: [
+               "build",
               "cmake",
               "examples",
               "scripts",
               "models",
               "tests",
               "CMakeLists.txt",
-               "Makefile"
+               "Makefile",
+               "ggml/src/ggml-metal-embed.metal"
            ],
            sources: sources,
            resources: resources,
--- a/README.md
+++ b/README.md
@@ -10,32 +10,16 @@

 Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++

-> [!IMPORTANT]
-[2024 Jun 12] Binaries have been renamed w/ a `llama-` prefix. `main` is now `llama-cli`, `server` is `llama-server`, etc (https://github.com/ggerganov/llama.cpp/pull/7809)
-
 ## Recent API changes

- [2024 Jun 26] The source code and CMake build scripts have been restructured https://github.com/ggerganov/llama.cpp/pull/8006
- [2024 Apr 21] `llama_token_to_piece` can now optionally render special tokens https://github.com/ggerganov/llama.cpp/pull/6807
- [2024 Apr 4] State and session file functions reorganized under `llama_state_*` https://github.com/ggerganov/llama.cpp/pull/6341
- [2024 Mar 26] Logits and embeddings API updated for compactness https://github.com/ggerganov/llama.cpp/pull/6122
- [2024 Mar 13] Add `llama_synchronize()` + `llama_context_params.n_ubatch` https://github.com/ggerganov/llama.cpp/pull/6017
- [2024 Mar 8] `llama_kv_cache_seq_rm()` returns a `bool` instead of `void`, and new `llama_n_seq_max()` returns the upper limit of acceptable `seq_id` in batches (relevant when dealing with multiple sequences) https://github.com/ggerganov/llama.cpp/pull/5328
- [2024 Mar 4] Embeddings API updated https://github.com/ggerganov/llama.cpp/pull/5796
- [2024 Mar 3] `struct llama_context_params` https://github.com/ggerganov/llama.cpp/pull/5849
+- [Changelog for `libllama` API](https://github.com/ggerganov/llama.cpp/issues/9289)
+- [Changelog for `llama-server` REST API](https://github.com/ggerganov/llama.cpp/issues/9291)

 ## Hot topics

- **`convert.py` has been deprecated and moved to `examples/convert_legacy_llama.py`, please use `convert_hf_to_gguf.py`** https://github.com/ggerganov/llama.cpp/pull/7430
- Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021
- BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920
- MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387
- Model sharding instructions using `gguf-split` https://github.com/ggerganov/llama.cpp/discussions/6404
- Fix major bug in Metal batched inference https://github.com/ggerganov/llama.cpp/pull/6225
- Multi-GPU pipeline parallelism support https://github.com/ggerganov/llama.cpp/pull/6017
- Looking for contributions to add Deepseek support: https://github.com/ggerganov/llama.cpp/issues/5981
- Quantization blind testing: https://github.com/ggerganov/llama.cpp/discussions/5962
- Initial Mamba support has been added: https://github.com/ggerganov/llama.cpp/pull/5328
+- **Introducing GGUF-my-LoRA** https://github.com/ggerganov/llama.cpp/discussions/10123
+- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669
+- Hugging Face GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)

 ----

@@ -46,9 +30,9 @@ variety of hardware - locally and in the cloud.

 - Plain C/C++ implementation without any dependencies
 - Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
- AVX, AVX2 and AVX512 support for x86 architectures
+- AVX, AVX2, AVX512 and AMX support for x86 architectures
 - 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP)
+- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads MTT GPUs via MUSA)
 - Vulkan and SYCL backend support
 - CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity

@@ -95,6 +79,7 @@ Typically finetunes of the base models below are supported as well.
 - [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
 - [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
 - [x] [OLMo](https://allenai.org/olmo)
+- [x] [OLMoE](https://huggingface.co/allenai/OLMoE-1B-7B-0924)
 - [x] [Granite models](https://huggingface.co/collections/ibm-granite/granite-code-models-6624c5cec322e4c148c8b330)
 - [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia)
 - [x] [Snowflake-Arctic MoE](https://huggingface.co/collections/Snowflake/arctic-66290090abe542894a5ac520)
@@ -107,6 +92,9 @@ Typically finetunes of the base models below are supported as well.
 - [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)
 - [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct)
 - [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a)
+- [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat)
+- [x] [Bielik-11B-v2.3](https://huggingface.co/collections/speakleash/bielik-11b-v23-66ee813238d9b526a072408a)
+- [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM)

 (instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))

@@ -128,6 +116,7 @@ Typically finetunes of the base models below are supported as well.
 - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
 - Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
 - JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp)
+- JS/TS (Programmable Prompt Engine CLI): [offline-ai/cli](https://github.com/offline-ai/cli)
 - JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm)
 - Typescript/Wasm (nicer API, available on npm): [ngxson/wllama](https://github.com/ngxson/wllama)
 - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
@@ -135,14 +124,18 @@ Typically finetunes of the base models below are supported as well.
 - Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
 - Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs)
 - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
+- C#/VB.NET (more features - community license): [LM-Kit.NET](https://docs.lm-kit.com/lm-kit-net/index.html)
 - Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
 - Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)
 - React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn)
 - Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)
 - Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
 - Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
+- Flutter: [xuegao-tzx/Fllama](https://github.com/xuegao-tzx/Fllama)
 - PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326)
 - Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp)
+- Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift)
+- Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama)

 **UI:**

@@ -181,19 +174,25 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
 - [AIKit](https://github.com/sozercan/aikit) (MIT)
 - [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL)
+- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)
+- [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL)
+- [PocketPal AI - An iOS and Android App](https://github.com/a-ghorbani/pocketpal-ai) (MIT)

 *(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*

 **Tools:**

 - [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML
+- [akx/ollama-dl](https://github.com/akx/ollama-dl) – download models from the Ollama library to be used directly with llama.cpp
 - [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
 - [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage
+- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with prebuild Mobile and Web platform wrappers and a model example)

 **Infrastructure:**

 - [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
 - [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs
+- [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly

 **Games:**
 - [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you.
@@ -422,7 +421,7 @@ Please refer to [Build llama.cpp locally](./docs/build.md)
 | [BLAS](./docs/build.md#blas-build) | All |
 | [BLIS](./docs/backend/BLIS.md) | All |
 | [SYCL](./docs/backend/SYCL.md) | Intel and Nvidia GPU |
-| [MUSA](./docs/build.md#musa) | Moore Threads GPU |
+| [MUSA](./docs/build.md#musa) | Moore Threads MTT GPU |
 | [CUDA](./docs/build.md#cuda) | Nvidia GPU |
 | [hipBLAS](./docs/build.md#hipblas) | AMD GPU |
 | [Vulkan](./docs/build.md#vulkan) | GPU |
@@ -454,20 +453,20 @@ To learn more how to measure perplexity using llama.cpp, [read this documentatio
 - Contributors can open PRs
 - Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch
 - Collaborators will be invited based on contributions
- Any help with managing issues and PRs is very appreciated!
+- Any help with managing issues, PRs and projects is very appreciated!
 - See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
 - Read the [CONTRIBUTING.md](CONTRIBUTING.md) for more information
 - Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205)
 - A bit of backstory for those who are interested: [Changelog podcast](https://changelog.com/podcast/532)

-## Other documentations
+## Other documentation

 - [main (cli)](./examples/main/README.md)
 - [server](./examples/server/README.md)
 - [jeopardy](./examples/jeopardy/README.md)
 - [GBNF grammars](./grammars/README.md)

-**Development documentations**
+**Development documentation**

 - [How to build](./docs/build.md)
 - [Running on Docker](./docs/docker.md)
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -1,4 +1,4 @@
-#/bin/bash
+#!/bin/bash
 #
 # sample usage:
 #
@@ -39,7 +39,7 @@ SRC=`pwd`
 CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON"

 if [ ! -z ${GG_BUILD_METAL} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON -DGGML_METAL_USE_BF16=ON"
 fi

 if [ ! -z ${GG_BUILD_CUDA} ]; then
@@ -53,7 +53,7 @@ if [ ! -z ${GG_BUILD_SYCL} ]; then
        exit 1
    fi

-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
 fi

 if [ ! -z ${GG_BUILD_VULKAN} ]; then
@@ -326,36 +326,36 @@ function gg_run_open_llama_7b_v2 {
    ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
    ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k

-    (time ./bin/llama-cli --model ${model_f16}  -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/llama-cli --model ${model_f16}  -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

-    (time ./bin/llama-perplexity --model ${model_f16}  -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/llama-perplexity --model ${model_f16}  -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

-    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
+    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

-    (time ./bin/llama-save-load-state     -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state     -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 10 -c 0     ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 99 -c 0     ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log

    function check_ppl {
        qnt="$1"
@@ -460,34 +460,34 @@ function gg_run_pythia_1_4b {
    ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
    ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k

-    (time ./bin/llama-cli --model ${model_f16}  -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-cli --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/llama-cli --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/llama-cli --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/llama-cli --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/llama-cli --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/llama-cli --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/llama-cli --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/llama-cli --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/llama-cli --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/llama-cli --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/llama-cli --model ${model_f16}  -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-cli --model ${model_q8_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-cli --model ${model_q4_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/llama-cli --model ${model_q4_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/llama-cli --model ${model_q5_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/llama-cli --model ${model_q5_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/llama-cli --model ${model_q2_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/llama-cli --model ${model_q3_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/llama-cli --model ${model_q4_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/llama-cli --model ${model_q5_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/llama-cli --model ${model_q6_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

-    (time ./bin/llama-perplexity --model ${model_f16}  -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/llama-perplexity --model ${model_f16}  -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

-    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
+    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

-    (time ./bin/llama-save-load-state     --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0     ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log

    function check_ppl {
        qnt="$1"
@@ -591,36 +591,36 @@ function gg_run_pythia_2_8b {
    ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
    ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k

-    (time ./bin/llama-cli --model ${model_f16}  -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/llama-cli --model ${model_f16}  -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

-    (time ./bin/llama-perplexity --model ${model_f16}  -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/llama-perplexity --model ${model_f16}  -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

-    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
+    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

-    (time ./bin/llama-save-load-state     -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state     -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0     ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0     ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log

    function check_ppl {
        qnt="$1"
@@ -706,12 +706,88 @@ function gg_run_embd_bge_small {

    ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0

-    (time ./bin/llama-embedding --model ${model_f16}  -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-embedding --model ${model_f16}  -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log

    set +e
 }

+function gg_sum_embd_bge_small {
+    gg_printf '### %s\n\n' "${ci}"
+
+    gg_printf 'BGE Small (BERT):\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
+    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
+}
+
+# rerank_tiny
+
+function gg_run_rerank_tiny {
+    cd ${SRC}
+
+    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/config.json
+    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/tokenizer.json
+    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/tokenizer_config.json
+    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/special_tokens_map.json
+    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/resolve/main/pytorch_model.bin
+    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/sentence_bert_config.json
+    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/vocab.txt
+    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/modules.json
+    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/config.json
+
+    gg_wget models-mnt/rerank-tiny/1_Pooling https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/1_Pooling/config.json
+
+    path_models="../models-mnt/rerank-tiny"
+
+    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
+
+    set -e
+
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log
+
+    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
+
+    model_f16="${path_models}/ggml-model-f16.gguf"
+
+    # for this model, the SEP token is "</s>"
+    (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?</s></s>hi\nwhat is panda?</s></s>it's a bear\nwhat is panda?</s></s>The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
+
+    # sample output
+    # rerank score 0:    0.029
+    # rerank score 1:    0.029
+    # rerank score 2:    0.135
+
+    # check that the score is in the range [$3, $4]
+    function check_score {
+        qnt="$1"
+        score=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
+
+        if [ $(echo "$score < $3" | bc) -eq 1 ] || [ $(echo "$score > $4" | bc) -eq 1 ]; then
+            printf '  - %s @ %s (FAIL: score not in range [%s, %s])\n' "$qnt" "$score" "$3" "$4"
+            return 20
+        fi
+
+        printf '  - %s @ %s OK\n' "$qnt" "$score"
+        return 0
+    }
+
+    check_score "rerank score 0" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 0")" "0.00" "0.05" | tee -a $OUT/${ci}-rk-f16.log
+    check_score "rerank score 1" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 1")" "0.00" "0.05" | tee -a $OUT/${ci}-rk-f16.log
+    check_score "rerank score 2" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 2")" "0.10" "0.30" | tee -a $OUT/${ci}-rk-f16.log
+
+    set +e
+}
+
+function gg_sum_rerank_tiny {
+    gg_printf '### %s\n\n' "${ci}"
+
+    gg_printf 'Rerank Tiny (Jina):\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-rk-f16.log)"
+}
+
 function gg_check_build_requirements {
    if ! command -v cmake &> /dev/null; then
        gg_printf 'cmake not found, please install'
@@ -726,17 +802,11 @@ function gg_check_build_requirements {
    fi
 }

-function gg_sum_embd_bge_small {
-    gg_printf '### %s\n\n' "${ci}"
-
-    gg_printf 'BGE Small (BERT):\n'
-    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
-    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
-    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
-}
-
 ## main

+export LLAMA_LOG_PREFIX=1
+export LLAMA_LOG_TIMESTAMPS=1
+
 if [ -z ${GG_BUILD_LOW_PERF} ]; then
    # Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt
    rm -rf ${SRC}/models-mnt
@@ -759,6 +829,7 @@ test $ret -eq 0 && gg_run ctest_release

 if [ -z ${GG_BUILD_LOW_PERF} ]; then
    test $ret -eq 0 && gg_run embd_bge_small
+    test $ret -eq 0 && gg_run rerank_tiny

    if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then
        test $ret -eq 0 && gg_run test_scripts_debug
--- a/cmake/arm64-apple-clang.cmake
+++ b/cmake/arm64-apple-clang.cmake
@@ -0,0 +1,16 @@
+set( CMAKE_SYSTEM_NAME Darwin )
+set( CMAKE_SYSTEM_PROCESSOR arm64 )
+
+set( target arm64-apple-darwin-macho )
+
+set( CMAKE_C_COMPILER    clang )
+set( CMAKE_CXX_COMPILER  clang++ )
+
+set( CMAKE_C_COMPILER_TARGET   ${target} )
+set( CMAKE_CXX_COMPILER_TARGET ${target} )
+
+set( arch_c_flags "-march=armv8.4-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
+set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function" )
+
+set( CMAKE_C_FLAGS_INIT   "${arch_c_flags} ${warn_c_flags}" )
+set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
--- a/cmake/llama-config.cmake.in
+++ b/cmake/llama-config.cmake.in
@@ -6,7 +6,7 @@ set(LLAMA_SHARED_LIB   @BUILD_SHARED_LIBS@)
 set(GGML_BLAS       @GGML_BLAS@)
 set(GGML_CUDA       @GGML_CUDA@)
 set(GGML_METAL      @GGML_METAL@)
-set(GGML_HIPBLAS    @GGML_HIPBLAS@)
+set(GGML_HIP        @GGML_HIP@)
 set(GGML_ACCELERATE @GGML_ACCELERATE@)
 set(GGML_VULKAN @GGML_VULKAN@)
 set(GGML_VULKAN_CHECK_RESULTS @GGML_VULKAN_CHECK_RESULTS@)
@@ -66,11 +66,15 @@ endif()

 find_library(ggml_LIBRARY ggml
    REQUIRED
-    HINTS ${LLAMA_LIB_DIR})
+    HINTS ${LLAMA_LIB_DIR}
+    NO_CMAKE_FIND_ROOT_PATH
+)

 find_library(llama_LIBRARY llama
    REQUIRED
-    HINTS ${LLAMA_LIB_DIR})
+    HINTS ${LLAMA_LIB_DIR}
+    NO_CMAKE_FIND_ROOT_PATH
+)

 set(_llama_link_deps "${ggml_LIBRARY}" "@GGML_LINK_LIBRARIES@")
 set(_llama_transient_defines "@GGML_TRANSIENT_DEFINES@")
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -51,21 +51,21 @@ endif()
 set(TARGET common)

 add_library(${TARGET} STATIC
+    arg.cpp
+    arg.h
    base64.hpp
-    common.h
    common.cpp
-    sampling.h
-    sampling.cpp
-    console.h
+    common.h
    console.cpp
-    grammar-parser.h
-    grammar-parser.cpp
-    json.hpp
+    console.h
    json-schema-to-grammar.cpp
-    train.h
-    train.cpp
-    ngram-cache.h
+    json.hpp
+    log.cpp
+    log.h
    ngram-cache.cpp
+    ngram-cache.h
+    sampling.cpp
+    sampling.h
    )

 if (BUILD_SHARED_LIBS)
--- a/common/arg.cpp
+++ b/common/arg.cpp
--- a/common/arg.h
+++ b/common/arg.h
@@ -0,0 +1,77 @@
+#pragma once
+
+#include "common.h"
+
+#include <set>
+#include <string>
+#include <vector>
+
+//
+// CLI argument parsing
+//
+
+struct common_arg {
+    std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
+    std::vector<const char *> args;
+    const char * value_hint   = nullptr; // help text or example for arg value
+    const char * value_hint_2 = nullptr; // for second arg value
+    const char * env          = nullptr;
+    std::string help;
+    bool is_sparam = false; // is current arg a sampling param?
+    void (*handler_void)   (common_params & params) = nullptr;
+    void (*handler_string) (common_params & params, const std::string &) = nullptr;
+    void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr;
+    void (*handler_int)    (common_params & params, int) = nullptr;
+
+    common_arg(
+        const std::initializer_list<const char *> & args,
+        const char * value_hint,
+        const std::string & help,
+        void (*handler)(common_params & params, const std::string &)
+    ) : args(args), value_hint(value_hint), help(help), handler_string(handler) {}
+
+    common_arg(
+        const std::initializer_list<const char *> & args,
+        const char * value_hint,
+        const std::string & help,
+        void (*handler)(common_params & params, int)
+    ) : args(args), value_hint(value_hint), help(help), handler_int(handler) {}
+
+    common_arg(
+        const std::initializer_list<const char *> & args,
+        const std::string & help,
+        void (*handler)(common_params & params)
+    ) : args(args), help(help), handler_void(handler) {}
+
+    // support 2 values for arg
+    common_arg(
+        const std::initializer_list<const char *> & args,
+        const char * value_hint,
+        const char * value_hint_2,
+        const std::string & help,
+        void (*handler)(common_params & params, const std::string &, const std::string &)
+    ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
+
+    common_arg & set_examples(std::initializer_list<enum llama_example> examples);
+    common_arg & set_env(const char * env);
+    common_arg & set_sparam();
+    bool in_example(enum llama_example ex);
+    bool get_value_from_env(std::string & output);
+    bool has_value_from_env();
+    std::string to_string();
+};
+
+struct common_params_context {
+    enum llama_example ex = LLAMA_EXAMPLE_COMMON;
+    common_params & params;
+    std::vector<common_arg> options;
+    void(*print_usage)(int, char **) = nullptr;
+    common_params_context(common_params & params) : params(params) {}
+};
+
+// parse input arguments from CLI
+// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
+bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
+
+// function to be used by test-arg-parser
+common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
--- a/common/common.cpp
+++ b/common/common.cpp
--- a/common/common.h
+++ b/common/common.h
@@ -4,18 +4,9 @@

 #include "llama.h"

-#include "sampling.h"
-
-#define LOG_NO_FILE_LINE_FUNCTION
-#include "log.h"
-
-#include <cmath>
 #include <string>
 #include <vector>
-#include <random>
-#include <thread>
-#include <unordered_map>
-#include <tuple>
+#include <sstream>

 #ifdef _WIN32
 #define DIRECTORY_SEPARATOR '\\'
@@ -33,12 +24,12 @@

 #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"

-struct llama_lora_adapter_info {
+struct common_lora_adapter_info {
    std::string path;
    float scale;
 };

-struct llama_lora_adapter_container : llama_lora_adapter_info {
+struct common_lora_adapter_container : common_lora_adapter_info {
    struct llama_lora_adapter * adapter;
 };

@@ -48,34 +39,123 @@ extern char const * LLAMA_COMMIT;
 extern char const * LLAMA_COMPILER;
 extern char const * LLAMA_BUILD_TARGET;

-struct llama_control_vector_load_info;
+struct common_control_vector_load_info;

 //
 // CPU utils
 //

+struct cpu_params {
+    int      n_threads                   = -1;
+    bool     cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
+    bool     mask_valid                  = false;   // Default: any CPU
+    enum ggml_sched_priority  priority   = GGML_SCHED_PRIO_NORMAL;  // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
+    bool     strict_cpu                  = false;   // Use strict CPU placement
+    uint32_t poll                        = 50;      // Polling (busywait) level (0 - no polling, 100 - mostly polling)
+};
+
 int32_t cpu_get_num_physical_cores();
 int32_t cpu_get_num_math();

 //
-// CLI argument parsing
+// Common params
 //

+enum llama_example {
+    LLAMA_EXAMPLE_COMMON,
+    LLAMA_EXAMPLE_SPECULATIVE,
+    LLAMA_EXAMPLE_MAIN,
+    LLAMA_EXAMPLE_INFILL,
+    LLAMA_EXAMPLE_EMBEDDING,
+    LLAMA_EXAMPLE_PERPLEXITY,
+    LLAMA_EXAMPLE_RETRIEVAL,
+    LLAMA_EXAMPLE_PASSKEY,
+    LLAMA_EXAMPLE_IMATRIX,
+    LLAMA_EXAMPLE_BENCH,
+    LLAMA_EXAMPLE_SERVER,
+    LLAMA_EXAMPLE_CVECTOR_GENERATOR,
+    LLAMA_EXAMPLE_EXPORT_LORA,
+    LLAMA_EXAMPLE_LLAVA,
+    LLAMA_EXAMPLE_LOOKUP,
+    LLAMA_EXAMPLE_PARALLEL,
+
+    LLAMA_EXAMPLE_COUNT,
+};
+
+enum common_sampler_type {
+    COMMON_SAMPLER_TYPE_NONE        = 0,
+    COMMON_SAMPLER_TYPE_DRY         = 1,
+    COMMON_SAMPLER_TYPE_TOP_K       = 2,
+    COMMON_SAMPLER_TYPE_TOP_P       = 3,
+    COMMON_SAMPLER_TYPE_MIN_P       = 4,
+  //COMMON_SAMPLER_TYPE_TFS_Z       = 5,
+    COMMON_SAMPLER_TYPE_TYPICAL_P   = 6,
+    COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
+    COMMON_SAMPLER_TYPE_XTC         = 8,
+    COMMON_SAMPLER_TYPE_INFILL      = 9,
+};
+
 // dimensionality reduction methods, used by cvector-generator
 enum dimre_method {
    DIMRE_METHOD_PCA,
    DIMRE_METHOD_MEAN,
 };

-struct gpt_params {
-    uint32_t seed                 = LLAMA_DEFAULT_SEED; // RNG seed
+// sampler parameters
+struct common_sampler_params {
+    uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler

-    int32_t n_threads             = cpu_get_num_math();
-    int32_t n_threads_draft       =    -1;
-    int32_t n_threads_batch       =    -1; // number of threads to use for batch processing (-1 = use n_threads)
-    int32_t n_threads_batch_draft =    -1;
+    int32_t n_prev             = 64;    // number of previous tokens to remember
+    int32_t n_probs            = 0;     // if greater than 0, output the probabilities of top n_probs tokens.
+    int32_t min_keep           = 0;     // 0 = disabled, otherwise samplers should return at least min_keep tokens
+    int32_t top_k              = 40;    // <= 0 to use vocab size
+    float   top_p              = 0.95f; // 1.0 = disabled
+    float   min_p              = 0.05f; // 0.0 = disabled
+    float   xtc_probability    = 0.00f; // 0.0 = disabled
+    float   xtc_threshold      = 0.10f; // > 0.5 disables XTC
+    float   typ_p              = 1.00f; // typical_p, 1.0 = disabled
+    float   temp               = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
+    float   dynatemp_range     = 0.00f; // 0.0 = disabled
+    float   dynatemp_exponent  = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
+    int32_t penalty_last_n     = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
+    float   penalty_repeat     = 1.00f; // 1.0 = disabled
+    float   penalty_freq       = 0.00f; // 0.0 = disabled
+    float   penalty_present    = 0.00f; // 0.0 = disabled
+    float   dry_multiplier     = 0.0f;  // 0.0 = disabled;      DRY repetition penalty for tokens extending repetition:
+    float   dry_base           = 1.75f; // 0.0 = disabled;      multiplier * base ^ (length of sequence before token - allowed length)
+    int32_t dry_allowed_length = 2;     // tokens extending repetitions beyond this receive penalty
+    int32_t dry_penalty_last_n = -1;    // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
+    int32_t mirostat           = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    float   mirostat_tau       = 5.00f; // target entropy
+    float   mirostat_eta       = 0.10f; // learning rate
+    bool    penalize_nl        = false; // consider newlines as a repeatable token
+    bool    ignore_eos         = false;
+    bool    no_perf            = false; // disable performance metrics
+
+    std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"};     // default sequence breakers for DRY
+
+
+    std::vector<enum common_sampler_type> samplers = {
+        COMMON_SAMPLER_TYPE_DRY,
+        COMMON_SAMPLER_TYPE_TOP_K,
+        COMMON_SAMPLER_TYPE_TYPICAL_P,
+        COMMON_SAMPLER_TYPE_TOP_P,
+        COMMON_SAMPLER_TYPE_MIN_P,
+        COMMON_SAMPLER_TYPE_XTC,
+        COMMON_SAMPLER_TYPE_TEMPERATURE,
+    };
+
+    std::string grammar; // optional BNF-like grammar to constrain sampling
+
+    std::vector<llama_logit_bias> logit_bias; // logit biases to apply
+
+    // print the parameters into a string
+    std::string print() const;
+};
+
+struct common_params {
    int32_t n_predict             =    -1; // new tokens to predict
-    int32_t n_ctx                 =     0; // context size
+    int32_t n_ctx                 =  4096; // context size
    int32_t n_batch               =  2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_ubatch              =   512; // physical batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_keep                =     0; // number of tokens to keep from initial prompt
@@ -98,7 +178,12 @@ struct gpt_params {
    float   yarn_beta_fast        = 32.0f; // YaRN low correction dim
    float   yarn_beta_slow        =  1.0f; // YaRN high correction dim
    int32_t yarn_orig_ctx         =     0; // YaRN original context length
-    float   defrag_thold          = -1.0f; // KV cache defragmentation threshold
+    float   defrag_thold          =  0.1f; // KV cache defragmentation threshold
+
+    struct cpu_params cpuparams;
+    struct cpu_params cpuparams_batch;
+    struct cpu_params draft_cpuparams;
+    struct cpu_params draft_cpuparams_batch;

    ggml_backend_sched_eval_callback cb_eval = nullptr;
    void * cb_eval_user_data                 = nullptr;
@@ -110,35 +195,33 @@ struct gpt_params {
    enum llama_pooling_type      pooling_type      = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
    enum llama_attention_type    attention_type    = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings

-    // // sampling parameters
-    struct llama_sampling_params sparams;
+    struct common_sampler_params sparams;

-    std::string model                = ""; // model path
-    std::string model_draft          = ""; // draft model for speculative decoding
-    std::string model_alias          = "unknown"; // model alias
-    std::string model_url            = ""; // model url to download
-    std::string hf_token             = ""; // HF token
-    std::string hf_repo              = ""; // HF repo
-    std::string hf_file              = ""; // HF file
-    std::string prompt               = "";
-    std::string prompt_file          = ""; // store the external prompt file name
-    std::string path_prompt_cache    = ""; // path to file for saving/loading prompt eval state
-    std::string input_prefix         = ""; // string to prefix user inputs with
-    std::string input_suffix         = ""; // string to suffix user inputs with
-    std::string logdir               = ""; // directory in which to save YAML log files
-    std::string lookup_cache_static  = ""; // path of static ngram cache file for lookup decoding
-    std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
-    std::string logits_file          = ""; // file for saving *all* logits
-    std::string rpc_servers          = ""; // comma separated list of RPC servers
+    std::string model                = ""; // model path                                                    // NOLINT
+    std::string model_draft          = ""; // draft model for speculative decoding                          // NOLINT
+    std::string model_alias          = "unknown"; // model alias                                            // NOLINT
+    std::string model_url            = ""; // model url to download                                         // NOLINT
+    std::string hf_token             = ""; // HF token                                                      // NOLINT
+    std::string hf_repo              = ""; // HF repo                                                       // NOLINT
+    std::string hf_file              = ""; // HF file                                                       // NOLINT
+    std::string prompt               = "";                                                                  // NOLINT
+    std::string prompt_file          = ""; // store the external prompt file name                           // NOLINT
+    std::string path_prompt_cache    = ""; // path to file for saving/loading prompt eval state             // NOLINT
+    std::string input_prefix         = ""; // string to prefix user inputs with                             // NOLINT
+    std::string input_suffix         = ""; // string to suffix user inputs with                             // NOLINT
+    std::string lookup_cache_static  = ""; // path of static ngram cache file for lookup decoding           // NOLINT
+    std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding          // NOLINT
+    std::string logits_file          = ""; // file for saving *all* logits                                  // NOLINT
+    std::string rpc_servers          = ""; // comma separated list of RPC servers                           // NOLINT

    std::vector<std::string> in_files;   // all input files
    std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
    std::vector<llama_model_kv_override> kv_overrides;

    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
-    std::vector<llama_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
+    std::vector<common_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale

-    std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
+    std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale

    int32_t verbosity                  = 0;
    int32_t control_vector_layer_start = -1; // layer range for control vector
@@ -173,15 +256,15 @@ struct gpt_params {
    bool simple_io         = false; // improves compatibility with subprocesses and limited consoles
    bool cont_batching     = true;  // insert new sequences for decoding on-the-fly
    bool flash_attn        = false; // flash attention
+    bool no_perf           = false; // disable performance metrics
+    bool ctx_shift         = true;  // context shift on inifinite text generation

    bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
-    bool ignore_eos        = false; // ignore generated EOS tokens
    bool logits_all        = false; // return logits for all tokens in the batch
    bool use_mmap          = true;  // use mmap for faster loads
    bool use_mlock         = false; // use mlock to keep model in memory
    bool verbose_prompt    = false; // print prompt tokens before generation
    bool display_prompt    = true;  // print prompt before generation
-    bool infill            = false; // use infill mode
    bool dump_kv_cache     = false; // dump the KV cache contents for debugging purposes
    bool no_kv_offload     = false; // disable KV offloading
    bool warmup            = true;  // warmup run
@@ -191,33 +274,37 @@ struct gpt_params {
    std::string cache_type_v = "f16"; // KV cache data type for the V

    // multimodal models (see examples/llava)
-    std::string mmproj = "";        // path to multimodal projector
+    std::string mmproj = "";        // path to multimodal projector                                         // NOLINT
    std::vector<std::string> image; // path to image file(s)

    // embedding
    bool embedding         = false; // get only sentence embedding
-    int32_t embd_normalize = 2;     // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
+    int32_t embd_normalize = 2;     // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
    std::string embd_out   = "";    // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
-    std::string embd_sep   = "\n";  // separator of embendings
+    std::string embd_sep   = "\n";  // separator of embeddings
+    bool reranking         = false; // enable reranking support on server

    // server params
    int32_t port           = 8080;         // server listens on this network port
    int32_t timeout_read   = 600;          // http read timeout in seconds
    int32_t timeout_write  = timeout_read; // http write timeout in seconds
-    int32_t n_threads_http = -1;           // number of threads to process HTTP requests
+    int32_t n_threads_http = -1;           // number of threads to process HTTP requests (TODO: support threadpool)
+    int32_t n_cache_reuse  = 0;            // min chunk size to reuse from the cache via KV shifting

    std::string hostname      = "127.0.0.1";
-    std::string public_path   = "";
-    std::string chat_template = "";
-    std::string system_prompt = "";
+    std::string public_path   = "";                                                                         // NOLINT
+    std::string chat_template = "";                                                                         // NOLINT
    bool enable_chat_template = true;

    std::vector<std::string> api_keys;

-    std::string ssl_file_key  = "";
-    std::string ssl_file_cert = "";
+    std::string ssl_file_key  = "";                                                                         // NOLINT
+    std::string ssl_file_cert = "";                                                                         // NOLINT

-    bool endpoint_slots   = true;
+    // "advanced" endpoints are disabled by default for better security
+    bool webui            = true;
+    bool endpoint_slots   = false;
+    bool endpoint_props   = false; // only control POST requests, not GET
    bool endpoint_metrics = false;

    bool log_json = false;
@@ -265,23 +352,38 @@ struct gpt_params {
    bool spm_infill = false; // suffix/prefix/middle pattern for infill

    std::string lora_outfile = "ggml-lora-merged-f16.gguf";
+
+    // batched-bench params
+    bool batched_bench_output_jsonl = false;
 };

-void gpt_params_parse_from_env(gpt_params & params);
-void gpt_params_handle_model_default(gpt_params & params);
+// call once at the start of a program if it uses libcommon
+// initializes the logging system and prints info about the build
+void common_init();

-bool gpt_params_parse_ex   (int argc, char ** argv, gpt_params & params);
-bool gpt_params_parse      (int argc, char ** argv, gpt_params & params);
-bool gpt_params_find_arg   (int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
-void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);
+std::string common_params_get_system_info(const common_params & params);

-std::string gpt_params_get_system_info(const gpt_params & params);
+bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]);
+bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
+void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
+bool set_process_priority(enum ggml_sched_priority prio);

 //
 // String utils
 //

-std::vector<std::string> string_split(std::string input, char separator);
+#ifdef __GNUC__
+#ifdef __MINGW32__
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+#else
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+#endif
+#else
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
+#endif
+
+LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
+std::string string_format(const char * fmt, ...);

 std::string string_strip(const std::string & str);
 std::string string_get_sortable_timestamp();
@@ -290,6 +392,7 @@ void string_replace_all(std::string & s, const std::string & search, const std::

 template<class T>
 static std::vector<T> string_split(const std::string & str, char delim) {
+    static_assert(!std::is_same<T, std::string>::value, "Please use the specialized version for std::string");
    std::vector<T> values;
    std::istringstream str_stream(str);
    std::string token;
@@ -302,9 +405,30 @@ static std::vector<T> string_split(const std::string & str, char delim) {
    return values;
 }

+template<>
+std::vector<std::string> string_split<std::string>(const std::string & input, char separator)
+{
+    std::vector<std::string> parts;
+    size_t begin_pos = 0;
+    size_t separator_pos = input.find(separator);
+    while (separator_pos != std::string::npos) {
+        std::string part = input.substr(begin_pos, separator_pos - begin_pos);
+        parts.emplace_back(part);
+        begin_pos = separator_pos + 1;
+        separator_pos = input.find(separator, begin_pos);
+    }
+    parts.emplace_back(input.substr(begin_pos, separator_pos - begin_pos));
+    return parts;
+}
+
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 void string_process_escapes(std::string & input);

+std::string string_from(bool value);
+std::string string_from(const std::vector<int> & values);
+std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
+std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
+
 //
 // Filesystem utils
 //
@@ -319,28 +443,29 @@ std::string fs_get_cache_file(const std::string & filename);
 // Model utils
 //

-struct llama_init_result {
+struct common_init_result {
    struct llama_model   * model   = nullptr;
    struct llama_context * context = nullptr;
-    std::vector<llama_lora_adapter_container> lora_adapters;
+    std::vector<common_lora_adapter_container> lora_adapters;
 };

-struct llama_init_result    llama_init_from_gpt_params(gpt_params & params);
+struct common_init_result     common_init_from_params(common_params & params);

-struct llama_model_params   llama_model_params_from_gpt_params  (const gpt_params & params);
-struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
+struct llama_model_params     common_model_params_to_llama  (const common_params & params);
+struct llama_context_params   common_context_params_to_llama(const common_params & params);
+struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);

-struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
-struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
+struct llama_model * common_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
+struct llama_model * common_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);

 // clear LoRA adapters from context, then apply new list of adapters
-void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters);
+void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);

 // Batch utils

-void llama_batch_clear(struct llama_batch & batch);
+void common_batch_clear(struct llama_batch & batch);

-void llama_batch_add(
+void common_batch_add(
                 struct llama_batch & batch,
                        llama_token   id,
                          llama_pos   pos,
@@ -353,13 +478,13 @@ void llama_batch_add(

 // tokenizes a string into a vector of tokens
 // should work similar to Python's `tokenizer.encode`
-std::vector<llama_token> llama_tokenize(
+std::vector<llama_token> common_tokenize(
  const struct llama_context * ctx,
           const std::string & text,
                        bool   add_special,
                        bool   parse_special = false);

-std::vector<llama_token> llama_tokenize(
+std::vector<llama_token> common_tokenize(
    const struct llama_model * model,
           const std::string & text,
                        bool   add_special,
@@ -367,7 +492,7 @@ std::vector<llama_token> llama_tokenize(

 // tokenizes a token into a piece, optionally renders special/control tokens
 // should work similar to Python's `tokenizer.id_to_piece`
-std::string llama_token_to_piece(
+std::string common_token_to_piece(
        const struct llama_context * ctx,
                       llama_token   token,
                       bool          special = true);
@@ -375,7 +500,7 @@ std::string llama_token_to_piece(
 // detokenizes a vector of tokens into a string
 // should work similar to Python's `tokenizer.decode`
 // optionally renders special/control tokens
-std::string llama_detokenize(
+std::string common_detokenize(
                         llama_context * ctx,
        const std::vector<llama_token> & tokens,
                                  bool   special = true);
@@ -385,31 +510,31 @@ std::string llama_detokenize(
 //

 // same with llama_chat_message, but uses std::string
-struct llama_chat_msg {
+struct common_chat_msg {
    std::string role;
    std::string content;
 };

 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
-bool llama_chat_verify_template(const std::string & tmpl);
+bool common_chat_verify_template(const std::string & tmpl);

 // CPP wrapper for llama_chat_apply_template
 // If the built-in template is not supported, we default to chatml
 // If the custom "tmpl" is not supported, we throw an error
-std::string llama_chat_apply_template(const struct llama_model * model,
+std::string common_chat_apply_template(const struct llama_model * model,
        const std::string & tmpl,
-        const std::vector<llama_chat_msg> & chat,
+        const std::vector<common_chat_msg> & chat,
        bool add_ass);

 // Format single message, while taking into account the position of that message in chat history
-std::string llama_chat_format_single(const struct llama_model * model,
+std::string common_chat_format_single(const struct llama_model * model,
        const std::string & tmpl,
-        const std::vector<llama_chat_msg> & past_msg,
-        const llama_chat_msg & new_msg,
+        const std::vector<common_chat_msg> & past_msg,
+        const common_chat_msg & new_msg,
        bool add_ass);

 // Returns an example of formatted chat
-std::string llama_chat_format_example(const struct llama_model * model,
+std::string common_chat_format_example(const struct llama_model * model,
        const std::string & tmpl);

 //
@@ -417,31 +542,31 @@ std::string llama_chat_format_example(const struct llama_model * model,
 //

 // Dump the KV cache view with the number of sequences per cell.
-void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
+void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);

 // Dump the KV cache view showing individual sequences in each cell (long output).
-void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
+void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);

 //
 // Embedding utils
 //

-void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
+void common_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);

-float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);
+float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);

 //
 // Control vector utils
 //

-struct llama_control_vector_data {
+struct common_control_vector_data {
    int n_embd;

    // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
    std::vector<float> data;
 };

-struct llama_control_vector_load_info {
+struct common_control_vector_load_info {
    float strength;

    std::string fname;
@@ -449,7 +574,7 @@ struct llama_control_vector_load_info {

 // Load control vectors, scale each by strength, and add them together.
 // On error, returns {-1, empty}
-llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos);
+common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos);

 //
 // Split utils
@@ -458,15 +583,3 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
 static const char * const LLM_KV_SPLIT_NO            = "split.no";
 static const char * const LLM_KV_SPLIT_COUNT         = "split.count";
 static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
-
-//
-// YAML utils
-//
-
-void yaml_dump_vector_float    (FILE * stream, const char * prop_name, const std::vector<float> & data);
-void yaml_dump_vector_int      (FILE * stream, const char * prop_name, const std::vector<int> & data);
-void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);
-
-void yaml_dump_non_result_info(
-    FILE * stream, const gpt_params & params, const llama_context * lctx,
-    const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
--- a/common/console.cpp
+++ b/common/console.cpp
@@ -94,6 +94,9 @@ namespace console {
                simple_io = true;
            }
        }
+        if (simple_io) {
+            _setmode(_fileno(stdin), _O_U8TEXT);
+        }
 #else
        // POSIX-specific console initialization
        if (!simple_io) {
--- a/common/grammar-parser.cpp
+++ b/common/grammar-parser.cpp
@@ -1,539 +0,0 @@
-#include "grammar-parser.h"
-#include <cstdint>
-#include <cwchar>
-#include <string>
-#include <utility>
-#include <stdexcept>
-#include <exception>
-
-namespace grammar_parser {
-    // NOTE: assumes valid utf8 (but checks for overrun)
-    // copied from llama.cpp
-    static std::pair<uint32_t, const char *> decode_utf8(const char * src) {
-        static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
-        uint8_t  first_byte = static_cast<uint8_t>(*src);
-        uint8_t  highbits   = first_byte >> 4;
-        int      len        = lookup[highbits];
-        uint8_t  mask       = (1 << (8 - len)) - 1;
-        uint32_t value      = first_byte & mask;
-        const char * end    = src + len; // may overrun!
-        const char * pos    = src + 1;
-        for ( ; pos < end && *pos; pos++) {
-            value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
-        }
-        return std::make_pair(value, pos);
-    }
-
-    static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
-        uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
-        auto result = state.symbol_ids.emplace(std::string(src, len), next_id);
-        return result.first->second;
-    }
-
-    static uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) {
-        uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
-        state.symbol_ids[base_name + '_' + std::to_string(next_id)] = next_id;
-        return next_id;
-    }
-
-    static void add_rule(
-            parse_state & state,
-            uint32_t      rule_id,
-            const std::vector<llama_grammar_element> & rule) {
-        if (state.rules.size() <= rule_id) {
-            state.rules.resize(rule_id + 1);
-        }
-        state.rules[rule_id] = rule;
-    }
-
-    static bool is_digit_char(char c) {
-        return '0' <= c && c <= '9';
-    }
-
-    static bool is_word_char(char c) {
-        return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || is_digit_char(c);
-    }
-
-    static std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
-        const char * pos   = src;
-        const char * end   = src + size;
-        uint32_t     value = 0;
-        for ( ; pos < end && *pos; pos++) {
-            value <<= 4;
-            char c = *pos;
-            if ('a' <= c && c <= 'f') {
-                value += c - 'a' + 10;
-            } else if ('A' <= c && c <= 'F') {
-                value += c - 'A' + 10;
-            } else if ('0' <= c && c <= '9') {
-                value += c - '0';
-            } else {
-                break;
-            }
-        }
-        if (pos != end) {
-            throw std::runtime_error("expecting " + std::to_string(size) + " hex chars at " + src);
-        }
-        return std::make_pair(value, pos);
-    }
-
-    static const char * parse_space(const char * src, bool newline_ok) {
-        const char * pos = src;
-        while (*pos == ' ' || *pos == '\t' || *pos == '#' ||
-                (newline_ok && (*pos == '\r' || *pos == '\n'))) {
-            if (*pos == '#') {
-                while (*pos && *pos != '\r' && *pos != '\n') {
-                    pos++;
-                }
-            } else {
-                pos++;
-            }
-        }
-        return pos;
-    }
-
-    static const char * parse_name(const char * src) {
-        const char * pos = src;
-        while (is_word_char(*pos)) {
-            pos++;
-        }
-        if (pos == src) {
-            throw std::runtime_error(std::string("expecting name at ") + src);
-        }
-        return pos;
-    }
-
-    static const char * parse_int(const char * src) {
-        const char * pos = src;
-        while (is_digit_char(*pos)) {
-            pos++;
-        }
-        if (pos == src) {
-            throw std::runtime_error(std::string("expecting integer at ") + src);
-        }
-        return pos;
-    }
-
-    static std::pair<uint32_t, const char *> parse_char(const char * src) {
-        if (*src == '\\') {
-            switch (src[1]) {
-                case 'x': return parse_hex(src + 2, 2);
-                case 'u': return parse_hex(src + 2, 4);
-                case 'U': return parse_hex(src + 2, 8);
-                case 't': return std::make_pair('\t', src + 2);
-                case 'r': return std::make_pair('\r', src + 2);
-                case 'n': return std::make_pair('\n', src + 2);
-                case '\\':
-                case '"':
-                case '[':
-                case ']':
-                    return std::make_pair(src[1], src + 2);
-                default:
-                    throw std::runtime_error(std::string("unknown escape at ") + src);
-            }
-        } else if (*src) {
-            return decode_utf8(src);
-        }
-        throw std::runtime_error("unexpected end of input");
-    }
-
-    const char * parse_alternates(
-            parse_state       & state,
-            const char        * src,
-            const std::string & rule_name,
-            uint32_t            rule_id,
-            bool                is_nested);
-
-    static const char * parse_sequence(
-            parse_state                        & state,
-            const char                         * src,
-            const std::string                  & rule_name,
-            std::vector<llama_grammar_element> & out_elements,
-            bool                                 is_nested) {
-        size_t last_sym_start = out_elements.size();
-        const char * pos = src;
-
-        auto handle_repetitions = [&](int min_times, int max_times) {
-
-            if (last_sym_start == out_elements.size()) {
-                throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos);
-            }
-
-            // apply transformation to previous symbol (last_sym_start to end) according to
-            // the following rewrite rules:
-            // S{m,n} --> S S S (m times) S'(n-m)
-            //            S'(x)   ::= S S'(x-1) |
-            //            (... n-m definitions of these S' rules ...)
-            //            S'(1)   ::= S |
-            // S{m,} -->  S S S (m times) S'
-            //            S'     ::= S S' |
-            // S*     --> S{0,}
-            //        --> S'     ::= S S' |
-            // S+     --> S{1,}
-            //        --> S S'
-            //            S'     ::= S S' |
-            // S?     --> S{0,1}
-            //        --> S'
-            //            S'     ::= S |
-
-            std::vector<llama_grammar_element> previous_elements(out_elements.begin() + last_sym_start, out_elements.end());
-            if (min_times == 0) {
-                out_elements.resize(last_sym_start);
-            } else {
-                // Repeat the previous elements (min_times - 1) times
-                for (int i = 1; i < min_times; i++) {
-                    out_elements.insert(out_elements.end(), previous_elements.begin(), previous_elements.end());
-                }
-            }
-
-            uint32_t last_rec_rule_id = 0;
-            auto n_opt = max_times < 0 ? 1 : max_times - min_times;
-
-            std::vector<llama_grammar_element> rec_rule(previous_elements);
-            for (int i = 0; i < n_opt; i++) {
-                rec_rule.resize(previous_elements.size());
-                uint32_t rec_rule_id = generate_symbol_id(state, rule_name);
-                if (i > 0 || max_times < 0) {
-                    rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, max_times < 0 ? rec_rule_id : last_rec_rule_id});
-                }
-                rec_rule.push_back({LLAMA_GRETYPE_ALT, 0});
-                rec_rule.push_back({LLAMA_GRETYPE_END, 0});
-                add_rule(state, rec_rule_id, rec_rule);
-                last_rec_rule_id = rec_rule_id;
-            }
-            if (n_opt > 0) {
-                out_elements.push_back({LLAMA_GRETYPE_RULE_REF, last_rec_rule_id});
-            }
-        };
-
-        while (*pos) {
-            if (*pos == '"') { // literal string
-                pos++;
-                last_sym_start = out_elements.size();
-                while (*pos != '"') {
-                    if (!*pos) {
-                        throw std::runtime_error("unexpected end of input");
-                    }
-                    auto char_pair = parse_char(pos);
-                         pos       = char_pair.second;
-                    out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
-                }
-                pos = parse_space(pos + 1, is_nested);
-            } else if (*pos == '[') { // char range(s)
-                pos++;
-                enum llama_gretype start_type = LLAMA_GRETYPE_CHAR;
-                if (*pos == '^') {
-                    pos++;
-                    start_type = LLAMA_GRETYPE_CHAR_NOT;
-                }
-                last_sym_start = out_elements.size();
-                while (*pos != ']') {
-                    if (!*pos) {
-                        throw std::runtime_error("unexpected end of input");
-                    }
-                    auto char_pair = parse_char(pos);
-                         pos       = char_pair.second;
-                    enum llama_gretype type = last_sym_start < out_elements.size()
-                        ? LLAMA_GRETYPE_CHAR_ALT
-                        : start_type;
-
-                    out_elements.push_back({type, char_pair.first});
-                    if (pos[0] == '-' && pos[1] != ']') {
-                        if (!pos[1]) {
-                            throw std::runtime_error("unexpected end of input");
-                        }
-                        auto endchar_pair = parse_char(pos + 1);
-                             pos          = endchar_pair.second;
-                        out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
-                    }
-                }
-                pos = parse_space(pos + 1, is_nested);
-            } else if (is_word_char(*pos)) { // rule reference
-                const char * name_end    = parse_name(pos);
-                uint32_t     ref_rule_id = get_symbol_id(state, pos, name_end - pos);
-                pos = parse_space(name_end, is_nested);
-                last_sym_start = out_elements.size();
-                out_elements.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id});
-            } else if (*pos == '(') { // grouping
-                // parse nested alternates into synthesized rule
-                pos = parse_space(pos + 1, true);
-                uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
-                pos = parse_alternates(state, pos, rule_name, sub_rule_id, true);
-                last_sym_start = out_elements.size();
-                // output reference to synthesized rule
-                out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
-                if (*pos != ')') {
-                    throw std::runtime_error(std::string("expecting ')' at ") + pos);
-                }
-                pos = parse_space(pos + 1, is_nested);
-            } else if (*pos == '.') { // any char
-                last_sym_start = out_elements.size();
-                out_elements.push_back({LLAMA_GRETYPE_CHAR_ANY, 0});
-                pos = parse_space(pos + 1, is_nested);
-            } else if (*pos == '*') {
-                pos = parse_space(pos + 1, is_nested);
-                handle_repetitions(0, -1);
-            } else if (*pos == '+') {
-                pos = parse_space(pos + 1, is_nested);
-                handle_repetitions(1, -1);
-            } else if (*pos == '?') {
-                pos = parse_space(pos + 1, is_nested);
-                handle_repetitions(0, 1);
-            } else if (*pos == '{') {
-                pos = parse_space(pos + 1, is_nested);
-
-                if (!is_digit_char(*pos)) {
-                    throw std::runtime_error(std::string("expecting an int at ") + pos);
-                }
-                const char * int_end = parse_int(pos);
-                int min_times = std::stoul(std::string(pos, int_end - pos));
-                pos = parse_space(int_end, is_nested);
-
-                int max_times = -1;
-
-                if (*pos == '}') {
-                    max_times = min_times;
-                    pos = parse_space(pos + 1, is_nested);
-                } else if (*pos == ',') {
-                    pos = parse_space(pos + 1, is_nested);
-
-                    if (is_digit_char(*pos)) {
-                        const char * int_end = parse_int(pos);
-                        max_times = std::stoul(std::string(pos, int_end - pos));
-                        pos = parse_space(int_end, is_nested);
-                    }
-
-                    if (*pos != '}') {
-                        throw std::runtime_error(std::string("expecting '}' at ") + pos);
-                    }
-                    pos = parse_space(pos + 1, is_nested);
-                } else {
-                    throw std::runtime_error(std::string("expecting ',' at ") + pos);
-                }
-                handle_repetitions(min_times, max_times);
-            } else {
-                break;
-            }
-        }
-        return pos;
-    }
-
-    const char * parse_alternates(
-            parse_state       & state,
-            const char        * src,
-            const std::string & rule_name,
-            uint32_t            rule_id,
-            bool                is_nested) {
-        std::vector<llama_grammar_element> rule;
-        const char * pos = parse_sequence(state, src, rule_name, rule, is_nested);
-        while (*pos == '|') {
-            rule.push_back({LLAMA_GRETYPE_ALT, 0});
-            pos = parse_space(pos + 1, true);
-            pos = parse_sequence(state, pos, rule_name, rule, is_nested);
-        }
-        rule.push_back({LLAMA_GRETYPE_END, 0});
-        add_rule(state, rule_id, rule);
-        return pos;
-    }
-
-    static const char * parse_rule(parse_state & state, const char * src) {
-        const char * name_end = parse_name(src);
-        const char * pos      = parse_space(name_end, false);
-        size_t       name_len = name_end - src;
-        uint32_t     rule_id  = get_symbol_id(state, src, name_len);
-        const std::string name(src, name_len);
-
-        if (!(pos[0] == ':' && pos[1] == ':' && pos[2] == '=')) {
-            throw std::runtime_error(std::string("expecting ::= at ") + pos);
-        }
-        pos = parse_space(pos + 3, true);
-
-        pos = parse_alternates(state, pos, name, rule_id, false);
-
-        if (*pos == '\r') {
-            pos += pos[1] == '\n' ? 2 : 1;
-        } else if (*pos == '\n') {
-            pos++;
-        } else if (*pos) {
-            throw std::runtime_error(std::string("expecting newline or end at ") + pos);
-        }
-        return parse_space(pos, true);
-    }
-
-    parse_state parse(const char * src) {
-        try {
-            parse_state state;
-            const char * pos = parse_space(src, true);
-            while (*pos) {
-                pos = parse_rule(state, pos);
-            }
-            // Validate the state to ensure that all rules are defined
-            for (const auto & rule : state.rules) {
-                if (rule.empty()) {
-                    throw std::runtime_error("Undefined rule");
-                }
-                for (const auto & elem : rule) {
-                    if (elem.type == LLAMA_GRETYPE_RULE_REF) {
-                        // Ensure that the rule at that location exists
-                        if (elem.value >= state.rules.size() || state.rules[elem.value].empty()) {
-                            // Get the name of the rule that is missing
-                            for (const auto & kv : state.symbol_ids) {
-                                if (kv.second == elem.value) {
-                                    throw std::runtime_error("Undefined rule identifier '" + kv.first + "'");
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-            return state;
-        } catch (const std::exception & err) {
-            fprintf(stderr, "%s: error parsing grammar: %s\n", __func__, err.what());
-            return parse_state();
-        }
-    }
-
-    static void print_grammar_char(FILE * file, uint32_t c) {
-        if (0x20 <= c && c <= 0x7f) {
-            fprintf(file, "%c", static_cast<char>(c));
-        } else {
-            // cop out of encoding UTF-8
-            fprintf(file, "<U+%04X>", c);
-        }
-    }
-
-    static bool is_char_element(llama_grammar_element elem) {
-        switch (elem.type) {
-            case LLAMA_GRETYPE_CHAR:           return true;
-            case LLAMA_GRETYPE_CHAR_NOT:       return true;
-            case LLAMA_GRETYPE_CHAR_ALT:       return true;
-            case LLAMA_GRETYPE_CHAR_RNG_UPPER: return true;
-            case LLAMA_GRETYPE_CHAR_ANY:       return true;
-            default:                           return false;
-        }
-    }
-
-    static void print_rule_binary(FILE * file, const std::vector<llama_grammar_element> & rule) {
-        for (auto elem : rule) {
-            switch (elem.type) {
-                case LLAMA_GRETYPE_END:            fprintf(file, "END");            break;
-                case LLAMA_GRETYPE_ALT:            fprintf(file, "ALT");            break;
-                case LLAMA_GRETYPE_RULE_REF:       fprintf(file, "RULE_REF");       break;
-                case LLAMA_GRETYPE_CHAR:           fprintf(file, "CHAR");           break;
-                case LLAMA_GRETYPE_CHAR_NOT:       fprintf(file, "CHAR_NOT");       break;
-                case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
-                case LLAMA_GRETYPE_CHAR_ALT:       fprintf(file, "CHAR_ALT");       break;
-                case LLAMA_GRETYPE_CHAR_ANY:       fprintf(file, "CHAR_ANY");       break;
-            }
-            switch (elem.type) {
-                case LLAMA_GRETYPE_END:
-                case LLAMA_GRETYPE_ALT:
-                case LLAMA_GRETYPE_RULE_REF:
-                    fprintf(file, "(%u) ", elem.value);
-                    break;
-                case LLAMA_GRETYPE_CHAR:
-                case LLAMA_GRETYPE_CHAR_NOT:
-                case LLAMA_GRETYPE_CHAR_RNG_UPPER:
-                case LLAMA_GRETYPE_CHAR_ALT:
-                case LLAMA_GRETYPE_CHAR_ANY:
-                    fprintf(file, "(\"");
-                    print_grammar_char(file, elem.value);
-                    fprintf(file, "\") ");
-                    break;
-            }
-        }
-        fprintf(file, "\n");
-    }
-
-    static void print_rule(
-            FILE     * file,
-            uint32_t   rule_id,
-            const std::vector<llama_grammar_element> & rule,
-            const std::map<uint32_t, std::string>    & symbol_id_names) {
-        if (rule.empty() || rule.back().type != LLAMA_GRETYPE_END) {
-            throw std::runtime_error(
-                "malformed rule, does not end with LLAMA_GRETYPE_END: " + std::to_string(rule_id));
-        }
-        fprintf(file, "%s ::= ", symbol_id_names.at(rule_id).c_str());
-        for (size_t i = 0, end = rule.size() - 1; i < end; i++) {
-            llama_grammar_element elem = rule[i];
-            switch (elem.type) {
-                case LLAMA_GRETYPE_END:
-                    throw std::runtime_error(
-                        "unexpected end of rule: " + std::to_string(rule_id) + "," +
-                        std::to_string(i));
-                case LLAMA_GRETYPE_ALT:
-                    fprintf(file, "| ");
-                    break;
-                case LLAMA_GRETYPE_RULE_REF:
-                    fprintf(file, "%s ", symbol_id_names.at(elem.value).c_str());
-                    break;
-                case LLAMA_GRETYPE_CHAR:
-                    fprintf(file, "[");
-                    print_grammar_char(file, elem.value);
-                    break;
-                case LLAMA_GRETYPE_CHAR_NOT:
-                    fprintf(file, "[^");
-                    print_grammar_char(file, elem.value);
-                    break;
-                case LLAMA_GRETYPE_CHAR_RNG_UPPER:
-                    if (i == 0 || !is_char_element(rule[i - 1])) {
-                        throw std::runtime_error(
-                            "LLAMA_GRETYPE_CHAR_RNG_UPPER without preceding char: " +
-                            std::to_string(rule_id) + "," + std::to_string(i));
-                    }
-                    fprintf(file, "-");
-                    print_grammar_char(file, elem.value);
-                    break;
-                case LLAMA_GRETYPE_CHAR_ALT:
-                    if (i == 0 || !is_char_element(rule[i - 1])) {
-                        throw std::runtime_error(
-                            "LLAMA_GRETYPE_CHAR_ALT without preceding char: " +
-                            std::to_string(rule_id) + "," + std::to_string(i));
-                    }
-                    print_grammar_char(file, elem.value);
-                    break;
-                case LLAMA_GRETYPE_CHAR_ANY:
-                    fprintf(file, ".");
-                    break;
-            }
-            if (is_char_element(elem)) {
-                switch (rule[i + 1].type) {
-                    case LLAMA_GRETYPE_CHAR_ALT:
-                    case LLAMA_GRETYPE_CHAR_RNG_UPPER:
-                    case LLAMA_GRETYPE_CHAR_ANY:
-                        break;
-                    default:
-                        fprintf(file, "] ");
-                }
-            }
-        }
-        fprintf(file, "\n");
-    }
-
-    void print_grammar(FILE * file, const parse_state & state) {
-        try {
-            std::map<uint32_t, std::string> symbol_id_names;
-            for (const auto & kv : state.symbol_ids) {
-                symbol_id_names[kv.second] = kv.first;
-            }
-            for (size_t i = 0, end = state.rules.size(); i < end; i++) {
-                // fprintf(file, "%zu: ", i);
-                // print_rule_binary(file, state.rules[i]);
-                print_rule(file, uint32_t(i), state.rules[i], symbol_id_names);
-                // fprintf(file, "\n");
-            }
-        } catch (const std::exception & err) {
-            fprintf(stderr, "\n%s: error printing grammar: %s\n", __func__, err.what());
-        }
-    }
-
-    std::vector<const llama_grammar_element *> parse_state::c_rules() {
-        std::vector<const llama_grammar_element *> ret;
-        ret.reserve(rules.size());
-        for (const auto & rule : rules) {
-            ret.push_back(rule.data());
-        }
-        return ret;
-    }
-}
--- a/common/grammar-parser.h
+++ b/common/grammar-parser.h
@@ -1,29 +0,0 @@
-// Implements a parser for an extended Backus-Naur form (BNF), producing the
-// binary context-free grammar format specified by llama.h. Supports character
-// ranges, grouping, and repetition operators. As an example, a grammar for
-// arithmetic might look like:
-//
-// root  ::= expr
-// expr  ::= term ([-+*/] term)*
-// term  ::= num | "(" space expr ")" space
-// num   ::= [0-9]+ space
-// space ::= [ \t\n]*
-
-#pragma once
-#include "llama.h"
-#include <vector>
-#include <map>
-#include <cstdint>
-#include <string>
-
-namespace grammar_parser {
-    struct parse_state {
-        std::map<std::string, uint32_t>                 symbol_ids;
-        std::vector<std::vector<llama_grammar_element>> rules;
-
-        std::vector<const llama_grammar_element *> c_rules();
-    };
-
-    parse_state parse(const char * src);
-    void print_grammar(FILE * file, const parse_state & state);
-}
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -611,7 +611,7 @@ private:
            }
            return join_seq();
        };
-        return _add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space");
+        return _add_rule(name, "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space");
    }

    /*
--- a/common/log.cpp
+++ b/common/log.cpp
@@ -0,0 +1,401 @@
+#include "log.h"
+
+#include <condition_variable>
+#include <cstdarg>
+#include <cstdio>
+#include <mutex>
+#include <sstream>
+#include <thread>
+#include <vector>
+
+int common_log_verbosity_thold = LOG_DEFAULT_LLAMA;
+
+void common_log_set_verbosity_thold(int verbosity) {
+    common_log_verbosity_thold = verbosity;
+}
+
+#define LOG_COL_DEFAULT "\033[0m"
+#define LOG_COL_BOLD    "\033[1m"
+#define LOG_COL_RED     "\033[31m"
+#define LOG_COL_GREEN   "\033[32m"
+#define LOG_COL_YELLOW  "\033[33m"
+#define LOG_COL_BLUE    "\033[34m"
+#define LOG_COL_MAGENTA "\033[35m"
+#define LOG_COL_CYAN    "\033[36m"
+#define LOG_COL_WHITE   "\033[37m"
+
+static int64_t t_us() {
+    return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
+}
+
+// colors
+enum common_log_col : int {
+    COMMON_LOG_COL_DEFAULT = 0,
+    COMMON_LOG_COL_BOLD,
+    COMMON_LOG_COL_RED,
+    COMMON_LOG_COL_GREEN,
+    COMMON_LOG_COL_YELLOW,
+    COMMON_LOG_COL_BLUE,
+    COMMON_LOG_COL_MAGENTA,
+    COMMON_LOG_COL_CYAN,
+    COMMON_LOG_COL_WHITE,
+};
+
+// disable colors by default
+static std::vector<const char *> g_col = {
+    "",
+    "",
+    "",
+    "",
+    "",
+    "",
+    "",
+    "",
+    "",
+};
+
+struct common_log_entry {
+    enum ggml_log_level level;
+
+    bool prefix;
+
+    int64_t timestamp;
+
+    std::vector<char> msg;
+
+    // signals the worker thread to stop
+    bool is_end;
+
+    void print(FILE * file = nullptr) const {
+        FILE * fcur = file;
+        if (!fcur) {
+            // stderr displays DBG messages only when their verbosity level is not higher than the threshold
+            // these messages will still be logged to a file
+            if (level == GGML_LOG_LEVEL_DEBUG && common_log_verbosity_thold < LOG_DEFAULT_DEBUG) {
+                return;
+            }
+
+            fcur = stdout;
+
+            if (level != GGML_LOG_LEVEL_NONE) {
+                fcur = stderr;
+            }
+        }
+
+        if (level != GGML_LOG_LEVEL_NONE && level != GGML_LOG_LEVEL_CONT && prefix) {
+            if (timestamp) {
+                // [M.s.ms.us]
+                fprintf(fcur, "%s%d.%02d.%03d.%03d%s ",
+                        g_col[COMMON_LOG_COL_BLUE],
+                        (int) (timestamp / 1000000 / 60),
+                        (int) (timestamp / 1000000 % 60),
+                        (int) (timestamp / 1000 % 1000),
+                        (int) (timestamp % 1000),
+                        g_col[COMMON_LOG_COL_DEFAULT]);
+            }
+
+            switch (level) {
+                case GGML_LOG_LEVEL_INFO:  fprintf(fcur, "%sI %s", g_col[COMMON_LOG_COL_GREEN],   g_col[COMMON_LOG_COL_DEFAULT]); break;
+                case GGML_LOG_LEVEL_WARN:  fprintf(fcur, "%sW %s", g_col[COMMON_LOG_COL_MAGENTA], ""                        ); break;
+                case GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[COMMON_LOG_COL_RED],     ""                        ); break;
+                case GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[COMMON_LOG_COL_YELLOW],  ""                        ); break;
+                default:
+                    break;
+            }
+        }
+
+        fprintf(fcur, "%s", msg.data());
+
+        if (level == GGML_LOG_LEVEL_WARN || level == GGML_LOG_LEVEL_ERROR || level == GGML_LOG_LEVEL_DEBUG) {
+            fprintf(fcur, "%s", g_col[COMMON_LOG_COL_DEFAULT]);
+        }
+
+        fflush(fcur);
+    }
+};
+
+struct common_log {
+    // default capacity - will be expanded if needed
+    common_log() : common_log(256) {}
+
+    common_log(size_t capacity) {
+        file = nullptr;
+        prefix = false;
+        timestamps = false;
+        running = false;
+        t_start = t_us();
+
+        // initial message size - will be expanded if longer messages arrive
+        entries.resize(capacity);
+        for (auto & entry : entries) {
+            entry.msg.resize(256);
+        }
+
+        head = 0;
+        tail = 0;
+
+        resume();
+    }
+
+    ~common_log() {
+        pause();
+        if (file) {
+            fclose(file);
+        }
+    }
+
+private:
+    std::mutex mtx;
+    std::thread thrd;
+    std::condition_variable cv;
+
+    FILE * file;
+
+    bool prefix;
+    bool timestamps;
+    bool running;
+
+    int64_t t_start;
+
+    // ring buffer of entries
+    std::vector<common_log_entry> entries;
+    size_t head;
+    size_t tail;
+
+    // worker thread copies into this
+    common_log_entry cur;
+
+public:
+    void add(enum ggml_log_level level, const char * fmt, va_list args) {
+        std::lock_guard<std::mutex> lock(mtx);
+
+        if (!running) {
+            // discard messages while the worker thread is paused
+            return;
+        }
+
+        auto & entry = entries[tail];
+
+        {
+            // cannot use args twice, so make a copy in case we need to expand the buffer
+            va_list args_copy;
+            va_copy(args_copy, args);
+
+#if 1
+            const size_t n = vsnprintf(entry.msg.data(), entry.msg.size(), fmt, args);
+            if (n >= entry.msg.size()) {
+                entry.msg.resize(n + 1);
+                vsnprintf(entry.msg.data(), entry.msg.size(), fmt, args_copy);
+            }
+#else
+            // hack for bolding arguments
+
+            std::stringstream ss;
+            for (int i = 0; fmt[i] != 0; i++) {
+                if (fmt[i] == '%') {
+                    ss << LOG_COL_BOLD;
+                    while (fmt[i] != ' ' && fmt[i] != ')' && fmt[i] != ']' && fmt[i] != 0) ss << fmt[i++];
+                    ss << LOG_COL_DEFAULT;
+                    if (fmt[i] == 0) break;
+                }
+                ss << fmt[i];
+            }
+            const size_t n = vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args);
+            if (n >= entry.msg.size()) {
+                entry.msg.resize(n + 1);
+                vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args_copy);
+            }
+#endif
+        }
+
+        entry.level = level;
+        entry.prefix = prefix;
+        entry.timestamp = 0;
+        if (timestamps) {
+            entry.timestamp = t_us() - t_start;
+        }
+        entry.is_end = false;
+
+        tail = (tail + 1) % entries.size();
+        if (tail == head) {
+            // expand the buffer
+            std::vector<common_log_entry> new_entries(2*entries.size());
+
+            size_t new_tail = 0;
+
+            do {
+                new_entries[new_tail] = std::move(entries[head]);
+
+                head     = (head     + 1) % entries.size();
+                new_tail = (new_tail + 1);
+            } while (head != tail);
+
+            head = 0;
+            tail = new_tail;
+
+            for (size_t i = tail; i < new_entries.size(); i++) {
+                new_entries[i].msg.resize(256);
+            }
+
+            entries = std::move(new_entries);
+        }
+
+        cv.notify_one();
+    }
+
+    void resume() {
+        std::lock_guard<std::mutex> lock(mtx);
+
+        if (running) {
+            return;
+        }
+
+        running = true;
+
+        thrd = std::thread([this]() {
+            while (true) {
+                {
+                    std::unique_lock<std::mutex> lock(mtx);
+                    cv.wait(lock, [this]() { return head != tail; });
+
+                    cur = entries[head];
+
+                    head = (head + 1) % entries.size();
+                }
+
+                if (cur.is_end) {
+                    break;
+                }
+
+                cur.print(); // stdout and stderr
+
+                if (file) {
+                    cur.print(file);
+                }
+            }
+        });
+    }
+
+    void pause() {
+        {
+            std::lock_guard<std::mutex> lock(mtx);
+
+            if (!running) {
+                return;
+            }
+
+            running = false;
+
+            // push an entry to signal the worker thread to stop
+            {
+                auto & entry = entries[tail];
+                entry.is_end = true;
+
+                tail = (tail + 1) % entries.size();
+            }
+
+            cv.notify_one();
+        }
+
+        thrd.join();
+    }
+
+    void set_file(const char * path) {
+        pause();
+
+        if (file) {
+            fclose(file);
+        }
+
+        if (path) {
+            file = fopen(path, "w");
+        } else {
+            file = nullptr;
+        }
+
+        resume();
+    }
+
+    void set_colors(bool colors) {
+        pause();
+
+        if (colors) {
+            g_col[COMMON_LOG_COL_DEFAULT] = LOG_COL_DEFAULT;
+            g_col[COMMON_LOG_COL_BOLD]    = LOG_COL_BOLD;
+            g_col[COMMON_LOG_COL_RED]     = LOG_COL_RED;
+            g_col[COMMON_LOG_COL_GREEN]   = LOG_COL_GREEN;
+            g_col[COMMON_LOG_COL_YELLOW]  = LOG_COL_YELLOW;
+            g_col[COMMON_LOG_COL_BLUE]    = LOG_COL_BLUE;
+            g_col[COMMON_LOG_COL_MAGENTA] = LOG_COL_MAGENTA;
+            g_col[COMMON_LOG_COL_CYAN]    = LOG_COL_CYAN;
+            g_col[COMMON_LOG_COL_WHITE]   = LOG_COL_WHITE;
+        } else {
+            for (size_t i = 0; i < g_col.size(); i++) {
+                g_col[i] = "";
+            }
+        }
+
+        resume();
+    }
+
+    void set_prefix(bool prefix) {
+        std::lock_guard<std::mutex> lock(mtx);
+
+        this->prefix = prefix;
+    }
+
+    void set_timestamps(bool timestamps) {
+        std::lock_guard<std::mutex> lock(mtx);
+
+        this->timestamps = timestamps;
+    }
+};
+
+//
+// public API
+//
+
+struct common_log * common_log_init() {
+    return new common_log;
+}
+
+struct common_log * common_log_main() {
+    static struct common_log log;
+
+    return &log;
+}
+
+void common_log_pause(struct common_log * log) {
+    log->pause();
+}
+
+void common_log_resume(struct common_log * log) {
+    log->resume();
+}
+
+void common_log_free(struct common_log * log) {
+    delete log;
+}
+
+void common_log_add(struct common_log * log, enum ggml_log_level level, const char * fmt, ...) {
+    va_list args;
+    va_start(args, fmt);
+    log->add(level, fmt, args);
+    va_end(args);
+}
+
+void common_log_set_file(struct common_log * log, const char * file) {
+    log->set_file(file);
+}
+
+void common_log_set_colors(struct common_log * log, bool colors) {
+    log->set_colors(colors);
+}
+
+void common_log_set_prefix(struct common_log * log, bool prefix) {
+    log->set_prefix(prefix);
+}
+
+void common_log_set_timestamps(struct common_log * log, bool timestamps) {
+    log->set_timestamps(timestamps);
+}
--- a/common/log.h
+++ b/common/log.h
@@ -1,724 +1,92 @@
 #pragma once

-#include <chrono>
-#include <cstring>
-#include <sstream>
-#include <iostream>
-#include <thread>
-#include <vector>
-#include <algorithm>
-#include <cinttypes>
+#include "ggml.h" // for ggml_log_level

-// --------------------------------
-//
-// Basic usage:
-//
-// --------
-//
-//  The LOG() and LOG_TEE() macros are ready to go by default
-//   they do not require any initialization.
-//
-//  LOGLN() and LOG_TEELN() are variants which automatically
-//   include \n character at the end of the log string.
-//
-//  LOG() behaves exactly like printf, by default writing to a logfile.
-//  LOG_TEE() additionally, prints to the screen too ( mimics Unix tee command ).
-//
-//  Default logfile is named
-//   "llama.<threadID>.log"
-//  Default LOG_TEE() secondary output target is
-//   stderr
-//
-//  Logs can be dynamically disabled or enabled using functions:
-//   log_disable()
-//  and
-//   log_enable()
-//
-//  A log target can be changed with:
-//   log_set_target( string )
-//    creating and opening, or re-opening a file by string filename
-//  or
-//   log_set_target( FILE* )
-//    allowing to point at stderr, stdout, or any valid FILE* file handler.
-//
-// --------
-//
-// End of Basic usage.
-//
-// --------------------------------
-
-// Specifies a log target.
-//  default uses log_handler() with "llama.log" log file
-//  this can be changed, by defining LOG_TARGET
-//  like so:
-//
-//  #define LOG_TARGET (a valid FILE*)
-//  #include "log.h"
-//
-//  or it can be simply redirected to stdout or stderr
-//  like so:
-//
-//  #define LOG_TARGET stderr
-//  #include "log.h"
-//
-//  The log target can also be redirected to a different function
-//  like so:
-//
-//  #define LOG_TARGET log_handler_different()
-//  #include "log.h"
-//
-//  FILE* log_handler_different()
-//  {
-//      return stderr;
-//  }
-//
-//  or:
-//
-//  #define LOG_TARGET log_handler_another_one("somelog.log")
-//  #include "log.h"
-//
-//  FILE* log_handler_another_one(char*filename)
-//  {
-//      static FILE* logfile = nullptr;
-//      (...)
-//      if( !logfile )
-//      {
-//          fopen(...)
-//      }
-//      (...)
-//      return logfile
-//  }
-//
-#ifndef LOG_TARGET
-    #define LOG_TARGET log_handler()
-#endif
-
-#ifndef LOG_TEE_TARGET
-    #define LOG_TEE_TARGET stderr
-#endif
-
-// Utility for synchronizing log configuration state
-//  since std::optional was introduced only in c++17
-enum LogTriState
-{
-    LogTriStateSame,
-    LogTriStateFalse,
-    LogTriStateTrue
-};
-
-// Utility to obtain "pid" like unique process id and use it when creating log files.
-inline std::string log_get_pid()
-{
-   static std::string pid;
-   if (pid.empty())
-   {
-       // std::this_thread::get_id() is the most portable way of obtaining a "process id"
-       //  it's not the same as "pid" but is unique enough to solve multiple instances
-       //  trying to write to the same log.
-       std::stringstream ss;
-       ss << std::this_thread::get_id();
-       pid = ss.str();
-   }
-
-   return pid;
-}
-
-// Utility function for generating log file names with unique id based on thread id.
-//  invocation with log_filename_generator( "llama", "log" ) creates a string "llama.<number>.log"
-//  where the number is a runtime id of the current thread.
-
-#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(LogTriStateSame, log_file_basename, log_file_extension)
-
-// INTERNAL, DO NOT USE
-inline std::string log_filename_generator_impl(LogTriState multilog, const std::string & log_file_basename, const std::string & log_file_extension)
-{
-    static bool _multilog = false;
-
-    if (multilog != LogTriStateSame)
-    {
-        _multilog = multilog == LogTriStateTrue;
-    }
-
-    std::stringstream buf;
-
-    buf << log_file_basename;
-    if (_multilog)
-    {
-        buf << ".";
-        buf << log_get_pid();
-    }
-    buf << ".";
-    buf << log_file_extension;
-
-    return buf.str();
-}
-
-#ifndef LOG_DEFAULT_FILE_NAME
-    #define LOG_DEFAULT_FILE_NAME log_filename_generator("llama", "log")
-#endif
-
-// Utility for turning #define values into string literals
-//  so we can have a define for stderr and
-//  we can print "stderr" instead of literal stderr, etc.
-#define LOG_STRINGIZE1(s) #s
-#define LOG_STRINGIZE(s) LOG_STRINGIZE1(s)
-
-#define LOG_TEE_TARGET_STRING LOG_STRINGIZE(LOG_TEE_TARGET)
-
-// Allows disabling timestamps.
-//  in order to disable, define LOG_NO_TIMESTAMPS
-//  like so:
-//
-//  #define LOG_NO_TIMESTAMPS
-//  #include "log.h"
-//
-#ifndef LOG_NO_TIMESTAMPS
-    #ifndef _MSC_VER
-        #define LOG_TIMESTAMP_FMT "[%" PRIu64 "] "
-        #define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
-    #else
-        #define LOG_TIMESTAMP_FMT "[%" PRIu64 "] "
-        #define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
-    #endif
+#ifndef __GNUC__
+#    define LOG_ATTRIBUTE_FORMAT(...)
+#elif defined(__MINGW32__)
+#    define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
 #else
-    #define LOG_TIMESTAMP_FMT "%s"
-    #define LOG_TIMESTAMP_VAL ,""
+#    define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
 #endif

-#ifdef LOG_TEE_TIMESTAMPS
-    #ifndef _MSC_VER
-        #define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] "
-        #define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
-    #else
-        #define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] "
-        #define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
-    #endif
-#else
-    #define LOG_TEE_TIMESTAMP_FMT "%s"
-    #define LOG_TEE_TIMESTAMP_VAL ,""
-#endif
+#define LOG_DEFAULT_DEBUG 1
+#define LOG_DEFAULT_LLAMA 0

-// Allows disabling file/line/function prefix
-//  in order to disable, define LOG_NO_FILE_LINE_FUNCTION
-//  like so:
+// needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower
+// set via common_log_set_verbosity()
+extern int common_log_verbosity_thold;
+
+void common_log_set_verbosity_thold(int verbosity); // not thread-safe
+
+// the common_log uses an internal worker thread to print/write log messages
+// when the worker thread is paused, incoming log messages are discarded
+struct common_log;
+
+struct common_log * common_log_init();
+struct common_log * common_log_main(); // singleton, automatically destroys itself on exit
+void                common_log_pause (struct common_log * log); // pause  the worker thread, not thread-safe
+void                common_log_resume(struct common_log * log); // resume the worker thread, not thread-safe
+void                common_log_free  (struct common_log * log);
+
+LOG_ATTRIBUTE_FORMAT(3, 4)
+void common_log_add(struct common_log * log, enum ggml_log_level level, const char * fmt, ...);
+
+// defaults: file = NULL, colors = false, prefix = false, timestamps = false
 //
-//  #define LOG_NO_FILE_LINE_FUNCTION
-//  #include "log.h"
+// regular log output:
 //
-#ifndef LOG_NO_FILE_LINE_FUNCTION
-    #ifndef _MSC_VER
-        #define LOG_FLF_FMT "[%24s:%5d][%24s] "
-        #define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
-    #else
-        #define LOG_FLF_FMT "[%24s:%5ld][%24s] "
-        #define LOG_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
-    #endif
-#else
-    #define LOG_FLF_FMT "%s"
-    #define LOG_FLF_VAL ,""
-#endif
-
-#ifdef LOG_TEE_FILE_LINE_FUNCTION
-    #ifndef _MSC_VER
-        #define LOG_TEE_FLF_FMT "[%24s:%5d][%24s] "
-        #define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
-    #else
-        #define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] "
-        #define LOG_TEE_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
-    #endif
-#else
-    #define LOG_TEE_FLF_FMT "%s"
-    #define LOG_TEE_FLF_VAL ,""
-#endif
-
-// INTERNAL, DO NOT USE
-//  USE LOG() INSTEAD
+//   ggml_backend_metal_log_allocated_size: allocated buffer, size =  6695.84 MiB, ( 6695.91 / 21845.34)
+//   llm_load_tensors: ggml ctx size =    0.27 MiB
+//   llm_load_tensors: offloading 32 repeating layers to GPU
+//   llm_load_tensors: offloading non-repeating layers to GPU
 //
-#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__)
-    #define LOG_IMPL(str, ...)                                                                                      \
-    do {                                                                                                            \
-        if (LOG_TARGET != nullptr)                                                                                  \
-        {                                                                                                           \
-            fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \
-            fflush(LOG_TARGET);                                                                                     \
-        }                                                                                                           \
+// with prefix = true, timestamps = true, the log output will look like this:
+//
+//   0.00.035.060 D ggml_backend_metal_log_allocated_size: allocated buffer, size =  6695.84 MiB, ( 6695.91 / 21845.34)
+//   0.00.035.064 I llm_load_tensors: ggml ctx size =    0.27 MiB
+//   0.00.090.578 I llm_load_tensors: offloading 32 repeating layers to GPU
+//   0.00.090.579 I llm_load_tensors: offloading non-repeating layers to GPU
+//
+// I - info    (stdout, V = 0)
+// W - warning (stderr, V = 0)
+// E - error   (stderr, V = 0)
+// D - debug   (stderr, V = LOG_DEFAULT_DEBUG)
+//
+
+void common_log_set_file      (struct common_log * log, const char * file);       // not thread-safe
+void common_log_set_colors    (struct common_log * log,       bool   colors);     // not thread-safe
+void common_log_set_prefix    (struct common_log * log,       bool   prefix);     // whether to output prefix to each log
+void common_log_set_timestamps(struct common_log * log,       bool   timestamps); // whether to output timestamps in the prefix
+
+// helper macros for logging
+// use these to avoid computing log arguments if the verbosity of the log is higher than the threshold
+//
+// for example:
+//
+//   LOG_DBG("this is a debug message: %d\n", expensive_function());
+//
+// this will avoid calling expensive_function() if LOG_DEFAULT_DEBUG > common_log_verbosity_thold
+//
+
+#define LOG_TMPL(level, verbosity, ...) \
+    do { \
+        if ((verbosity) <= common_log_verbosity_thold) { \
+            common_log_add(common_log_main(), (level), __VA_ARGS__); \
+        } \
    } while (0)
-#else
-    #define LOG_IMPL(str, ...)                                                                                           \
-    do {                                                                                                                 \
-        if (LOG_TARGET != nullptr)                                                                                       \
-        {                                                                                                                \
-            fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \
-            fflush(LOG_TARGET);                                                                                          \
-        }                                                                                                                \
-    } while (0)
-#endif

-// INTERNAL, DO NOT USE
-//  USE LOG_TEE() INSTEAD
-//
-#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__)
-    #define LOG_TEE_IMPL(str, ...)                                                                                                      \
-    do {                                                                                                                                \
-        if (LOG_TARGET != nullptr)                                                                                                      \
-        {                                                                                                                               \
-            fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__);                     \
-            fflush(LOG_TARGET);                                                                                                         \
-        }                                                                                                                               \
-        if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr)                         \
-        {                                                                                                                               \
-            fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL, __VA_ARGS__); \
-            fflush(LOG_TEE_TARGET);                                                                                                     \
-        }                                                                                                                               \
-    } while (0)
-#else
-    #define LOG_TEE_IMPL(str, ...)                                                                                                           \
-    do {                                                                                                                                     \
-        if (LOG_TARGET != nullptr)                                                                                                           \
-        {                                                                                                                                    \
-            fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__);                     \
-            fflush(LOG_TARGET);                                                                                                              \
-        }                                                                                                                                    \
-        if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr)                              \
-        {                                                                                                                                    \
-            fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL "", ##__VA_ARGS__); \
-            fflush(LOG_TEE_TARGET);                                                                                                          \
-        }                                                                                                                                    \
-    } while (0)
-#endif
+#define LOG(...)             LOG_TMPL(GGML_LOG_LEVEL_NONE, 0,         __VA_ARGS__)
+#define LOGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_NONE, verbosity, __VA_ARGS__)

-// The '\0' as a last argument, is a trick to bypass the silly
-//  "warning: ISO C++11 requires at least one argument for the "..." in a variadic macro"
-//  so we can have a single macro which can be called just like printf.
+#define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO,  0,                 __VA_ARGS__)
+#define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN,  0,                 __VA_ARGS__)
+#define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, 0,                 __VA_ARGS__)
+#define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_DEFAULT_DEBUG, __VA_ARGS__)
+#define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT,  0,                 __VA_ARGS__)

-// Main LOG macro.
-//  behaves like printf, and supports arguments the exact same way.
-//
-#if !defined(_MSC_VER) || defined(__clang__)
-    #define LOG(...) LOG_IMPL(__VA_ARGS__, "")
-#else
-    #define LOG(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "")
-#endif
-
-// Main TEE macro.
-//  does the same as LOG
-//  and
-//  simultaneously writes stderr.
-//
-// Secondary target can be changed just like LOG_TARGET
-//  by defining LOG_TEE_TARGET
-//
-#if !defined(_MSC_VER) || defined(__clang__)
-    #define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
-#else
-    #define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "")
-#endif
-
-// LOG macro variants with auto endline.
-#if !defined(_MSC_VER) || defined(__clang__)
-    #define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
-    #define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
-#else
-    #define LOGLN(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "\n")
-    #define LOG_TEELN(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "\n")
-#endif
-
-// INTERNAL, DO NOT USE
-inline FILE *log_handler1_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr)
-{
-    static bool _initialized = false;
-    static bool _append = false;
-    static bool _disabled = filename.empty() && target == nullptr;
-    static std::string log_current_filename{filename};
-    static FILE *log_current_target{target};
-    static FILE *logfile = nullptr;
-
-    if (change)
-    {
-        if (append != LogTriStateSame)
-        {
-            _append = append == LogTriStateTrue;
-            return logfile;
-        }
-
-        if (disable == LogTriStateTrue)
-        {
-            // Disable primary target
-            _disabled = true;
-        }
-        // If previously disabled, only enable, and keep previous target
-        else if (disable == LogTriStateFalse)
-        {
-            _disabled = false;
-        }
-        // Otherwise, process the arguments
-        else if (log_current_filename != filename || log_current_target != target)
-        {
-            _initialized = false;
-        }
-    }
-
-    if (_disabled)
-    {
-        // Log is disabled
-        return nullptr;
-    }
-
-    if (_initialized)
-    {
-        // with fallback in case something went wrong
-        return logfile ? logfile : stderr;
-    }
-
-    // do the (re)initialization
-    if (target != nullptr)
-    {
-        if (logfile != nullptr && logfile != stdout && logfile != stderr)
-        {
-            fclose(logfile);
-        }
-
-        log_current_filename = LOG_DEFAULT_FILE_NAME;
-        log_current_target = target;
-
-        logfile = target;
-    }
-    else
-    {
-        if (log_current_filename != filename)
-        {
-            if (logfile != nullptr && logfile != stdout && logfile != stderr)
-            {
-                fclose(logfile);
-            }
-        }
-
-        logfile = fopen(filename.c_str(), _append ? "a" : "w");
-    }
-
-    if (!logfile)
-    {
-        //  Verify whether the file was opened, otherwise fallback to stderr
-        logfile = stderr;
-
-        fprintf(stderr, "Failed to open logfile '%s' with error '%s'\n", filename.c_str(), std::strerror(errno));
-        fflush(stderr);
-
-        // At this point we let the init flag be to true below, and let the target fallback to stderr
-        //  otherwise we would repeatedly fopen() which was already unsuccessful
-    }
-
-    _initialized = true;
-
-    return logfile ? logfile : stderr;
-}
-
-// INTERNAL, DO NOT USE
-inline FILE *log_handler2_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME)
-{
-    return log_handler1_impl(change, append, disable, filename, target);
-}
-
-// Disables logs entirely at runtime.
-//  Makes LOG() and LOG_TEE() produce no output,
-//  until enabled back.
-#define log_disable() log_disable_impl()
-
-// INTERNAL, DO NOT USE
-inline FILE *log_disable_impl()
-{
-    return log_handler1_impl(true, LogTriStateSame, LogTriStateTrue);
-}
-
-// Enables logs at runtime.
-#define log_enable() log_enable_impl()
-
-// INTERNAL, DO NOT USE
-inline FILE *log_enable_impl()
-{
-    return log_handler1_impl(true, LogTriStateSame, LogTriStateFalse);
-}
-
-// Sets target fir logs, either by a file name or FILE* pointer (stdout, stderr, or any valid FILE*)
-#define log_set_target(target) log_set_target_impl(target)
-
-// INTERNAL, DO NOT USE
-inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, LogTriStateSame, filename); }
-inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, LogTriStateSame, target); }
-
-// INTERNAL, DO NOT USE
-inline FILE *log_handler() { return log_handler1_impl(); }
-
-// Enable or disable creating separate log files for each run.
-//  can ONLY be invoked BEFORE first log use.
-#define log_multilog(enable) log_filename_generator_impl((enable) ? LogTriStateTrue : LogTriStateFalse, "", "")
-// Enable or disable append mode for log file.
-//  can ONLY be invoked BEFORE first log use.
-#define log_append(enable) log_append_impl(enable)
-// INTERNAL, DO NOT USE
-inline FILE *log_append_impl(bool enable)
-{
-    return log_handler1_impl(true, enable ? LogTriStateTrue : LogTriStateFalse, LogTriStateSame);
-}
-
-inline void log_test()
-{
-    log_disable();
-    LOG("01 Hello World to nobody, because logs are disabled!\n");
-    log_enable();
-    LOG("02 Hello World to default output, which is \"%s\" ( Yaaay, arguments! )!\n", LOG_STRINGIZE(LOG_TARGET));
-    LOG_TEE("03 Hello World to **both** default output and " LOG_TEE_TARGET_STRING "!\n");
-    log_set_target(stderr);
-    LOG("04 Hello World to stderr!\n");
-    LOG_TEE("05 Hello World TEE with double printing to stderr prevented!\n");
-    log_set_target(LOG_DEFAULT_FILE_NAME);
-    LOG("06 Hello World to default log file!\n");
-    log_set_target(stdout);
-    LOG("07 Hello World to stdout!\n");
-    log_set_target(LOG_DEFAULT_FILE_NAME);
-    LOG("08 Hello World to default log file again!\n");
-    log_disable();
-    LOG("09 Hello World _1_ into the void!\n");
-    log_enable();
-    LOG("10 Hello World back from the void ( you should not see _1_ in the log or the output )!\n");
-    log_disable();
-    log_set_target("llama.anotherlog.log");
-    LOG("11 Hello World _2_ to nobody, new target was selected but logs are still disabled!\n");
-    log_enable();
-    LOG("12 Hello World this time in a new file ( you should not see _2_ in the log or the output )?\n");
-    log_set_target("llama.yetanotherlog.log");
-    LOG("13 Hello World this time in yet new file?\n");
-    log_set_target(log_filename_generator("llama_autonamed", "log"));
-    LOG("14 Hello World in log with generated filename!\n");
-#ifdef _MSC_VER
-    LOG_TEE("15 Hello msvc TEE without arguments\n");
-    LOG_TEE("16 Hello msvc TEE with (%d)(%s) arguments\n", 1, "test");
-    LOG_TEELN("17 Hello msvc TEELN without arguments\n");
-    LOG_TEELN("18 Hello msvc TEELN with (%d)(%s) arguments\n", 1, "test");
-    LOG("19 Hello msvc LOG without arguments\n");
-    LOG("20 Hello msvc LOG with (%d)(%s) arguments\n", 1, "test");
-    LOGLN("21 Hello msvc LOGLN without arguments\n");
-    LOGLN("22 Hello msvc LOGLN with (%d)(%s) arguments\n", 1, "test");
-#endif
-}
-
-inline bool log_param_single_parse(const std::string & param)
-{
-    if ( param == "--log-test")
-    {
-        log_test();
-        return true;
-    }
-
-    if ( param == "--log-disable")
-    {
-        log_disable();
-        return true;
-    }
-
-    if ( param == "--log-enable")
-    {
-        log_enable();
-        return true;
-    }
-
-    if (param == "--log-new")
-    {
-        log_multilog(true);
-        return true;
-    }
-
-    if (param == "--log-append")
-    {
-        log_append(true);
-        return true;
-    }
-
-    return false;
-}
-
-inline bool log_param_pair_parse(bool check_but_dont_parse, const std::string & param, const std::string & next = std::string())
-{
-    if ( param == "--log-file")
-    {
-        if (!check_but_dont_parse)
-        {
-            log_set_target(log_filename_generator(next.empty() ? "unnamed" : next, "log"));
-        }
-
-        return true;
-    }
-
-    return false;
-}
-
-inline void log_print_usage()
-{
-    printf("log options:\n");
-    /* format
-    printf("  -h, --help            show this help message and exit\n");*/
-    /* spacing
-    printf("__-param----------------Description\n");*/
-    printf("  --log-test            Run simple logging test\n");
-    printf("  --log-disable         Disable trace logs\n");
-    printf("  --log-enable          Enable trace logs\n");
-    printf("  --log-file            Specify a log filename (without extension)\n");
-    printf("  --log-new             Create a separate new log file on start. "
-                                   "Each log file will have unique name: \"<name>.<ID>.log\"\n");
-    printf("  --log-append          Don't truncate the old log file.\n");
-    printf("\n");
-}
-
-#define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv)
-
-// INTERNAL, DO NOT USE
-inline void log_dump_cmdline_impl(int argc, char **argv)
-{
-    std::stringstream buf;
-    for (int i = 0; i < argc; ++i)
-    {
-        if (std::string(argv[i]).find(' ') != std::string::npos)
-        {
-            buf << " \"" << argv[i] <<"\"";
-        }
-        else
-        {
-            buf << " " << argv[i];
-        }
-    }
-    LOGLN("Cmd:%s", buf.str().c_str());
-}
-
-#define log_tostr(var) log_var_to_string_impl(var).c_str()
-
-inline std::string log_var_to_string_impl(bool var)
-{
-    return var ? "true" : "false";
-}
-
-inline std::string log_var_to_string_impl(std::string var)
-{
-    return var;
-}
-
-inline std::string log_var_to_string_impl(const std::vector<int> & var)
-{
-    std::stringstream buf;
-    buf << "[ ";
-    bool first = true;
-    for (auto e : var)
-    {
-        if (first)
-        {
-            first = false;
-        }
-        else
-        {
-            buf << ", ";
-        }
-        buf << std::to_string(e);
-    }
-    buf << " ]";
-
-    return buf.str();
-}
-
-template <typename C, typename T>
-inline std::string LOG_TOKENS_TOSTR_PRETTY(const C & ctx, const T & tokens)
-{
-    std::stringstream buf;
-    buf << "[ ";
-
-    bool first = true;
-    for (const auto & token : tokens)
-    {
-        if (!first) {
-            buf << ", ";
-        } else {
-            first = false;
-        }
-
-        auto detokenized = llama_token_to_piece(ctx, token);
-
-        detokenized.erase(
-            std::remove_if(
-                detokenized.begin(),
-                detokenized.end(),
-                [](const unsigned char c) { return !std::isprint(c); }),
-            detokenized.end());
-
-        buf
-            << "'" << detokenized << "'"
-            << ":" << std::to_string(token);
-    }
-    buf << " ]";
-
-    return buf.str();
-}
-
-template <typename C, typename B>
-inline std::string LOG_BATCH_TOSTR_PRETTY(const C & ctx, const B & batch)
-{
-    std::stringstream buf;
-    buf << "[ ";
-
-    bool first = true;
-    for (int i = 0; i < batch.n_tokens; ++i)
-    {
-        if (!first) {
-            buf << ", ";
-        } else {
-            first = false;
-        }
-
-        auto detokenized = llama_token_to_piece(ctx, batch.token[i]);
-
-        detokenized.erase(
-            std::remove_if(
-                detokenized.begin(),
-                detokenized.end(),
-                [](const unsigned char c) { return !std::isprint(c); }),
-            detokenized.end());
-
-        buf
-            << "\n" << std::to_string(i)
-            << ":token '" << detokenized << "'"
-            << ":pos " << std::to_string(batch.pos[i])
-            << ":n_seq_id  " << std::to_string(batch.n_seq_id[i])
-            << ":seq_id " << std::to_string(batch.seq_id[i][0])
-            << ":logits " << std::to_string(batch.logits[i]);
-    }
-    buf << " ]";
-
-    return buf.str();
-}
-
-#ifdef LOG_DISABLE_LOGS
-
-#undef LOG
-#define LOG(...) // dummy stub
-#undef LOGLN
-#define LOGLN(...) // dummy stub
-
-#undef LOG_TEE
-#define LOG_TEE(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf
-
-#undef LOG_TEELN
-#define LOG_TEELN(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf
-
-#undef LOG_DISABLE
-#define LOG_DISABLE() // dummy stub
-
-#undef LOG_ENABLE
-#define LOG_ENABLE() // dummy stub
-
-#undef LOG_ENABLE
-#define LOG_ENABLE() // dummy stub
-
-#undef LOG_SET_TARGET
-#define LOG_SET_TARGET(...) // dummy stub
-
-#undef LOG_DUMP_CMDLINE
-#define LOG_DUMP_CMDLINE(...) // dummy stub
-
-#endif // LOG_DISABLE_LOGS
+#define LOG_INFV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_INFO,  verbosity, __VA_ARGS__)
+#define LOG_WRNV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_WARN,  verbosity, __VA_ARGS__)
+#define LOG_ERRV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, verbosity, __VA_ARGS__)
+#define LOG_DBGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, verbosity, __VA_ARGS__)
+#define LOG_CNTV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_CONT,  verbosity, __VA_ARGS__)
--- a/common/ngram-cache.cpp
+++ b/common/ngram-cache.cpp
@@ -2,10 +2,13 @@
 #include "common.h"
 #include "log.h"

+#include <cinttypes>
 #include <cstdint>
+#include <cstdio>
 #include <fstream>
+#include <thread>

-void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
+void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
                              std::vector<llama_token> & inp, int nnew, bool print_progress) {
    const int64_t t_start_ms = ggml_time_ms();
    const int64_t inp_size = inp.size();
@@ -17,16 +20,16 @@ void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, in
        const int64_t i_start = std::max(inp_size - nnew, ngram_size);
        for (int64_t i = i_start; i < inp_size; ++i) {
            const int64_t ngram_start = i - ngram_size;
-            llama_ngram ngram(&inp[ngram_start], ngram_size);
+            common_ngram ngram(&inp[ngram_start], ngram_size);
            const llama_token token = inp[i];

-            llama_ngram_cache::iterator part_it = ngram_cache.find(ngram);
+            common_ngram_cache::iterator part_it = ngram_cache.find(ngram);
            if (part_it == ngram_cache.end()) {
-                llama_ngram_cache_part part;
+                common_ngram_cache_part part;
                part.emplace(token, 1);
                ngram_cache.emplace(ngram, part);
            } else {
-                llama_ngram_cache_part::iterator token_count_it = part_it->second.find(token);
+                common_ngram_cache_part::iterator token_count_it = part_it->second.find(token);
                if (token_count_it == part_it->second.end()) {
                    part_it->second.emplace(token, 1);
                } else {
@@ -59,12 +62,12 @@ constexpr int draft_min_sample_size_strict[LLAMA_NGRAM_MAX] = { 4,  3,  2,  2};
 constexpr int     draft_min_percent_strict[LLAMA_NGRAM_MAX] = {75, 66, 66, 66};

 // Helper function that tries to draft a token from only the static ngram cache:
-static llama_token try_draft(llama_ngram_cache & nc_static, const llama_ngram ngram_static) {
-    llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
+static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram ngram_static) {
+    common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
    if (part_static_it == nc_static.end()) {
        return -1;
    }
-    const llama_ngram_cache_part part_static = part_static_it->second;
+    const common_ngram_cache_part part_static = part_static_it->second;

    int max_count_static  = 0;
    int sum_count_static  = 0;
@@ -92,19 +95,19 @@ static llama_token try_draft(llama_ngram_cache & nc_static, const llama_ngram ng

 // Try to draft a token from primary cache (context/dynamic), validate with static cache:
 static llama_token try_draft(
-    llama_ngram_cache & nc_primary, const std::vector<llama_ngram> & ngrams_primary, llama_ngram_cache_part & part_static,
+    common_ngram_cache & nc_primary, const std::vector<common_ngram> & ngrams_primary, common_ngram_cache_part & part_static,
    const int * min_sample_size, const int * min_percent) {

    llama_token drafted_token = -1;

    for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == -1; --i) {
-        const llama_ngram ngram_primary = ngrams_primary[i];
+        const common_ngram ngram_primary = ngrams_primary[i];

-        llama_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary);
+        common_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary);
        if (part_primary_it == nc_primary.end()) {
            continue;
        }
-        const llama_ngram_cache_part part_primary = part_primary_it->second;
+        const common_ngram_cache_part part_primary = part_primary_it->second;

        int max_count_primary = 0;
        int max_count_static  = 0;
@@ -114,7 +117,7 @@ static llama_token try_draft(
        for (std::pair<llama_token, int> token_count_primary : part_primary) {
            const llama_token token = token_count_primary.first;

-            llama_ngram_cache_part::iterator token_count_static_it = part_static.find(token);
+            common_ngram_cache_part::iterator token_count_static_it = part_static.find(token);

            const int32_t count_primary = token_count_primary.second;
            const int32_t count_static  = token_count_static_it != part_static.end() ? 100*token_count_static_it->second : 1;
@@ -139,9 +142,9 @@ static llama_token try_draft(
    return drafted_token;
 }

-void llama_ngram_cache_draft(
+void common_ngram_cache_draft(
    std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
-    llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static
+    common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static
 ) {
    GGML_ASSERT(draft.size() == 1);
    const int inp_size = inp.size();
@@ -154,21 +157,21 @@ void llama_ngram_cache_draft(
        llama_token drafted_token = -1;

        const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1;
-        llama_ngram ngram_static;
+        common_ngram ngram_static;
        for (int j = ngram_start_static; j < ngram_start_static + LLAMA_NGRAM_STATIC; ++j) {
            ngram_static.tokens[j-ngram_start_static] = get_token(inp, draft, j);
        }
-        llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
-        llama_ngram_cache_part part_static;
+        common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
+        common_ngram_cache_part part_static;
        if (part_static_it != nc_static.end()) {
            part_static = part_static_it->second;
        }

        // cd = context + dynamic
-        std::vector<llama_ngram> ngrams_cd;
+        std::vector<common_ngram> ngrams_cd;
        for (int ngram_size_cd = ngram_min; ngram_size_cd <= ngram_max; ++ngram_size_cd) {
            const int ngram_start_cd = inp_size-ngram_size_cd + draft.size()-1;
-            llama_ngram ngram_cd;
+            common_ngram ngram_cd;
            for (int j = ngram_start_cd; j < ngram_start_cd + ngram_size_cd; ++j) {
                ngram_cd.tokens[j-ngram_start_cd] = get_token(inp, draft, j);
            }
@@ -193,16 +196,16 @@ void llama_ngram_cache_draft(
    }
 }

-void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename) {
+void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & filename) {
    std::ofstream file_out(filename, std::ios::binary);
-    for (std::pair<llama_ngram, llama_ngram_cache_part> item : ngram_cache) {
-        const llama_ngram      ngram        = item.first;
-        llama_ngram_cache_part token_counts = item.second;
+    for (std::pair<common_ngram, common_ngram_cache_part> item : ngram_cache) {
+        const common_ngram      ngram        = item.first;
+        common_ngram_cache_part token_counts = item.second;
        GGML_ASSERT(!token_counts.empty());
        const int32_t ntokens = token_counts.size();
        GGML_ASSERT(ntokens > 0);

-        file_out.write(reinterpret_cast<const char *>(&ngram),   sizeof(llama_ngram));
+        file_out.write(reinterpret_cast<const char *>(&ngram),   sizeof(common_ngram));
        file_out.write(reinterpret_cast<const char *>(&ntokens), sizeof(int32_t));
        for (std::pair<llama_token, int32_t> item2 : token_counts) {
            const llama_token token = item2.first;
@@ -216,14 +219,14 @@ void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filen

 }

-llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
+common_ngram_cache common_ngram_cache_load(std::string & filename) {
    std::ifstream hashmap_file(filename, std::ios::binary);
    if (!hashmap_file) {
        throw std::ifstream::failure("Unable to open file " + filename);
    }
-    llama_ngram_cache ngram_cache;
+    common_ngram_cache ngram_cache;

-    llama_ngram ngram;
+    common_ngram ngram;
    int32_t     ntokens;
    llama_token token;
    int32_t     count;
@@ -232,11 +235,11 @@ llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
    char * ntokensc = reinterpret_cast<char*>(&ntokens);
    char * tokenc   = reinterpret_cast<char*>(&token);
    char * countc   = reinterpret_cast<char*>(&count);
-    while(hashmap_file.read(ngramc, sizeof(llama_ngram))) {
+    while(hashmap_file.read(ngramc, sizeof(common_ngram))) {
        GGML_ASSERT(!hashmap_file.eof());
        GGML_ASSERT(hashmap_file.read(ntokensc, sizeof(int32_t)));
        GGML_ASSERT(ntokens > 0);
-        llama_ngram_cache_part token_counts;
+        common_ngram_cache_part token_counts;

        for (int i = 0; i < ntokens; ++i) {
            GGML_ASSERT(!hashmap_file.eof());
@@ -254,12 +257,12 @@ llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
    return ngram_cache;
 }

-void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add) {
-    for (std::pair<llama_ngram, llama_ngram_cache_part> ngram_part : ngram_cache_add) {
-        const llama_ngram      ngram = ngram_part.first;
-        llama_ngram_cache_part  part = ngram_part.second;
+void common_ngram_cache_merge(common_ngram_cache & ngram_cache_target, common_ngram_cache & ngram_cache_add) {
+    for (std::pair<common_ngram, common_ngram_cache_part> ngram_part : ngram_cache_add) {
+        const common_ngram      ngram = ngram_part.first;
+        common_ngram_cache_part  part = ngram_part.second;

-        llama_ngram_cache::iterator part_merged_it = ngram_cache_target.find(ngram);
+        common_ngram_cache::iterator part_merged_it = ngram_cache_target.find(ngram);
        if (part_merged_it == ngram_cache_target.end()) {
            ngram_cache_target.emplace(ngram, part);
            continue;
@@ -270,7 +273,7 @@ void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram
            const int32_t     count = token_count.second;
            GGML_ASSERT(count > 0);

-            llama_ngram_cache_part::iterator token_count_merged_it = part_merged_it->second.find(token);
+            common_ngram_cache_part::iterator token_count_merged_it = part_merged_it->second.find(token);
            if (token_count_merged_it == part_merged_it->second.end()) {
                part_merged_it->second.emplace(token, count);
                continue;
--- a/common/ngram-cache.h
+++ b/common/ngram-cache.h
@@ -12,22 +12,22 @@

 // Data structures to map n-grams to empirical token probabilities:

-struct llama_ngram {
+struct common_ngram {
    llama_token tokens[LLAMA_NGRAM_MAX];

-    llama_ngram() {
+    common_ngram() {
        for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
            tokens[i] = -1;
        }
    }

-    llama_ngram(const llama_token * input, const int ngram_size) {
+    common_ngram(const llama_token * input, const int ngram_size) {
        for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
            tokens[i] = i < ngram_size ? input[i] : -1;
        }
    }

-    bool operator==(const llama_ngram & other) const {
+    bool operator==(const common_ngram & other) const {
        for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
            if (tokens[i] != other.tokens[i]) {
                return false;
@@ -37,28 +37,28 @@ struct llama_ngram {
    }
 };

-struct llama_token_hash_function {
+struct common_token_hash_function {
    size_t operator()(const llama_token token) const {
        // see https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
        return token * 11400714819323198485llu;
    }
 };

-struct llama_ngram_hash_function {
-    size_t operator()(const llama_ngram & ngram) const {
-        size_t hash = llama_token_hash_function{}(ngram.tokens[0]);
+struct common_ngram_hash_function {
+    size_t operator()(const common_ngram & ngram) const {
+        size_t hash = common_token_hash_function{}(ngram.tokens[0]);
        for (int i = 1; i < LLAMA_NGRAM_MAX; ++i) {
-            hash ^= llama_token_hash_function{}(ngram.tokens[i]);
+            hash ^= common_token_hash_function{}(ngram.tokens[i]);
        }
        return hash;
    }
 };

 // token -> number of times token has been seen
-typedef std::unordered_map<llama_token, int32_t> llama_ngram_cache_part;
+typedef std::unordered_map<llama_token, int32_t> common_ngram_cache_part;

 // n-gram -> empirical distribution of following tokens
-typedef std::unordered_map<llama_ngram, llama_ngram_cache_part, llama_ngram_hash_function> llama_ngram_cache;
+typedef std::unordered_map<common_ngram, common_ngram_cache_part, common_ngram_hash_function> common_ngram_cache;


 // Update an ngram cache with tokens.
@@ -70,8 +70,8 @@ typedef std::unordered_map<llama_ngram, llama_ngram_cache_part, llama_ngram_hash
 //
 // In order to get correct results inp_data can ONLY BE APPENDED TO.
 // Changes in the middle need a complete rebuild.
-void llama_ngram_cache_update(
-    llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector<llama_token> & inp_data, int nnew, bool print_progress);
+void common_ngram_cache_update(
+    common_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector<llama_token> & inp_data, int nnew, bool print_progress);

 // Try to draft tokens from ngram caches.
 // inp:                the tokens generated so far.
@@ -81,21 +81,21 @@ void llama_ngram_cache_update(
 // nc_context:         ngram cache based on current context.
 // nc_dynamic:         ngram cache based on previous user generations.
 // nc_static:          ngram cache generated from a large text corpus, used for validation.
-void llama_ngram_cache_draft(
+void common_ngram_cache_draft(
    std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
-    llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static);
+    common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static);

 // Save an ngram cache to a file.
 // ngram_cache: the ngram cache to save.
 // filename:    the path under which to save the ngram cache.
-void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename);
+void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & filename);

-// Load an ngram cache saved with llama_ngram_cache_save.
+// Load an ngram cache saved with common_ngram_cache_save.
 // filename: the path from which to load the ngram cache.
 // returns:  an ngram cache containing the information saved to filename.
-llama_ngram_cache llama_ngram_cache_load(std::string & filename);
+common_ngram_cache common_ngram_cache_load(std::string & filename);

 // Merge two ngram caches.
 // ngram_cache_target: the ngram cache to which to add the information from ngram_cache_add.
 // ngram_cache_add:    the ngram cache to add to ngram_cache_target.
-void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add);
+void common_ngram_cache_merge(common_ngram_cache & ngram_cache_target, common_ngram_cache & ngram_cache_add);
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -1,460 +1,466 @@
-#define LLAMA_API_INTERNAL
 #include "sampling.h"
-#include <random>

-struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) {
-    struct llama_sampling_context * result = new llama_sampling_context();
+#include "common.h"

-    result->params  = params;
-    result->grammar = nullptr;
+#include <cmath>
+#include <unordered_map>

-    // if there is a grammar, parse it
-    if (!params.grammar.empty()) {
-        result->parsed_grammar = grammar_parser::parse(params.grammar.c_str());
+// the ring buffer works similarly to std::deque, but with a fixed capacity
+// TODO: deduplicate with llama-impl.h
+template<typename T>
+struct ring_buffer {
+    ring_buffer(size_t cap) : capacity(cap), data(cap) {}

-        // will be empty (default) if there are parse errors
-        if (result->parsed_grammar.rules.empty()) {
-            fprintf(stderr, "%s: failed to parse grammar\n", __func__);
-            delete result;
-            return nullptr;
+    T & front() {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        return data[first];
+    }
+
+    const T & front() const {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        return data[first];
+    }
+
+    T & back() {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        return data[pos];
+    }
+
+    const T & back() const {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        return data[pos];
+    }
+
+    void push_back(const T & value) {
+        if (sz == capacity) {
+            // advance the start when buffer is full
+            first = (first + 1) % capacity;
+        } else {
+            sz++;
+        }
+        data[pos] = value;
+        pos = (pos + 1) % capacity;
+    }
+
+    T pop_front() {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        T value = data[first];
+        first = (first + 1) % capacity;
+        sz--;
+        return value;
+    }
+
+    const T & rat(size_t i) const {
+        if (i >= sz) {
+            throw std::runtime_error("ring buffer: index out of bounds");
+        }
+        return data[(first + sz - i - 1) % capacity];
+    }
+
+    std::vector<T> to_vector() const {
+        std::vector<T> result;
+        result.reserve(sz);
+        for (size_t i = 0; i < sz; i++) {
+            result.push_back(data[(first + i) % capacity]);
+        }
+        return result;
+    }
+
+    void clear() {
+        // here only reset the status of the buffer
+        sz = 0;
+        first = 0;
+        pos = 0;
+    }
+
+    bool empty() const {
+        return sz == 0;
+    }
+
+    size_t size() const {
+        return sz;
+    }
+
+    size_t capacity = 0;
+    size_t sz = 0;
+    size_t first = 0;
+    size_t pos = 0;
+    std::vector<T> data;
+};
+
+struct common_sampler {
+    common_sampler_params params;
+
+    struct llama_sampler * grmr;
+    struct llama_sampler * chain;
+
+    ring_buffer<llama_token> prev;
+
+    std::vector<llama_token_data> cur;
+
+    llama_token_data_array cur_p;
+
+    void set_logits(struct llama_context * ctx, int idx) {
+        const auto * logits = llama_get_logits_ith(ctx, idx);
+
+        const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+
+        cur.resize(n_vocab);
+
+        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+            cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
        }

-        // Ensure that there is a "root" node.
-        if (result->parsed_grammar.symbol_ids.find("root") == result->parsed_grammar.symbol_ids.end()) {
-            fprintf(stderr, "%s: grammar does not contain a 'root' symbol\n", __func__);
-            delete result;
-            return nullptr;
-        }
-
-        std::vector<const llama_grammar_element *> grammar_rules(result->parsed_grammar.c_rules());
-
-        struct llama_grammar * grammar = llama_grammar_init(
-                grammar_rules.data(),
-                grammar_rules.size(), result->parsed_grammar.symbol_ids.at("root"));
-        if (grammar == nullptr) {
-            throw std::runtime_error("Failed to initialize llama_grammar");
-        }
-        result->grammar = grammar;
+        cur_p = { cur.data(), cur.size(), -1, false };
    }
+};

-    result->prev.resize(params.n_prev);
-
-    result->n_valid = 0;
-
-    llama_sampling_set_rng_seed(result, params.seed);
-
-    return result;
-}
-
-void llama_sampling_free(struct llama_sampling_context * ctx) {
-    if (ctx->grammar != NULL) {
-        llama_grammar_free(ctx->grammar);
-    }
-
-    delete ctx;
-}
-
-void llama_sampling_reset(llama_sampling_context * ctx) {
-    if (ctx->grammar != NULL) {
-        llama_grammar_free(ctx->grammar);
-        ctx->grammar = NULL;
-    }
-
-    if (!ctx->parsed_grammar.rules.empty()) {
-        std::vector<const llama_grammar_element *> grammar_rules(ctx->parsed_grammar.c_rules());
-
-        struct llama_grammar * grammar = llama_grammar_init(
-                grammar_rules.data(),
-                grammar_rules.size(), ctx->parsed_grammar.symbol_ids.at("root"));
-        if (grammar == nullptr) {
-            throw std::runtime_error("Failed to initialize llama_grammar");
-        }
-        ctx->grammar = grammar;
-    }
-
-    std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
-    ctx->cur.clear();
-    ctx->n_valid = 0;
-}
-
-void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) {
-    if (seed == LLAMA_DEFAULT_SEED) {
-        seed = std::random_device{}();
-    }
-    ctx->rng.seed(seed);
-}
-
-void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst) {
-    if (dst->grammar) {
-        llama_grammar_free(dst->grammar);
-        dst->grammar = nullptr;
-    }
-
-    if (src->grammar) {
-        dst->grammar = llama_grammar_copy(src->grammar);
-    }
-
-    dst->prev = src->prev;
-}
-
-llama_token llama_sampling_last(llama_sampling_context * ctx) {
-    return ctx->prev.back();
-}
-
-std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n) {
-    const int size = ctx_sampling->prev.size();
-
-    n = std::min(n, size);
-
-    std::string result;
-
-    for (int i = size - n; i < size; i++) {
-        result += llama_token_to_piece(ctx_main, ctx_sampling->prev[i]);
-    }
-
-    return result;
-}
-
-std::string llama_sampling_print(const llama_sampling_params & params) {
+std::string common_sampler_params::print() const {
    char result[1024];

    snprintf(result, sizeof(result),
            "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
-            "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
+            "\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n"
+            "\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, temp = %.3f\n"
            "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
-            params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present,
-            params.top_k, params.tfs_z, params.top_p, params.min_p, params.typical_p, params.temp,
-            params.mirostat, params.mirostat_eta, params.mirostat_tau);
+            penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
+            dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n,
+            top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, temp,
+            mirostat, mirostat_eta, mirostat_tau);

    return std::string(result);
 }

-std::string llama_sampling_order_print(const llama_sampling_params & params) {
-    std::string result = "CFG -> Penalties ";
+struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params) {
+    llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
+
+    lparams.no_perf = params.no_perf;
+
+    auto * result = new common_sampler {
+        /* .params = */ params,
+        /* .grmr   = */ llama_sampler_init_grammar(model, params.grammar.c_str(), "root"),
+        /* .chain  = */ llama_sampler_chain_init(lparams),
+        /* .prev   = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
+        /* .cur    = */ {},
+        /* .cur_p  = */ {},
+    };
+
+    llama_sampler_chain_add(result->chain,
+            llama_sampler_init_logit_bias(
+                llama_n_vocab(model),
+                params.logit_bias.size(),
+                params.logit_bias.data()));
+
+    llama_sampler_chain_add(result->chain,
+            llama_sampler_init_penalties(
+                llama_n_vocab  (model),
+                llama_token_eos(model),
+                llama_token_nl (model),
+                params.penalty_last_n,
+                params.penalty_repeat,
+                params.penalty_freq,
+                params.penalty_present,
+                params.penalize_nl,
+                params.ignore_eos));
+
    if (params.mirostat == 0) {
-        for (auto sampler_type : params.samplers_sequence) {
-            const auto sampler_type_name = llama_sampling_type_to_str(sampler_type);
-            if (!sampler_type_name.empty()) {
-                result += "-> " + sampler_type_name + " ";
+        for (const auto & cnstr : params.samplers) {
+            switch (cnstr) {
+                    case COMMON_SAMPLER_TYPE_DRY:
+                    {
+                        std::vector<const char*> c_breakers;
+                        c_breakers.reserve(params.dry_sequence_breakers.size());
+                        for (const auto& str : params.dry_sequence_breakers) {
+                            c_breakers.push_back(str.c_str());
+                        }
+
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_dry      (model, params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
+                    }
+                        break;
+                case COMMON_SAMPLER_TYPE_TOP_K:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_k    (params.top_k));
+                    break;
+                case COMMON_SAMPLER_TYPE_TOP_P:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_p    (params.top_p, params.min_keep));
+                    break;
+                case COMMON_SAMPLER_TYPE_MIN_P:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_min_p    (params.min_p, params.min_keep));
+                    break;
+                case COMMON_SAMPLER_TYPE_XTC:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_xtc      (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
+                    break;
+                case COMMON_SAMPLER_TYPE_TYPICAL_P:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_typical  (params.typ_p, params.min_keep));
+                    break;
+                case COMMON_SAMPLER_TYPE_TEMPERATURE:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
+                    break;
+                case COMMON_SAMPLER_TYPE_INFILL:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_infill   (model));
+                    break;
+                default:
+                    GGML_ASSERT(false && "unknown sampler type");
            }
        }
+        llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
+    } else if (params.mirostat == 1) {
+        llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
+        llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
+    } else if (params.mirostat == 2) {
+        llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
+        llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
    } else {
-        result += "-> mirostat ";
+        GGML_ASSERT(false && "unknown mirostat version");
    }

    return result;
 }

-std::string llama_sampling_type_to_str(llama_sampler_type sampler_type) {
-    switch (sampler_type) {
-        case llama_sampler_type::TOP_K:       return "top_k";
-        case llama_sampler_type::TFS_Z:       return "tfs_z";
-        case llama_sampler_type::TYPICAL_P:   return "typical_p";
-        case llama_sampler_type::TOP_P:       return "top_p";
-        case llama_sampler_type::MIN_P:       return "min_p";
-        case llama_sampler_type::TEMPERATURE: return "temperature";
+void common_sampler_free(struct common_sampler * gsmpl) {
+    if (gsmpl) {
+        llama_sampler_free(gsmpl->grmr);
+
+        llama_sampler_free(gsmpl->chain);
+
+        delete gsmpl;
+    }
+}
+
+void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
+    if (accept_grammar) {
+        llama_sampler_accept(gsmpl->grmr, token);
+    }
+
+    llama_sampler_accept(gsmpl->chain, token);
+
+    gsmpl->prev.push_back(token);
+}
+
+void common_sampler_reset(struct common_sampler * gsmpl) {
+    llama_sampler_reset(gsmpl->grmr);
+
+    llama_sampler_reset(gsmpl->chain);
+}
+
+struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
+    return new common_sampler {
+        /* .params = */ gsmpl->params,
+        /* .grmr   = */ llama_sampler_clone(gsmpl->grmr),
+        /* .chain  = */ llama_sampler_clone(gsmpl->chain),
+        /* .prev   = */ gsmpl->prev,
+        /* .cur    = */ gsmpl->cur,
+        /* .cur_p  = */ gsmpl->cur_p,
+    };
+}
+
+void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl) {
+    // TODO: measure grammar performance
+
+    if (gsmpl) {
+        llama_perf_sampler_print(gsmpl->chain);
+    }
+    if (ctx) {
+        llama_perf_context_print(ctx);
+    }
+}
+
+llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
+    gsmpl->set_logits(ctx, idx);
+
+    auto & grmr  = gsmpl->grmr;
+    auto & chain = gsmpl->chain;
+    auto & cur_p = gsmpl->cur_p; // initialized by set_logits
+
+    if (grammar_first) {
+        llama_sampler_apply(grmr, &cur_p);
+    }
+
+    llama_sampler_apply(chain, &cur_p);
+
+    GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");
+
+    const llama_token id = cur_p.data[cur_p.selected].id;
+
+    if (grammar_first) {
+        return id;
+    }
+
+    // check if it the sampled token fits the grammar
+    {
+        llama_token_data       single_token_data       = { id, 1.0f, 0.0f };
+        llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };
+
+        llama_sampler_apply(grmr, &single_token_data_array);
+
+        const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
+        if (is_valid) {
+            return id;
+        }
+    }
+
+    // resampling:
+    // if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
+    gsmpl->set_logits(ctx, idx);
+
+    llama_sampler_apply(grmr,  &cur_p);
+    llama_sampler_apply(chain, &cur_p);
+
+    GGML_ASSERT(cur_p.selected != -1 && "no selected token during re-sampling - check your sampling configuration");
+
+    return cur_p.data[cur_p.selected].id;
+}
+
+uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
+    return llama_sampler_get_seed(gsmpl->chain);
+}
+
+// helpers
+
+llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl) {
+    return &gsmpl->cur_p;
+}
+
+llama_token common_sampler_last(const struct common_sampler * gsmpl) {
+    return gsmpl->prev.rat(0);
+}
+
+std::string common_sampler_print(const struct common_sampler * gsmpl) {
+    std::string result = "logits ";
+
+    for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
+        const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
+        result += std::string("-> ") + llama_sampler_name(smpl) + " ";
+    }
+
+    return result;
+}
+
+std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx_main, int n) {
+    n = std::min(n, (int) gsmpl->prev.size());
+
+    if (n <= 0) {
+        return "";
+    }
+
+    std::string result;
+    result.reserve(8*n); // 8 is the average length of a token [citation needed], TODO: compute this from the vocab
+
+    for (int i = n - 1; i >= 0; i--) {
+        const llama_token id = gsmpl->prev.rat(i);
+
+        GGML_ASSERT(id != LLAMA_TOKEN_NULL && "null token in the sampling history - should not happen");
+
+        result += common_token_to_piece(ctx_main, id);
+    }
+
+    return result;
+}
+
+char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
+    switch (cnstr) {
+        case COMMON_SAMPLER_TYPE_DRY:         return 'd';
+        case COMMON_SAMPLER_TYPE_TOP_K:       return 'k';
+        case COMMON_SAMPLER_TYPE_TYPICAL_P:   return 'y';
+        case COMMON_SAMPLER_TYPE_TOP_P:       return 'p';
+        case COMMON_SAMPLER_TYPE_MIN_P:       return 'm';
+        case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
+        case COMMON_SAMPLER_TYPE_XTC:         return 'x';
+        case COMMON_SAMPLER_TYPE_INFILL:      return 'i';
+        default : return '?';
+    }
+}
+
+std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
+    switch (cnstr) {
+        case COMMON_SAMPLER_TYPE_DRY:         return "dry";
+        case COMMON_SAMPLER_TYPE_TOP_K:       return "top_k";
+        case COMMON_SAMPLER_TYPE_TYPICAL_P:   return "typ_p";
+        case COMMON_SAMPLER_TYPE_TOP_P:       return "top_p";
+        case COMMON_SAMPLER_TYPE_MIN_P:       return "min_p";
+        case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
+        case COMMON_SAMPLER_TYPE_XTC:         return "xtc";
+        case COMMON_SAMPLER_TYPE_INFILL:      return "infill";
        default : return "";
    }
 }

-std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
-    std::unordered_map<std::string, llama_sampler_type> sampler_canonical_name_map {
-        {"top_k",       llama_sampler_type::TOP_K},
-        {"top_p",       llama_sampler_type::TOP_P},
-        {"typical_p",   llama_sampler_type::TYPICAL_P},
-        {"min_p",       llama_sampler_type::MIN_P},
-        {"tfs_z",       llama_sampler_type::TFS_Z},
-        {"temperature", llama_sampler_type::TEMPERATURE}
+std::vector<common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
+    std::unordered_map<std::string, common_sampler_type> sampler_canonical_name_map {
+        { "dry",         COMMON_SAMPLER_TYPE_DRY },
+        { "top_k",       COMMON_SAMPLER_TYPE_TOP_K },
+        { "top_p",       COMMON_SAMPLER_TYPE_TOP_P },
+        { "typ_p",       COMMON_SAMPLER_TYPE_TYPICAL_P },
+        { "min_p",       COMMON_SAMPLER_TYPE_MIN_P },
+        { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
+        { "xtc",         COMMON_SAMPLER_TYPE_XTC },
+        { "infill",      COMMON_SAMPLER_TYPE_INFILL },
    };

    // since samplers names are written multiple ways
    // make it ready for both system names and input names
-    std::unordered_map<std::string, llama_sampler_type> sampler_alt_name_map {
-        {"top-k",       llama_sampler_type::TOP_K},
-        {"top-p",       llama_sampler_type::TOP_P},
-        {"nucleus",     llama_sampler_type::TOP_P},
-        {"typical-p",   llama_sampler_type::TYPICAL_P},
-        {"typical",     llama_sampler_type::TYPICAL_P},
-        {"min-p",       llama_sampler_type::MIN_P},
-        {"tfs-z",       llama_sampler_type::TFS_Z},
-        {"tfs",         llama_sampler_type::TFS_Z},
-        {"temp",        llama_sampler_type::TEMPERATURE}
+    std::unordered_map<std::string, common_sampler_type> sampler_alt_name_map {
+        { "top-k",       COMMON_SAMPLER_TYPE_TOP_K },
+        { "top-p",       COMMON_SAMPLER_TYPE_TOP_P },
+        { "nucleus",     COMMON_SAMPLER_TYPE_TOP_P },
+        { "typical-p",   COMMON_SAMPLER_TYPE_TYPICAL_P },
+        { "typical",     COMMON_SAMPLER_TYPE_TYPICAL_P },
+        { "typ-p",       COMMON_SAMPLER_TYPE_TYPICAL_P },
+        { "typ",         COMMON_SAMPLER_TYPE_TYPICAL_P },
+        { "min-p",       COMMON_SAMPLER_TYPE_MIN_P },
+        { "temp",        COMMON_SAMPLER_TYPE_TEMPERATURE },
    };

-    std::vector<llama_sampler_type> sampler_types;
-    sampler_types.reserve(names.size());
-    for (const auto & name : names)
-    {
-        auto sampler_item = sampler_canonical_name_map.find(name);
-        if (sampler_item != sampler_canonical_name_map.end())
-        {
-            sampler_types.push_back(sampler_item->second);
-        }
-        else
-        {
-            if (allow_alt_names)
-            {
-                sampler_item = sampler_alt_name_map.find(name);
-                if (sampler_item != sampler_alt_name_map.end())
-                {
-                    sampler_types.push_back(sampler_item->second);
-                }
-            }
-        }
-    }
-    return sampler_types;
-}
+    std::vector<common_sampler_type> samplers;
+    samplers.reserve(names.size());

-std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string) {
-    std::unordered_map<char, llama_sampler_type> sampler_name_map {
-        {'k', llama_sampler_type::TOP_K},
-        {'p', llama_sampler_type::TOP_P},
-        {'y', llama_sampler_type::TYPICAL_P},
-        {'m', llama_sampler_type::MIN_P},
-        {'f', llama_sampler_type::TFS_Z},
-        {'t', llama_sampler_type::TEMPERATURE}
-    };
-
-    std::vector<llama_sampler_type> sampler_types;
-    sampler_types.reserve(names_string.size());
-    for (const auto & c : names_string) {
-        const auto sampler_item = sampler_name_map.find(c);
-        if (sampler_item != sampler_name_map.end()) {
-            sampler_types.push_back(sampler_item->second);
-        }
-    }
-    return sampler_types;
-}
-
-// no reasons to expose this function in header
-static void sampler_queue(
-                   struct llama_context * ctx_main,
-            const llama_sampling_params & params,
-                 llama_token_data_array & cur_p,
-                                 size_t   min_keep) {
-    const float         temp              = params.temp;
-    const float         dynatemp_range    = params.dynatemp_range;
-    const float         dynatemp_exponent = params.dynatemp_exponent;
-    const int32_t       top_k             = params.top_k;
-    const float         top_p             = params.top_p;
-    const float         min_p             = params.min_p;
-    const float         tfs_z             = params.tfs_z;
-    const float         typical_p         = params.typical_p;
-    const std::vector<llama_sampler_type> & samplers_sequence = params.samplers_sequence;
-
-    for (auto sampler_type : samplers_sequence) {
-        switch (sampler_type) {
-            case llama_sampler_type::TOP_K    : llama_sample_top_k    (ctx_main, &cur_p, top_k,     min_keep); break;
-            case llama_sampler_type::TFS_Z    : llama_sample_tail_free(ctx_main, &cur_p, tfs_z,     min_keep); break;
-            case llama_sampler_type::TYPICAL_P: llama_sample_typical  (ctx_main, &cur_p, typical_p, min_keep); break;
-            case llama_sampler_type::TOP_P    : llama_sample_top_p    (ctx_main, &cur_p, top_p,     min_keep); break;
-            case llama_sampler_type::MIN_P    : llama_sample_min_p    (ctx_main, &cur_p, min_p,     min_keep); break;
-            case llama_sampler_type::TEMPERATURE:
-                if (dynatemp_range > 0) {
-                    float dynatemp_min = std::max(0.0f, temp - dynatemp_range);
-                    float dynatemp_max = std::max(0.0f, temp + dynatemp_range);
-                    llama_sample_entropy(ctx_main, &cur_p, dynatemp_min, dynatemp_max, dynatemp_exponent);
-                } else {
-                    llama_sample_temp(ctx_main, &cur_p, temp);
-                }
-                break;
-            default : break;
-        }
-    }
-}
-
-static llama_token llama_sampling_sample_impl(
-                  struct llama_sampling_context * ctx_sampling,
-                  struct llama_context * ctx_main,
-                  struct llama_context * ctx_cfg,
-                  const int idx,
-                  bool is_resampling) {
-    const llama_sampling_params & params = ctx_sampling->params;
-
-    const float   temp            = params.temp;
-    const int     mirostat        = params.mirostat;
-    const float   mirostat_tau    = params.mirostat_tau;
-    const float   mirostat_eta    = params.mirostat_eta;
-
-    std::vector<float> original_logits;
-    auto cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx, /* apply_grammar= */ is_resampling, &original_logits);
-    if (ctx_sampling->grammar != NULL && !is_resampling) {
-        GGML_ASSERT(!original_logits.empty());
-    }
-    llama_token id = 0;
-
-    if (temp < 0.0) {
-        // greedy sampling, with probs
-        llama_sample_softmax(ctx_main, &cur_p);
-        id = cur_p.data[0].id;
-    } else if (temp == 0.0) {
-        // greedy sampling, no probs
-        id = llama_sample_token_greedy(ctx_main, &cur_p);
-    } else {
-        if (mirostat == 1) {
-            const int mirostat_m = 100;
-            llama_sample_temp(ctx_main, &cur_p, temp);
-            id = llama_sample_token_mirostat(ctx_main, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &ctx_sampling->mirostat_mu);
-        } else if (mirostat == 2) {
-            llama_sample_temp(ctx_main, &cur_p, temp);
-            id = llama_sample_token_mirostat_v2(ctx_main, &cur_p, mirostat_tau, mirostat_eta, &ctx_sampling->mirostat_mu);
+    for (const auto & name : names) {
+        auto sampler = sampler_canonical_name_map.find(name);
+        if (sampler != sampler_canonical_name_map.end()) {
+            samplers.push_back(sampler->second);
        } else {
-            // temperature sampling
-            size_t min_keep = std::max(1, params.min_keep);
-
-            sampler_queue(ctx_main, params, cur_p, min_keep);
-
-            id = llama_sample_token_with_rng(ctx_main, &cur_p, ctx_sampling->rng);
-
-            //{
-            //    const int n_top = 10;
-            //    LOG("top %d candidates:\n", n_top);
-
-            //    for (int i = 0; i < n_top; i++) {
-            //        const llama_token id = cur_p.data[i].id;
-            //        (void)id; // To avoid a warning that id is unused when logging is disabled.
-            //        LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx_main, id).c_str(), cur_p.data[i].p);
-            //    }
-            //}
-
-            //LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx_main, id).c_str());
-        }
-    }
-
-    if (ctx_sampling->grammar != NULL && !is_resampling) {
-        // Get a pointer to the logits
-        float * logits = llama_get_logits_ith(ctx_main, idx);
-
-        // Create an array with a single token data element for the sampled id
-        llama_token_data single_token_data = {id, logits[id], 0.0f};
-        llama_token_data_array single_token_data_array = { &single_token_data, 1, false };
-
-        // Apply grammar constraints to the single token
-        llama_grammar_sample(ctx_sampling->grammar, ctx_main, &single_token_data_array);
-
-        // Check if the token is valid according to the grammar by seeing if its logit has been set to -INFINITY
-        bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
-
-        // If the token is not valid according to the grammar, perform resampling
-        if (!is_valid) {
-            LOG("Resampling because token %d: '%s' does not meet grammar rules\n", id, llama_token_to_piece(ctx_main, id).c_str());
-
-            // Restore logits from the copy
-            std::copy(original_logits.begin(), original_logits.end(), logits);
-
-            return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ true);
-        }
-    }
-
-    ctx_sampling->n_valid = temp == 0.0f ? 0 : cur_p.size;
-
-    return id;
-}
-
-static llama_token_data_array llama_sampling_prepare_impl(
-                  struct llama_sampling_context * ctx_sampling,
-                  struct llama_context * ctx_main,
-                  struct llama_context * ctx_cfg,
-                  const int idx,
-                  bool apply_grammar,
-                  std::vector<float> * original_logits) {
-    const llama_sampling_params & params = ctx_sampling->params;
-
-    const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
-
-    const int32_t penalty_last_n  = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
-    const float   penalty_repeat  = params.penalty_repeat;
-    const float   penalty_freq    = params.penalty_freq;
-    const float   penalty_present = params.penalty_present;
-
-    const bool    penalize_nl     = params.penalize_nl;
-
-    auto & prev = ctx_sampling->prev;
-    auto & cur  = ctx_sampling->cur;
-
-    // Get a pointer to the logits
-    float * logits = llama_get_logits_ith(ctx_main, idx);
-
-    if (ctx_sampling->grammar != NULL && !apply_grammar) {
-        GGML_ASSERT(original_logits != NULL);
-        // Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this.
-        *original_logits = {logits, logits + n_vocab};
-    }
-
-    // apply params.logit_bias map
-    for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
-        logits[it->first] += it->second;
-    }
-
-    if (ctx_cfg) {
-        float * logits_guidance = llama_get_logits_ith(ctx_cfg, idx);
-        llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);
-    }
-
-    cur.resize(n_vocab);
-
-    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-        cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
-    }
-
-    llama_token_data_array cur_p = { cur.data(), cur.size(), false };
-
-    // apply penalties
-    const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev;
-    const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n);
-    if (penalty_tokens_used_size) {
-        const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];
-
-        llama_sample_repetition_penalties(ctx_main, &cur_p,
-                penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size,
-                penalty_tokens_used_size, penalty_repeat, penalty_freq, penalty_present);
-
-        if (!penalize_nl) {
-            for (size_t idx = 0; idx < cur_p.size; idx++) {
-                if (cur_p.data[idx].id == llama_token_nl(llama_get_model(ctx_main))) {
-                    cur_p.data[idx].logit = nl_logit;
-                    break;
+            if (allow_alt_names) {
+                sampler = sampler_alt_name_map.find(name);
+                if (sampler != sampler_alt_name_map.end()) {
+                    samplers.push_back(sampler->second);
                }
            }
        }
    }

-    // apply grammar checks before sampling logic
-    if (apply_grammar && ctx_sampling->grammar != NULL) {
-        llama_grammar_sample(ctx_sampling->grammar, ctx_main, &cur_p);
+    return samplers;
+}
+
+std::vector<common_sampler_type> common_sampler_types_from_chars(const std::string & chars) {
+    std::unordered_map<char, common_sampler_type> sampler_name_map = {
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_DRY),         COMMON_SAMPLER_TYPE_DRY },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_K),       COMMON_SAMPLER_TYPE_TOP_K },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P),   COMMON_SAMPLER_TYPE_TYPICAL_P },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P),       COMMON_SAMPLER_TYPE_TOP_P },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P),       COMMON_SAMPLER_TYPE_MIN_P },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC),         COMMON_SAMPLER_TYPE_XTC },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL),      COMMON_SAMPLER_TYPE_INFILL },
+    };
+
+    std::vector<common_sampler_type> samplers;
+    samplers.reserve(chars.size());
+
+    for (const auto & c : chars) {
+        const auto sampler = sampler_name_map.find(c);
+        if (sampler != sampler_name_map.end()) {
+            samplers.push_back(sampler->second);
+        }
    }

-    return cur_p;
-}
-
-llama_token llama_sampling_sample(
-                  struct llama_sampling_context * ctx_sampling,
-                  struct llama_context * ctx_main,
-                  struct llama_context * ctx_cfg,
-                  const int idx) {
-    // Call the implementation function with is_resampling set to false by default
-    return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ false);
-}
-
-llama_token_data_array llama_sampling_prepare(
-                  struct llama_sampling_context * ctx_sampling,
-                  struct llama_context * ctx_main,
-                  struct llama_context * ctx_cfg,
-                  const int idx,
-                  bool apply_grammar,
-                  std::vector<float> * original_logits) {
-    return llama_sampling_prepare_impl(ctx_sampling,ctx_main, ctx_cfg, idx, apply_grammar, original_logits);
-}
-
-void llama_sampling_accept(
-        struct llama_sampling_context * ctx_sampling,
-        struct llama_context * ctx_main,
-        llama_token id,
-        bool apply_grammar) {
-    ctx_sampling->prev.erase(ctx_sampling->prev.begin());
-    ctx_sampling->prev.push_back(id);
-
-    if (ctx_sampling->grammar != NULL && apply_grammar) {
-        llama_grammar_accept_token(ctx_sampling->grammar, ctx_main, id);
-    }
+    return samplers;
 }
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -2,159 +2,82 @@

 #include "llama.h"

-#include "grammar-parser.h"
-
-#include <random>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-// sampler types
-enum class llama_sampler_type : char {
-    TOP_K       = 'k',
-    TOP_P       = 'p',
-    MIN_P       = 'm',
-    TFS_Z       = 'f',
-    TYPICAL_P   = 'y',
-    TEMPERATURE = 't'
-};
-
-// sampling parameters
-typedef struct llama_sampling_params {
-    int32_t     n_prev                = 64;                 // number of previous tokens to remember
-    int32_t     n_probs               = 0;                  // if greater than 0, output the probabilities of top n_probs tokens.
-    int32_t     min_keep              = 0;                  // 0 = disabled, otherwise samplers should return at least min_keep tokens
-    int32_t     top_k                 = 40;                 // <= 0 to use vocab size
-    float       top_p                 = 0.95f;              // 1.0 = disabled
-    float       min_p                 = 0.05f;              // 0.0 = disabled
-    float       tfs_z                 = 1.00f;              // 1.0 = disabled
-    float       typical_p             = 1.00f;              // 1.0 = disabled
-    float       temp                  = 0.80f;              // <= 0.0 to sample greedily, 0.0 to not output probabilities
-    float       dynatemp_range        = 0.00f;              // 0.0 = disabled
-    float       dynatemp_exponent     = 1.00f;              // controls how entropy maps to temperature in dynamic temperature sampler
-    int32_t     penalty_last_n        = 64;                 // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    float       penalty_repeat        = 1.00f;              // 1.0 = disabled
-    float       penalty_freq          = 0.00f;              // 0.0 = disabled
-    float       penalty_present       = 0.00f;              // 0.0 = disabled
-    int32_t     mirostat              = 0;                  // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
-    float       mirostat_tau          = 5.00f;              // target entropy
-    float       mirostat_eta          = 0.10f;              // learning rate
-    bool        penalize_nl           = false;              // consider newlines as a repeatable token
-    uint32_t    seed                  = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampling_context
-
-    std::vector<llama_sampler_type> samplers_sequence = {
-        llama_sampler_type::TOP_K,
-        llama_sampler_type::TFS_Z,
-        llama_sampler_type::TYPICAL_P,
-        llama_sampler_type::TOP_P,
-        llama_sampler_type::MIN_P,
-        llama_sampler_type::TEMPERATURE
-    };
-
-    std::string grammar;  // optional BNF-like grammar to constrain sampling
-
-    // Classifier-Free Guidance
-    // https://arxiv.org/abs/2306.17806
-    std::string cfg_negative_prompt; // string to help guidance
-    float       cfg_scale     = 1.f; // how strong is guidance
-
-    std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
-
-    std::vector<llama_token> penalty_prompt_tokens;
-    bool                     use_penalty_prompt_tokens = false;
-} llama_sampling_params;
-
-// general sampler context
-// TODO: move to llama.h
-struct llama_sampling_context {
-    // parameters that will be used for sampling
-    llama_sampling_params params;
-
-    // mirostat sampler state
-    float mirostat_mu;
-
-    llama_grammar * grammar;
-
-    // internal
-    grammar_parser::parse_state parsed_grammar;
-
-    // TODO: replace with ring-buffer
-    std::vector<llama_token>      prev;
-    std::vector<llama_token_data> cur;
-    size_t n_valid; // Number of correct top tokens with correct probabilities.
-
-    std::mt19937 rng;
-};
-
 #include "common.h"

-// Create a new sampling context instance.
-struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params);
+#include <string>
+#include <vector>

-void llama_sampling_free(struct llama_sampling_context * ctx);
-
-// Reset the sampler context
-// - clear prev tokens
-// - reset grammar
-void llama_sampling_reset(llama_sampling_context * ctx);
-
-// Set the sampler seed
-void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed);
-
-// Copy the sampler context
-void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst);
-
-// Get the last sampled token
-llama_token llama_sampling_last(llama_sampling_context * ctx);
-
-// Get a string representation of the last sampled tokens
-std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n);
-
-// Print sampling parameters into a string
-std::string llama_sampling_print(const llama_sampling_params & params);
-
-// Print sampling order into a string
-std::string llama_sampling_order_print(const llama_sampling_params & params);
-
-std::string llama_sampling_type_to_str(llama_sampler_type sampler_type);
-
-std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
-std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string);
-
-// this is a common sampling function used across the examples for convenience
-// it can serve as a starting point for implementing your own sampling function
-// Note: When using multiple sequences, it is the caller's responsibility to call
-//       llama_sampling_reset when a sequence ends
+// common_sampler extends llama_sampler with additional functionality:
 //
-// required:
-//  - ctx_main:     context to use for sampling
-//  - ctx_sampling: sampling-specific context
+//  - grammar support
+//  - custom sampler logic based on the parameters
+//  - history of the last accepted tokens
+//  - performance metrics
 //
-// optional:
-//  - ctx_cfg:      context to use for classifier-free guidance
-//  - idx:          sample from llama_get_logits_ith(ctx, idx)
+// This goal is to have a common implementation of the sampling logic shared across the examples.
+// For example, depending on the temperature, the sampling chain can be very simple (greedy) or more
+// complex (top-k, top-p, etc).
 //
-// returns:
-//  - token:      sampled token
-//  - candidates: vector of candidate tokens
+// Another example is related to the grammar. In general, the grammar constraints applied on the full
+// vocabulary can be very taxing. To improve performance, the grammar can be applied only to the sampled
+// token in order to verify if it fits the grammar. And only if the token doesn't fit the grammar, the
+// grammar constraints are applied to the full vocabulary and the token is resampled.
+//
+// The common_sampler also maintains a container with the last accepted tokens. In the future, this can
+// be moved into the core llama library.
+//
+// For convenience, the common_sampler also maintains a container with the current candidate tokens.
+// This can be used to access the probabilities of the rest of the non-sampled tokens.
+//
+// TODO: measure grammar performance
 //
-llama_token llama_sampling_sample(
-        struct llama_sampling_context * ctx_sampling,
-        struct llama_context * ctx_main,
-        struct llama_context * ctx_cfg,
-        int idx = -1);

-// Prepares and adjusts the set of token candidates for sampling based on penalties, biases, and sampling parameters.
-llama_token_data_array llama_sampling_prepare(
-        struct llama_sampling_context * ctx_sampling,
-        struct llama_context * ctx_main,
-        struct llama_context * ctx_cfg,
-        int idx = 0,
-        bool apply_grammar = true,
-        std::vector<float> * original_logits = nullptr);
+struct common_sampler;

-void llama_sampling_accept(
-        struct llama_sampling_context * ctx_sampling,
-        struct llama_context * ctx_main,
-        llama_token id,
-        bool apply_grammar);
+// llama_sampler API overloads
+
+struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params);
+
+void common_sampler_free(struct common_sampler * gsmpl);
+
+// if accept_grammar is true, the token is accepted both by the sampling chain and the grammar
+void                    common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar);
+void                    common_sampler_reset (struct common_sampler * gsmpl);
+struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
+
+// arguments can be nullptr to skip printing
+void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl);
+
+// extended sampling implementation:
+//
+// - set logits
+// - apply the configured sampler chain
+// - check if the token fits the grammar (if any)
+// - if not: resample by first applying the grammar constraints and then sampling again (slower path)
+//
+// if grammar_first is true, the grammar is applied before the samplers (slower)
+// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
+//
+llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
+
+uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
+
+// helpers
+
+// access the internal list of current candidate tokens
+llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl);
+
+// get the last accepted token
+llama_token common_sampler_last(const struct common_sampler * gsmpl);
+
+// print the sampler chain into a string
+std::string common_sampler_print(const struct common_sampler * gsmpl);
+
+// get a string representation of the last accepted tokens
+std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx, int n);
+
+char        common_sampler_type_to_chr(enum common_sampler_type cnstr);
+std::string common_sampler_type_to_str(enum common_sampler_type cnstr);
+
+std::vector<enum common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
+std::vector<enum common_sampler_type> common_sampler_types_from_chars(const std::string & chars);
--- a/common/train.cpp
+++ b/common/train.cpp
--- a/common/train.h
+++ b/common/train.h
@@ -1,233 +0,0 @@
-// Various helper functions and utilities for training
-
-#pragma once
-
-#include <string>
-#include <random>
-#include <vector>
-
-#include "ggml.h"
-#include "llama.h"
-
-#define LLAMA_TRAIN_MAX_NODES 16384
-
-typedef std::string mt19937_state;
-
-struct train_state {
-    struct ggml_opt_context * opt;
-
-    uint64_t train_its;
-    uint64_t train_samples;
-    uint64_t train_tokens;
-    uint64_t train_epochs;
-
-    size_t        shuffle_samples_hash; // fn, sample_count, *zip(sample_begins, sample_sizes)
-    mt19937_state shuffle_rng_state_current;
-    mt19937_state shuffle_rng_state_next;
-    size_t        shuffle_sample_count;
-    size_t        shuffle_next_sample;
-};
-
-struct train_params_common {
-    const char * fn_train_data;
-    const char * fn_checkpoint_in;
-    const char * fn_checkpoint_out;
-    const char * pattern_fn_it;
-    const char * fn_latest;
-
-    bool print_usage;
-
-    int save_every;
-
-    uint32_t seed;
-
-    int n_ctx;
-    int n_threads;
-    int n_batch;
-    int n_gradient_accumulation;
-    int n_epochs;
-    int n_gpu_layers;
-
-    bool custom_n_ctx;
-
-    bool use_flash;
-    bool use_checkpointing;
-
-    std::string sample_start;
-    bool include_sample_start;
-    bool escape;
-    bool overlapping_samples;
-    bool fill_with_next_samples;
-    bool separate_with_eos;
-    bool separate_with_bos;
-    bool sample_random_offsets;
-
-    bool force_reshuffle;
-
-    int   warmup;
-    int   cos_decay_steps;
-    float cos_decay_restart;
-    float cos_decay_min;
-    bool  enable_restart;
-
-    int   opt_past;
-    float opt_delta;
-    int   opt_max_no_improvement;
-
-    int   adam_n_iter;
-    float adam_alpha;
-    float adam_min_alpha;
-    float adam_decay;
-    int   adam_decay_min_ndim;
-    float adam_beta1;
-    float adam_beta2;
-    float adam_gclip;
-    float adam_eps_f;
-};
-
-typedef void (*save_train_files_callback)(void * data, struct train_state * train);
-
-struct train_opt_callback_data {
-    struct train_params_common * params;
-    struct train_state         * train;
-    save_train_files_callback    save_cb;
-    void                       * save_data;
-    struct llama_context       * lctx;
-    int                          last_save_iter;
-    llama_token                * tokens_data;
-    size_t                       tokens_size;
-    size_t                     * samples_begin;
-    size_t                     * samples_size;
-    size_t                     * shuffled_samples_offs;
-    size_t                     * shuffled_samples_begin;
-    size_t                     * shuffled_samples_size;
-    size_t                       samples_count;
-    struct ggml_tensor         * tokens_input;
-    struct ggml_tensor         * target_probs;
-    int                          first_iter;
-    int                          first_epoch;
-    int                          iter_at_last_epoch;
-    int64_t                      last_time;
-    double                       millis_per_iter;
-};
-
-struct train_state * init_train_state();
-void free_train_state(struct train_state  * state);
-
-struct train_params_common get_default_train_params_common();
-void print_common_train_usage(int /*argc*/, char ** argv, const struct train_params_common * params);
-
-bool consume_common_train_arg(int argc, char ** argv, int * idx, struct train_params_common * params, bool * invalid_param);
-void finish_processing_train_args(struct train_params_common * params);
-
-struct random_normal_distribution;
-struct random_uniform_distribution;
-
-struct random_normal_distribution  * init_random_normal_distribution (int seed, float mean, float std, float min, float max);
-struct random_uniform_distribution * init_random_uniform_distribution(int seed, float min, float max);
-
-void free_random_normal_distribution (struct random_normal_distribution  * rnd);
-void free_random_uniform_distribution(struct random_uniform_distribution * rnd);
-
-struct ggml_tensor * randomize_tensor_normal (struct ggml_tensor * tensor, struct random_normal_distribution * rnd);
-struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd);
-
-// generate random float in interval [0,1)
-float frand();
-float frand_normal (struct random_normal_distribution * rnd);
-float frand_uniform(struct random_uniform_distribution * rnd);
-
-int   clamp (const int v, const int min, const int max);
-float fclamp(const float v, const float min, const float max);
-
-void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0);
-void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1);
-void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2);
-void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3);
-
-size_t tokenize_file(
-        struct llama_context     * lctx,
-        const char               * filename,
-        const std::string        & sample_start,
-        bool                       include_sample_start,
-        bool                       overlapping_samples,
-        unsigned                   context_length,
-        std::vector<llama_token> & out_tokens,
-        std::vector<size_t>      & out_samples_begin,
-        std::vector<size_t>      & out_samples_size);
-
-int64_t get_example_targets_batch(
-        struct llama_context * lctx,
-        struct ggml_tensor   * tokens_input,
-        struct ggml_tensor   * target_probs,
-        int64_t                example_id,
-        const size_t         * samples_offs,
-        const size_t         * samples_begin,
-        const size_t         * samples_size,
-              size_t           samples_count,
-        const llama_token    * train_data,
-        size_t                 n_train_data,
-        bool                   separate_with_eos,
-        bool                   separate_with_bos,
-        bool                   fill_with_next_samples,
-        bool                   sample_random_offsets);
-
-
-void          mt19937_set_state(std::mt19937& rng, const mt19937_state& rng_state);
-mt19937_state mt19937_get_state(const std::mt19937& rng);
-mt19937_state mt19937_seed_to_state(unsigned seed);
-
-mt19937_state shuffle_samples(
-        const mt19937_state & rng_state,
-        size_t              * shuffled_offs,
-        size_t              * shuffled_begins,
-        size_t              * shuffled_sizes,
-        const size_t        * begins,
-        const size_t        * sizes,
-        size_t                count);
-
-size_t hash_combine(size_t h1, size_t h2);
-
-size_t compute_samples_hash(
-    const char* fn,
-    const size_t* samples_begin,
-    const size_t* samples_size,
-    size_t sample_count);
-
-
-std::string replace_str(const char * s, const char * needle, const char * replacement);
-
-void print_duration(double milliseconds);
-
-float cosine_decay(
-    int64_t step,
-    int64_t decay_steps,
-    float   minimum);
-
-float cosine_decay_restart(
-    int64_t step,
-    int64_t decay_steps,
-    float   minimum,
-    float   restart_step_mult);
-
-float learning_schedule(
-    int64_t step,
-    int64_t warmup_steps,
-    int64_t decay_steps,
-    float   learning_rate,
-    float   overall_minimum,
-    float   cos_decay_minimum,
-    float   cos_decay_restart_step_mult,
-    bool    enable_restart);
-
-void copy_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, const char * name);
-
-void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct ggml_opt_context * opt);
-void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * opt);
-
-bool load_train_state_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct train_state * train);
-void save_train_state_gguf(struct gguf_context * fctx, struct train_state * train);
-
-std::string get_train_filename(const char * filename, const char * pattern_it, const char * latest, int64_t iteration);
-
-void train_opt_callback(void * vdata, int accum_step, float * sched, bool * cancel);
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -3,6 +3,7 @@

 from __future__ import annotations

+import ast
 import logging
 import argparse
 import contextlib
@@ -14,6 +15,7 @@ from enum import IntEnum
 from pathlib import Path
 from hashlib import sha256
 from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast
+from itertools import chain

 import math
 import numpy as np
@@ -63,7 +65,6 @@ class Model:
    model_name: str | None
    metadata_override: Path | None
    dir_model_card: Path
-    is_lora: bool

    # subclasses should define this!
    model_arch: gguf.MODEL_ARCH
@@ -71,7 +72,8 @@ class Model:
    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
                 use_temp_file: bool = False, eager: bool = False,
                 metadata_override: Path | None = None, model_name: str | None = None,
-                 split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False, is_lora: bool = False):
+                 split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False,
+                 small_first_shard: bool = False, hparams: dict[str, Any] | None = None):
        if type(self) is Model:
            raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")

@@ -86,14 +88,13 @@ class Model:
        self.is_safetensors = len(self.part_names) > 0
        if not self.is_safetensors:
            self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
-        self.hparams = Model.load_hparams(self.dir_model)
+        self.hparams = Model.load_hparams(self.dir_model) if hparams is None else hparams
        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
        self.tensor_names = None
        self.metadata_override = metadata_override
        self.model_name = model_name
        self.dir_model_card = dir_model  # overridden in convert_lora_to_gguf.py
-        self.is_lora = is_lora  # true if model is used inside convert_lora_to_gguf.py

        # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
        if self.ftype == gguf.LlamaFileType.GUESSED:
@@ -131,12 +132,14 @@ class Model:
    def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
        tensor_names_from_parts: set[str] = set()

-        if len(self.part_names) > 1:
+        index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin"
+        index_name += ".index.json"
+        index_file = self.dir_model / index_name
+
+        if index_file.is_file():
            self.tensor_names = set()
-            index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin"
-            index_name += ".index.json"
            logger.info(f"gguf: loading model weight map from '{index_name}'")
-            with open(self.dir_model / index_name, "r", encoding="utf-8") as f:
+            with open(index_file, "r", encoding="utf-8") as f:
                index: dict[str, Any] = json.load(f)
                weight_map = index.get("weight_map")
                if weight_map is None or not isinstance(weight_map, dict):
@@ -144,6 +147,7 @@ class Model:
                self.tensor_names.update(weight_map.keys())
        else:
            self.tensor_names = tensor_names_from_parts
+            weight_map = {}

        for part_name in self.part_names:
            logger.info(f"gguf: loading model part '{part_name}'")
@@ -170,9 +174,17 @@ class Model:
                            data = LazyTorchTensor.from_eager(data)
                    yield name, data

-        # only verify tensor name presence; it doesn't matter if they are not in the right files
-        if len(sym_diff := tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0:
-            raise ValueError(f"Mismatch between weight map and model parts for tensor names: {sym_diff}")
+        # verify tensor name presence and identify potentially missing files
+        if len(tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0:
+            missing = sorted(self.tensor_names.difference(tensor_names_from_parts))
+            extra = sorted(tensor_names_from_parts.difference(self.tensor_names))
+            missing_files = sorted(set(weight_map[n] for n in missing if n in weight_map))
+            if len(extra) == 0 and len(missing_files) > 0:
+                raise ValueError(f"Missing or incomplete model files: {missing_files}")
+            else:
+                raise ValueError("Mismatch between weight map and model parts for tensor names:\n"
+                                 f"Missing tensors: {missing}\n"
+                                 f"Extra tensors: {extra}")

    def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str:
        if key not in gguf.MODEL_TENSORS[self.model_arch]:
@@ -258,10 +270,14 @@ class Model:

        return False

+    # some models need extra generated tensors (like rope_freqs)
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        return ()
+
    def prepare_tensors(self):
        max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")

-        for name, data_torch in self.get_tensors():
+        for name, data_torch in chain(self.generate_extra_tensors(), self.get_tensors()):
            # we don't need these
            if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
                continue
@@ -279,8 +295,13 @@ class Model:
                    bid = int(part)
                    break

-            for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)):
-                data: np.ndarray  # type hint
+            for new_name, data_torch in (self.modify_tensors(data_torch, name, bid)):
+                data = data_torch.squeeze().numpy()
+
+                # if data ends up empty, it means data_torch was a scalar tensor -> restore
+                if len(data.shape) == 0:
+                    data = data_torch.numpy()
+
                n_dims = len(data.shape)
                data_qtype: gguf.GGMLQuantizationType | bool = self.tensor_force_quant(name, new_name, bid, n_dims)

@@ -298,12 +319,31 @@ class Model:
                            gguf.MODEL_TENSOR.POS_EMBD,
                            gguf.MODEL_TENSOR.TOKEN_TYPES,
                            gguf.MODEL_TENSOR.SSM_CONV1D,
+                            gguf.MODEL_TENSOR.TIME_MIX_FIRST,
+                            gguf.MODEL_TENSOR.TIME_MIX_W1,
+                            gguf.MODEL_TENSOR.TIME_MIX_W2,
+                            gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1,
+                            gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2,
                        )
                    )
-                    or not name.endswith(".weight")
+                    or not new_name.endswith(".weight")
                ):
                    data_qtype = gguf.GGMLQuantizationType.F32

+                if data_qtype is False and any(
+                    self.match_model_tensor_name(new_name, key, bid)
+                    for key in (
+                        gguf.MODEL_TENSOR.TOKEN_EMBD,
+                        gguf.MODEL_TENSOR.OUTPUT,
+                    )
+                ):
+                    if self.ftype in (
+                        gguf.LlamaFileType.MOSTLY_TQ1_0,
+                        gguf.LlamaFileType.MOSTLY_TQ2_0,
+                    ):
+                        # TODO: use Q4_K and Q6_K
+                        data_qtype = gguf.GGMLQuantizationType.F16
+
                # No override (data_qtype is False), or wants to be quantized (data_qtype is True)
                if isinstance(data_qtype, bool):
                    if self.ftype == gguf.LlamaFileType.ALL_F32:
@@ -314,6 +354,10 @@ class Model:
                        data_qtype = gguf.GGMLQuantizationType.BF16
                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
                        data_qtype = gguf.GGMLQuantizationType.Q8_0
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ1_0:
+                        data_qtype = gguf.GGMLQuantizationType.TQ1_0
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ2_0:
+                        data_qtype = gguf.GGMLQuantizationType.TQ2_0
                    else:
                        raise ValueError(f"Unknown file type: {self.ftype.name}")

@@ -530,6 +574,9 @@ class Model:
        if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
            # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
            res = "bert-bge"
+        if chkhsh == "8e62295832751ca1e8f92f2226f403dea30dc5165e448b5bfa05af5340c64ec7":
+            # ref: https://huggingface.co/BAAI/bge-large-zh-v1.5
+            res = "bert-bge-large"
        if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
            # ref: https://huggingface.co/mosaicml/mpt-7b
            res = "mpt"
@@ -557,6 +604,9 @@ class Model:
        if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e":
            # ref: https://huggingface.co/databricks/dbrx-base
            res = "dbrx"
+        if chkhsh == "c7699093ba4255a91e702aa38a596aa81669f3525dae06c2953267dde580f448":
+            # ref: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
+            res = "jina-v1-en"
        if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en
            res = "jina-v2-en"
@@ -602,6 +652,12 @@ class Model:
        if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae":
            # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct
            res = "exaone"
+        if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085":
+            # ref: https://huggingface.co/microsoft/phi-2
+            res = "phi-2"
+        if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450":
+            # ref: https://huggingface.co/facebook/chameleon-7b
+            res = "chameleon"

        if res is None:
            logger.warning("\n")
@@ -1460,7 +1516,7 @@ class StableLMModel(Model):
                raise ValueError(f"Unprocessed norms: {norms}")


-@Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
+@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
 class LlamaModel(Model):
    model_arch = gguf.MODEL_ARCH.LLAMA

@@ -1486,6 +1542,17 @@ class LlamaModel(Model):
            special_vocab._set_special_token("eot",    32010)
            special_vocab.add_to_gguf(self.gguf_writer)

+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                if "add_prefix_space" in tokenizer_config_json:
+                    self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
+
+        # Apply to granite small models only
+        if self.hparams.get("vocab_size", 32000) == 49152:
+            self.gguf_writer.add_add_bos_token(False)
+
    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        hparams = self.hparams
@@ -1502,17 +1569,6 @@ class LlamaModel(Model):
                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])

-        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
-        if tokenizer_config_file.is_file():
-            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
-                tokenizer_config_json = json.load(f)
-                if "add_prefix_space" in tokenizer_config_json:
-                    self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
-
-        # Apply to granite small models only
-        if self.hparams.get("vocab_size", 32000) == 49152:
-            self.gguf_writer.add_add_bos_token(False)
-
    @staticmethod
    def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
        if n_head_kv is not None and n_head != n_head_kv:
@@ -1568,7 +1624,7 @@ class LlamaModel(Model):

        return [(self.map_tensor_name(name), data_torch)]

-    def prepare_tensors(self):
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
            if rope_scaling.get("rope_type", '').lower() == "llama3":
                base = self.hparams.get("rope_theta", 10000.0)
@@ -1595,9 +1651,9 @@ class LlamaModel(Model):
                        smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
                        rope_factors.append(1 / ((1 - smooth) / factor + smooth))

-                if not self.is_lora:
-                    self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
+                yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))

+    def prepare_tensors(self):
        super().prepare_tensors()

        if self._experts is not None:
@@ -1619,15 +1675,16 @@ class BitnetModel(Model):
        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
        self.gguf_writer.add_rope_scaling_factor(1.0)

-    def weight_quant(self, weight):
+    def weight_quant(self, weight: Tensor) -> Tensor:
        dtype = weight.dtype
        weight = weight.float()
-        s = 1 / weight.abs().mean().clamp(min=1e-5)
-        weight = (weight * s).round().clamp(-1, 1) / s
-        scale = weight.abs().max().unsqueeze(0)
-        weight = torch.where(weight.abs().less(1e-6), 0, weight).type(dtype)
-        weight = torch.sign(weight).type(dtype)
-        return weight.type(dtype), scale.type(torch.float32)
+        scale = weight.abs().mean().clamp(min=1e-5)
+        iscale = 1 / scale
+        # TODO: multiply by the scale directly instead of inverting it twice
+        # (this is also unnecessarily doubly inverted upstream)
+        # ref: https://huggingface.co/1bitLLM/bitnet_b1_58-3B/blob/af89e318d78a70802061246bf037199d2fb97020/utils_quant.py#L10
+        result = (weight * iscale).round().clamp(-1, 1) / iscale
+        return result.type(dtype)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        new_name = self.map_tensor_name(name)
@@ -1642,11 +1699,9 @@ class BitnetModel(Model):
            gguf.MODEL_TENSOR.FFN_GATE,
        ]):
            # transform weight into 1/0/-1 (in fp32)
-            weight_torch, scale_torch = self.weight_quant(data_torch)
-            yield (new_name, weight_torch)
-            yield (new_name.removesuffix(".weight") + ".scale", scale_torch)
-        else:
-            yield (new_name, data_torch)
+            data_torch = self.weight_quant(data_torch)
+
+        yield (new_name, data_torch)


@Model.register("GrokForCausalLM")
@@ -1815,6 +1870,59 @@ class MiniCPMModel(Model):
        return [(self.map_tensor_name(name), data_torch)]


+@Model.register("MiniCPM3ForCausalLM")
+class MiniCPM3Model(Model):
+    model_arch = gguf.MODEL_ARCH.MINICPM3
+
+    def set_gguf_parameters(self):
+        hparams = self.hparams
+
+        self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
+        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
+        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
+        self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
+        self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
+            self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
+        self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
+        self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
+        self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
+
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        rope_scaling = self.find_hparam(['rope_scaling'], True)
+        if rope_scaling is not None:
+            rope_dims = self.hparams["qk_rope_head_dim"]
+
+            long_factors = rope_scaling.get('long_factor', None)
+            short_factors = rope_scaling.get('short_factor', None)
+
+            if long_factors is None or short_factors is None:
+                raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
+
+            if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
+                raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
+
+            yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
+            yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
+
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+
+    def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
+        if n_kv_head is not None and n_head != n_kv_head:
+            n_head //= n_kv_head
+
+        return (
+            weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+            .swapaxes(1, 2)
+            .reshape(weights.shape)
+        )
+
+
@Model.register("QWenLMHeadModel")
 class QwenModel(Model):
    model_arch = gguf.MODEL_ARCH.QWEN
@@ -2114,6 +2222,13 @@ class Phi3MiniModel(Model):
        self.gguf_writer.add_file_type(self.ftype)
        self.gguf_writer.add_sliding_window(self.find_hparam(["sliding_window"]))

+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        n_embd = self.find_hparam(["hidden_size", "n_embd"])
+        n_head = self.find_hparam(["num_attention_heads", "n_head"])
+        max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
+        orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
+        rope_dims = n_embd // n_head
+
        # write rope scaling for long context (128k) model
        rope_scaling = self.find_hparam(['rope_scaling'], True)
        if rope_scaling is None:
@@ -2143,9 +2258,8 @@ class Phi3MiniModel(Model):
        if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
            raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')

-        if not self.is_lora:
-            self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG]  + ".weight", np.array(long_factors, dtype=np.float32))
-            self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32))
+        yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
+        yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))


@Model.register("PlamoForCausalLM")
@@ -2507,7 +2621,7 @@ class NomicBertModel(BertModel):
        self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])


-@Model.register("XLMRobertaModel")
+@Model.register("XLMRobertaModel", "XLMRobertaForSequenceClassification")
 class XLMRobertaModel(BertModel):
    model_arch = gguf.MODEL_ARCH.BERT

@@ -2605,6 +2719,11 @@ class XLMRobertaModel(BertModel):
        self.gguf_writer.add_add_eos_token(True)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # if name starts with "roberta.", remove the prefix
+        # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
+        if name.startswith("roberta."):
+            name = name[8:]
+
        # position embeddings start at pad_token_id + 1, so just chop down the weight tensor
        if name == "embeddings.position_embeddings.weight":
            if self._position_offset is not None:
@@ -2716,6 +2835,89 @@ class StarCoder2Model(Model):
    model_arch = gguf.MODEL_ARCH.STARCODER2


+@Model.register("Rwkv6ForCausalLM")
+class Rwkv6Model(Model):
+    model_arch = gguf.MODEL_ARCH.RWKV6
+
+    def set_vocab(self):
+        assert (self.dir_model / "rwkv_vocab_v20230424.txt").is_file()
+        vocab_size = self.hparams.get("vocab_size", 65536)
+
+        tokens: list[bytes] = ['<s>'.encode("utf-8")]
+        toktypes: list[int] = [gguf.TokenType.CONTROL]
+
+        with open(self.dir_model / "rwkv_vocab_v20230424.txt", "r", encoding="utf-8") as f:
+            lines = f.readlines()
+            for line in lines:
+                parts = line.split(' ')
+                assert len(parts) >= 3
+                token, token_len = ast.literal_eval(' '.join(parts[1:-1])), int(parts[-1])
+                token = token.encode("utf-8") if isinstance(token, str) else token
+                assert isinstance(token, bytes)
+                assert len(token) == token_len
+                token_text: str = repr(token)[2:-1]  # "b'\xff'" -> "\xff"
+                tokens.append(token_text.encode("utf-8"))
+                toktypes.append(gguf.TokenType.NORMAL)
+        remainder = vocab_size - len(tokens)
+        assert remainder >= 0
+        for i in range(len(tokens), vocab_size):
+            tokens.append(f"[PAD{i}]".encode("utf-8"))
+            toktypes.append(gguf.TokenType.UNUSED)
+
+        self.gguf_writer.add_tokenizer_model("rwkv")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
+        special_vocab.chat_template = "rwkv-world"
+        # hack: Add '\n\n' as the EOT token to make it chat normally
+        special_vocab._set_special_token("eot", 261)
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        block_count = self.hparams["num_hidden_layers"]
+        head_size = self.hparams["head_size"]
+        hidden_size = self.hparams["hidden_size"]
+        layer_norm_eps = self.hparams["layer_norm_epsilon"]
+        rescale_every_n_layers = self.hparams["rescale_every"]
+        intermediate_size = self.hparams["intermediate_size"] if self.hparams["intermediate_size"] is not None else int((hidden_size * 3.5) // 32 * 32)
+        time_mix_extra_dim = 64 if hidden_size == 4096 else 32
+        time_decay_extra_dim = 128 if hidden_size == 4096 else 64
+
+        # RWKV isn't context limited
+        self.gguf_writer.add_context_length(1048576)
+        self.gguf_writer.add_embedding_length(hidden_size)
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_layer_norm_eps(layer_norm_eps)
+        self.gguf_writer.add_rescale_every_n_layers(rescale_every_n_layers)
+        self.gguf_writer.add_wkv_head_size(head_size)
+        self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim)
+        self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim)
+        self.gguf_writer.add_feed_forward_length(intermediate_size)
+        self.gguf_writer.add_file_type(self.ftype)
+
+        # required by llama.cpp, unused
+        self.gguf_writer.add_head_count(0)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        new_name = self.map_tensor_name(name)
+
+        if not (new_name.endswith(".weight") or new_name.endswith(".bias")):
+            new_name += ".weight"
+
+        if new_name.endswith("time_mix_w1.weight") or new_name.endswith("time_mix_decay_w1.weight") or new_name.endswith("time_mix_decay_w2.weight"):
+            data_torch = data_torch.transpose(0, 1)
+
+        if new_name.endswith("time_mix_w2.weight"):
+            data_torch = data_torch.permute(0, 2, 1)
+
+        rescale_every_n_layers = self.hparams["rescale_every"]
+        if rescale_every_n_layers > 0:
+            if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"):
+                data_torch = data_torch.div_(2 ** int(bid // rescale_every_n_layers))
+
+        yield (new_name, data_torch)
+
+
@Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM")
 class MambaModel(Model):
    model_arch = gguf.MODEL_ARCH.MAMBA
@@ -2838,6 +3040,71 @@ class OlmoModel(Model):
        return [(self.map_tensor_name(name), data_torch)]


+@Model.register("Olmo1124ForCausalLM")
+class Olmo1124Model(Model):
+    model_arch = gguf.MODEL_ARCH.OLMO_1124
+
+
+@Model.register("OlmoeForCausalLM")
+class OlmoeModel(Model):
+    model_arch = gguf.MODEL_ARCH.OLMOE
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_layer_norm_rms_eps(1e-5)
+        if (n_experts := self.hparams.get("num_experts")) is not None:
+            self.gguf_writer.add_expert_count(n_experts)
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    # Copied from: Qwen2MoeModel
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # process the experts separately
+        if name.find("experts") != -1:
+            n_experts = self.hparams["num_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    # Copied from: Qwen2MoeModel
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
@Model.register("JinaBertModel", "JinaBertForMaskedLM")
 class JinaBertV2Model(BertModel):
    model_arch = gguf.MODEL_ARCH.JINA_BERT_V2
@@ -2876,6 +3143,14 @@ class JinaBertV2Model(BertModel):
        self.gguf_writer.add_add_bos_token(True)
        self.gguf_writer.add_add_eos_token(True)

+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # if name starts with "bert.", remove the prefix
+        # e.g. https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
+        if name.startswith("bert."):
+            name = name[5:]
+
+        return super().modify_tensors(data_torch, name, bid)
+

@Model.register("OpenELMForCausalLM")
 class OpenELMModel(Model):
@@ -3478,10 +3753,7 @@ class JaisModel(Model):

        # Embeddings scale
        self.embeddings_scale = 1.0
-        # note: For some JAIS flavors, output is tied to (same as) wte in original model
-        self.output_is_wte = False
        if 'mup_embeddings_scale' in self.hparams:
-            self.output_is_wte = True   # Hack (?)
            self.embeddings_scale = self.hparams['mup_embeddings_scale']
        elif 'embeddings_scale' in self.hparams:
            self.embeddings_scale = self.hparams['embeddings_scale']
@@ -3538,10 +3810,7 @@ class JaisModel(Model):

        if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
            tensors.append((new_name, data_torch * self.embeddings_scale))
-            if self.output_is_wte:
-                tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch * self.width_scale))
        elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
-            assert not self.output_is_wte
            tensors.append((new_name, data_torch * self.width_scale))
        else:
            tensors.append((new_name, data_torch))
@@ -3816,7 +4085,7 @@ class ExaoneModel(Model):
                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
                self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])

-    def prepare_tensors(self):
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
            if rope_scaling.get("rope_type", '').lower() == "llama3":
                base = self.hparams.get("rope_theta", 10000.0)
@@ -3843,14 +4112,112 @@ class ExaoneModel(Model):
                        smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
                        rope_factors.append(1 / ((1 - smooth) / factor + smooth))

-                if not self.is_lora:
-                    self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
+                yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))

-        super().prepare_tensors()
+
+@Model.register("GraniteForCausalLM")
+class GraniteModel(LlamaModel):
+    """Conversion for IBM's GraniteForCausalLM"""
+    model_arch = gguf.MODEL_ARCH.GRANITE
+
+    def set_gguf_parameters(self):
+        """Granite uses standard llama parameters with the following differences:
+
+        - No head_dim support
+        - New multiplier params:
+            - attention_scale
+            - embedding_scale
+            - residual_scale
+        - logits_scaling
+        """
+        if head_dim := self.hparams.pop("head_dim", None):
+            logger.warning("Ignoring head_dim (%s) from config for Granite", head_dim)
+        super().set_gguf_parameters()
+        # NOTE: Convert _multiplier params to _scale params for naming
+        #   consistency
+        if attention_scale := self.hparams.get("attention_multiplier"):
+            self.gguf_writer.add_attention_scale(attention_scale)
+            logger.info("gguf: (granite) attention_scale = %s", attention_scale)
+        if embedding_scale := self.hparams.get("embedding_multiplier"):
+            self.gguf_writer.add_embedding_scale(embedding_scale)
+            logger.info("gguf: (granite) embedding_scale = %s", embedding_scale)
+        if residual_scale := self.hparams.get("residual_multiplier"):
+            self.gguf_writer.add_residual_scale(residual_scale)
+            logger.info("gguf: (granite) residual_scale = %s", residual_scale)
+        if logits_scale := self.hparams.get("logits_scaling"):
+            self.gguf_writer.add_logit_scale(logits_scale)
+            logger.info("gguf: (granite) logits_scale = %s", logits_scale)
+
+
+@Model.register("GraniteMoeForCausalLM")
+class GraniteMoeModel(GraniteModel):
+    """Conversion for IBM's GraniteMoeForCausalLM"""
+    model_arch = gguf.MODEL_ARCH.GRANITE_MOE
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        """In modeling_granitemoe, the JetMoe implementation of parallel experts
+        is used. This essentially merges w1 and w3 into a single tensor with 2x
+        the hidden size that is then split during forward. To keep compatibility
+        with existing mixtral support, we pull them apart here.
+        """
+
+        if name.endswith("block_sparse_moe.input_linear.weight"):
+            ffn_dim = self.hparams["intermediate_size"]
+            assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * intermediate_size"
+            gate, up = data_torch[..., :ffn_dim, :], data_torch[..., ffn_dim:, :]
+            return [
+                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), gate),
+                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), up),
+            ]
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
+@Model.register("ChameleonForConditionalGeneration")
+@Model.register("ChameleonForCausalLM")  # obsolete
+class ChameleonModel(Model):
+    model_arch = gguf.MODEL_ARCH.CHAMELEON
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_swin_norm(self.hparams.get("swin_norm", False))
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # ignore image tokenizer for now
+        # TODO: remove this once image support is implemented for Chameleon
+        if name.startswith("model.vqmodel"):
+            return []
+
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = self.hparams.get("num_key_value_heads")
+        hidden_dim = self.hparams.get("hidden_size")
+
+        if name.endswith(("q_proj.weight", "q_proj.bias")):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
+        if name.endswith(("k_proj.weight", "k_proj.bias")):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
+        if name.endswith(("q_norm.weight", "q_norm.bias")):
+            data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_head, hidden_dim)
+        if name.endswith(("k_norm.weight", "k_norm.bias")):
+            data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_kv_head, hidden_dim)
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    # see: https://github.com/huggingface/transformers/blob/72fb02c47dbbe1999ae105319f24631cad6e2e00/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py#L176-L203
+    @staticmethod
+    def _reverse_hf_permute(data_torch, n_heads, hidden_dim):
+        head_dim = hidden_dim // n_heads
+        data_torch = data_torch[0].view(2, head_dim // 2).t().reshape(1, -1)
+        data_torch = data_torch.repeat_interleave(n_heads, 0)
+        return data_torch


 ###### CONVERSION LOGIC ######

+
 # tree of lazy tensors
 class LazyTorchTensor(gguf.LazyBase):
    _tensor_type = torch.Tensor
@@ -3929,8 +4296,8 @@ def parse_args() -> argparse.Namespace:
        help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
    )
    parser.add_argument(
-        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
-        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
+        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="f16",
+        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
    )
    parser.add_argument(
        "--bigendian", action="store_true",
@@ -4017,6 +4384,8 @@ def main() -> None:
        "f16": gguf.LlamaFileType.MOSTLY_F16,
        "bf16": gguf.LlamaFileType.MOSTLY_BF16,
        "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
+        "tq1_0": gguf.LlamaFileType.MOSTLY_TQ1_0,
+        "tq2_0": gguf.LlamaFileType.MOSTLY_TQ2_0,
        "auto": gguf.LlamaFileType.GUESSED,
    }

--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -31,6 +31,7 @@ import re
 import requests
 import sys
 import json
+import shutil

 from hashlib import sha256
 from enum import IntEnum, auto
@@ -71,6 +72,7 @@ models = [
    {"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
    {"name": "falcon",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
    {"name": "bert-bge",       "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
+    {"name": "bert-bge-large", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/BAAI/bge-large-zh-v1.5", },
    {"name": "mpt",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
    {"name": "starcoder",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
    {"name": "gpt-2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
@@ -80,6 +82,7 @@ models = [
    {"name": "qwen2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
    {"name": "olmo",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
    {"name": "dbrx",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
+    {"name": "jina-v1-en",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-reranker-v1-tiny-en", },
    {"name": "jina-v2-en",     "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
    {"name": "jina-v2-es",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
    {"name": "jina-v2-de",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
@@ -97,6 +100,8 @@ models = [
    {'name': "bloom",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom", },
    {'name': "gpt3-finnish",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", },
    {"name": "exaone",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
+    {"name": "phi-2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
+    {"name": "chameleon",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", },
 ]


@@ -125,12 +130,27 @@ def download_model(model):
    if tokt == TOKENIZER_TYPE.UGM:
        files.append("spiece.model")

-    for file in files:
-        save_path = f"models/tokenizers/{name}/{file}"
-        if os.path.isfile(save_path):
-            logger.info(f"{name}: File {save_path} already exists - skipping")
-            continue
-        download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)
+    if os.path.isdir(repo):
+        # If repo is a path on the file system, copy the directory
+        for file in files:
+            src_path = os.path.join(repo, file)
+            dst_path = f"models/tokenizers/{name}/{file}"
+            if os.path.isfile(dst_path):
+                logger.info(f"{name}: File {dst_path} already exists - skipping")
+                continue
+            if os.path.isfile(src_path):
+                shutil.copy2(src_path, dst_path)
+                logger.info(f"{name}: Copied {src_path} to {dst_path}")
+            else:
+                logger.warning(f"{name}: Source file {src_path} does not exist")
+    else:
+        # If repo is a URL, download the files
+        for file in files:
+            save_path = f"models/tokenizers/{name}/{file}"
+            if os.path.isfile(save_path):
+                logger.info(f"{name}: File {save_path} already exists - skipping")
+                continue
+            download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)


 for model in models:
--- a/convert_lora_to_gguf.py
+++ b/convert_lora_to_gguf.py
@@ -12,6 +12,7 @@ import json
 from math import prod
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Sequence, SupportsIndex, cast
+from transformers import AutoConfig

 import torch

@@ -230,7 +231,7 @@ def get_base_tensor_name(lora_tensor_name: str) -> str:

 def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
-        description="Convert a huggingface PEFT LoRA adapter to a GGML compatible file")
+        description="Convert a Hugging Face PEFT LoRA adapter to a GGUF file")
    parser.add_argument(
        "--outfile", type=Path,
        help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
@@ -256,17 +257,23 @@ def parse_args() -> argparse.Namespace:
        help="only print out what will be done, without writing any new files",
    )
    parser.add_argument(
-        "--base", type=Path, required=True,
-        help="directory containing base model file",
+        "--base", type=Path,
+        help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required. If base model is unspecified, it will be loaded from Hugging Face hub based on the adapter config",
    )
    parser.add_argument(
        "lora_path", type=Path,
-        help="directory containing LoRA adapter file",
+        help="directory containing Hugging Face PEFT LoRA config (adapter_model.json) and weights (adapter_model.safetensors or adapter_model.bin)",
    )

    return parser.parse_args()


+def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]:
+    # normally, adapter does not come with base model config, we need to load it from AutoConfig
+    config = AutoConfig.from_pretrained(hf_model_id)
+    return config.to_dict()
+
+
 if __name__ == '__main__':
    args = parse_args()
    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
@@ -281,7 +288,7 @@ if __name__ == '__main__':

    ftype = ftype_map[args.outtype]

-    dir_base_model: Path = args.base
+    dir_base_model: Path | None = args.base
    dir_lora: Path = args.lora_path
    lora_config = dir_lora / "adapter_config.json"
    input_model = dir_lora / "adapter_model.safetensors"
@@ -301,9 +308,29 @@ if __name__ == '__main__':
        input_model = os.path.join(dir_lora, "adapter_model.bin")
        lora_model = torch.load(input_model, map_location="cpu", weights_only=True)

+    # load LoRA config
+    with open(lora_config, "r") as f:
+        lparams: dict[str, Any] = json.load(f)
+
    # load base model
-    logger.info(f"Loading base model: {dir_base_model.name}")
-    hparams = Model.load_hparams(dir_base_model)
+    if dir_base_model is None:
+        if "base_model_name_or_path" in lparams:
+            model_id = lparams["base_model_name_or_path"]
+            logger.info(f"Loading base model from Hugging Face: {model_id}")
+            try:
+                hparams = load_hparams_from_hf(model_id)
+            except OSError as e:
+                logger.error(f"Failed to load base model config: {e}")
+                logger.error("Please try downloading the base model and add its path to --base")
+                sys.exit(1)
+        else:
+            logger.error("'base_model_name_or_path' is not found in adapter_config.json")
+            logger.error("Base model config is required. Please download the base model and add its path to --base")
+            sys.exit(1)
+    else:
+        logger.info(f"Loading base model: {dir_base_model.name}")
+        hparams = Model.load_hparams(dir_base_model)
+
    with torch.inference_mode():
        try:
            model_class = Model.from_model_architecture(hparams["architectures"][0])
@@ -323,13 +350,19 @@ if __name__ == '__main__':
                self.dir_model_card = dir_lora_model
                self.lora_alpha = float(lora_alpha)

+            def set_vocab(self):
+                pass
+
            def set_type(self):
                self.gguf_writer.add_type(gguf.GGUFType.ADAPTER)
                self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")

            def set_gguf_parameters(self):
                self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)
-                super().set_gguf_parameters()
+
+            def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+                # Never add extra tensors (e.g. rope_freqs) for LoRA adapters
+                return ()

            def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
                tensor_map: dict[str, PartialLoraTensor] = {}
@@ -344,6 +377,9 @@ if __name__ == '__main__':
                        if ".base_layer.weight" in name:
                            continue
                        logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
+                        if ".embed_tokens.weight" in name or ".lm_head.weight" in name:
+                            logger.error("Embeddings is present in the adapter. This can be due to new tokens added during fine tuning")
+                            logger.error("Please refer to https://github.com/ggerganov/llama.cpp/pull/9948")
                        sys.exit(1)

                    if base_name in tensor_map:
@@ -363,7 +399,13 @@ if __name__ == '__main__':
                    yield (name, cast(torch.Tensor, LoraTorchTensor(tensor.A, tensor.B)))

            def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-                dest = super().modify_tensors(data_torch, name, bid)
+                dest = list(super().modify_tensors(data_torch, name, bid))
+                # some archs may have the same tensor for lm_head and output (tie word embeddings)
+                # in this case, adapters targeting lm_head will fail when using llama-export-lora
+                # therefore, we ignore them for now
+                # see: https://github.com/ggerganov/llama.cpp/issues/9065
+                if name == "lm_head.weight" and len(dest) == 0:
+                    raise ValueError("lm_head is present in adapter, but is ignored in base model")
                for dest_name, dest_data in dest:
                    assert isinstance(dest_data, LoraTorchTensor)
                    lora_a, lora_b = dest_data.get_lora_A_B()
@@ -371,9 +413,6 @@ if __name__ == '__main__':
                    yield (dest_name + ".lora_a", lora_a)
                    yield (dest_name + ".lora_b", lora_b)

-        with open(lora_config, "r") as f:
-            lparams: dict[str, Any] = json.load(f)
-
        alpha: float = lparams["lora_alpha"]

        model_instance = LoraModel(
@@ -386,7 +425,7 @@ if __name__ == '__main__':
            dry_run=args.dry_run,
            dir_lora_model=dir_lora,
            lora_alpha=alpha,
-            is_lora=True,
+            hparams=hparams,
        )

        logger.info("Exporting model...")
--- a/docs/android.md
+++ b/docs/android.md
@@ -2,55 +2,82 @@
 # Android

 ## Build on Android using Termux
-[Termux](https://github.com/termux/termux-app#installation) is a method to execute `llama.cpp` on an Android device (no root required).
-```
-apt update && apt upgrade -y
-apt install git make cmake
-```

-It's recommended to move your model inside the `~/` directory for best performance:
-```
-cd storage/downloads
-mv model.gguf ~/
-```
+[Termux](https://termux.dev/en/) is an Android terminal emulator and Linux environment app (no root required). As of writing, Termux is available experimentally in the Google Play Store; otherwise, it may be obtained directly from the project repo or on F-Droid.

-[Get the code](https://github.com/ggerganov/llama.cpp#get-the-code) & [follow the Linux build instructions](https://github.com/ggerganov/llama.cpp#build) to build `llama.cpp`.
-
-## Building the Project using Android NDK
-Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.
-
-Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux:
-```
-$ mkdir build-android
-$ cd build-android
-$ export NDK=<your_ndk_directory>
-$ cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
-$ make
-```
-
-Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice).
-
-Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission:
-
-(Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`)
-```
-$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/
-$cd /data/data/com.termux/files/home/bin
-$chmod +x ./*
-```
-
-Download model [llama-2-7b-chat.Q4_K_M.gguf](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/blob/main/llama-2-7b-chat.Q4_K_M.gguf), and push it to `/sdcard/llama.cpp/`, then move it to `/data/data/com.termux/files/home/model/`
+With Termux, you can install and run `llama.cpp` as if the environment were Linux. Once in the Termux shell:

 ```
-$mv /sdcard/llama.cpp/llama-2-7b-chat.Q4_K_M.gguf /data/data/com.termux/files/home/model/
+$ apt update && apt upgrade -y
+$ apt install git cmake
 ```

-Now, you can start chatting:
+Then, follow the [build instructions](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md), specifically for CMake.
+
+Once the binaries are built, download your model of choice (e.g., from Hugging Face). It's recommended to place it in the `~/` directory for best performance:
+
 ```
-$cd /data/data/com.termux/files/home/bin
-$./llama-cli -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml
+$ curl -L {model-url} -o ~/{model}.gguf
 ```

-Here's a demo of an interactive session running on Pixel 5 phone:
+Then, if you are not already in the repo directory, `cd` into `llama.cpp` and:
+
+```
+$ ./build/bin/llama-simple -m ~/{model}.gguf -c {context-size} -p "{your-prompt}"
+```
+
+Here, we show `llama-simple`, but any of the executables under `examples` should work, in theory. Be sure to set `context-size` to a reasonable number (say, 4096) to start with; otherwise, memory could spike and kill your terminal.
+
+To see what it might look like visually, here's an old demo of an interactive session running on a Pixel 5 phone:

 https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4
+
+## Cross-compile using Android NDK
+It's possible to build `llama.cpp` for Android on your host system via CMake and the Android NDK. If you are interested in this path, ensure you already have an environment prepared to cross-compile programs for Android (i.e., install the Android SDK). Note that, unlike desktop environments, the Android environment ships with a limited set of native libraries, and so only those libraries are available to CMake when building with the Android NDK (see: https://developer.android.com/ndk/guides/stable_apis.)
+
+Once you're ready and have cloned `llama.cpp`, invoke the following in the project directory:
+
+```
+$ cmake \
+  -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
+  -DANDROID_ABI=arm64-v8a \
+  -DANDROID_PLATFORM=android-28 \
+  -DCMAKE_C_FLAGS="-march=armv8.7a" \
+  -DCMAKE_CXX_FLAGS="-march=armv8.7a" \
+  -DGGML_OPENMP=OFF \
+  -DGGML_LLAMAFILE=OFF \
+  -B build-android
+```
+
+Notes:
+  - While later versions of Android NDK ship with OpenMP, it must still be installed by CMake as a dependency, which is not supported at this time
+  - `llamafile` does not appear to support Android devices (see: https://github.com/Mozilla-Ocho/llamafile/issues/325)
+
+The above command should configure `llama.cpp` with the most performant options for modern devices. Even if your device is not running `armv8.7a`, `llama.cpp` includes runtime checks for available CPU features it can use.
+
+Feel free to adjust the Android ABI for your target. Once the project is configured:
+
+```
+$ cmake --build build-android --config Release -j{n}
+$ cmake --install build-android --prefix {install-dir} --config Release
+```
+
+After installing, go ahead and download the model of your choice to your host system. Then:
+
+```
+$ adb shell "mkdir /data/local/tmp/llama.cpp"
+$ adb push {install-dir} /data/local/tmp/llama.cpp/
+$ adb push {model}.gguf /data/local/tmp/llama.cpp/
+$ adb shell
+```
+
+In the `adb shell`:
+
+```
+$ cd /data/local/tmp/llama.cpp
+$ LD_LIBRARY_PATH=lib ./bin/llama-simple -m {model}.gguf -c {context-size} -p "{your-prompt}"
+```
+
+That's it!
+
+Be aware that Android will not find the library path `lib` on its own, so we must specify `LD_LIBRARY_PATH` in order to run the installed executables. Android does support `RPATH` in later API levels, so this could change in the future. Refer to the previous section for information about `context-size` (very important!) and running other `examples`.
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -26,7 +26,7 @@

 ### Llama.cpp + SYCL

-The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based on the cross-platform feature of SYCL, it could support other vendor GPUs: Nvidia GPU (*AMD GPU coming*).
+The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based on the cross-platform feature of SYCL, it also supports other vendor GPUs: Nvidia and AMD.

 ## Recommended Release

@@ -41,6 +41,8 @@ The following release is verified with good quality:

 ## News

+- 2024.11
+  - Use syclcompat to improve the performance on some platforms. This requires to use oneAPI 2025.0 or newer.

 - 2024.8
  - Use oneDNN as the default GEMM library, improve the compatibility for new Intel GPUs.
@@ -111,10 +113,18 @@ SYCL backend supports Intel GPU Family:

 **Verified devices**

-| Nvidia GPU               | Status  | Verified Model |
-|--------------------------|---------|----------------|
-| Ampere Series            | Support | A100, A4000    |
-| Ampere Series *(Mobile)* | Support | RTX 40 Series  |
+| Nvidia GPU               | Status    | Verified Model |
+|--------------------------|-----------|----------------|
+| Ampere Series            | Supported | A100, A4000    |
+| Ampere Series *(Mobile)* | Supported | RTX 40 Series  |
+
+| AMD GPU                  | Status       | Verified Model |
+|--------------------------|--------------|----------------|
+| Radeon Pro               | Experimental | W6800          |
+| Radeon RX                | Experimental | 6700 XT        |
+
+Note: AMD GPU support is highly experimental and is incompatible with F16.
+Additionally, it only supports GPUs with a sub_group_size (warp size) of 32.

 ## Docker
 The docker build option is currently limited to *intel GPU* targets.
@@ -186,6 +196,10 @@ Platform #0: Intel(R) OpenCL HD Graphics

 In order to target Nvidia GPUs through SYCL, please make sure the CUDA/CUBLAS native requirements *-found [here](README.md#cuda)-* are installed.

+- **AMD GPU**
+
+To target AMD GPUs with SYCL, the ROCm stack must be installed first.
+
 2. **Install Intel® oneAPI Base toolkit**

 - **For Intel GPU**
@@ -212,6 +226,19 @@ cmake -B buildWithCublas -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENAB
 cmake --build buildWithCublas --config Release
 ```

+- **Adding support to AMD GPUs**
+
+**oneAPI Plugin**: In order to enable SYCL support on AMD GPUs, please install the [Codeplay oneAPI Plugin for AMD GPUs](https://developer.codeplay.com/products/oneapi/amd/download). As with Nvidia GPUs, the user should also make sure the plugin version matches the installed base toolkit.
+
+**oneMKL for rocBlas**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* doesn't contain the rocBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *rocBLAS* backend enabled is thus required to run it on AMD GPUs.
+
+```sh
+git clone https://github.com/oneapi-src/oneMKL
+cd oneMKL
+# Find your HIPTARGET with rocminfo, under the key 'Name:'
+cmake -B buildWithrocBLAS -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_ROCBLAS_BACKEND=ON -DHIPTARGETS=${HIPTARGET} -DTARGET_DOMAINS=blas
+cmake --build buildWithrocBLAS --config Release
+```

 3. **Verify installation and environment**

@@ -223,22 +250,32 @@ sycl-ls

 - **Intel GPU**

-When targeting an intel GPU, the user should expect one or more level-zero devices among the available SYCL devices. Please make sure that at least one GPU is present, for instance [`ext_oneapi_level_zero:gpu:0`] in the sample output below:
+When targeting an intel GPU, the user should expect one or more level-zero devices among the available SYCL devices. Please make sure that at least one GPU is present, for instance [`level_zero:gpu`] in the sample output below:

 ```
-[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2  [2023.16.10.0.17_160000]
-[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
-[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO  [23.30.26918.50]
-[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
+[opencl:acc][opencl:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2  [2023.16.10.0.17_160000]
+[opencl:cpu][opencl:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
+[opencl:gpu][opencl:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO  [23.30.26918.50]
+[level_zero:gpu][level_zero:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
 ```

 - **Nvidia GPU**

-Similarly, user targeting Nvidia GPUs should expect at least one SYCL-CUDA device [`ext_oneapi_cuda:gpu`] as bellow:
+Similarly, user targeting Nvidia GPUs should expect at least one SYCL-CUDA device [`cuda:gpu`] as below:
+
 ```
-[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2  [2023.16.12.0.12_195853.xmain-hotfix]
-[opencl:cpu:1] Intel(R) OpenCL, Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz OpenCL 3.0 (Build 0) [2023.16.12.0.12_195853.xmain-hotfix]
-[ext_oneapi_cuda:gpu:0] NVIDIA CUDA BACKEND, NVIDIA A100-PCIE-40GB 8.0 [CUDA 12.2]
+[opencl:acc][opencl:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2  [2023.16.12.0.12_195853.xmain-hotfix]
+[opencl:cpu][opencl:1] Intel(R) OpenCL, Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz OpenCL 3.0 (Build 0) [2023.16.12.0.12_195853.xmain-hotfix]
+[cuda:gpu][cuda:0] NVIDIA CUDA BACKEND, NVIDIA A100-PCIE-40GB 8.0 [CUDA 12.5]
+```
+
+- **AMD GPU**
+
+For AMD GPUs we should expect at least one SYCL-HIP device [`hip:gpu`]:
+
+```
+[opencl:cpu][opencl:0] Intel(R) OpenCL, 12th Gen Intel(R) Core(TM) i9-12900K OpenCL 3.0 (Build 0) [2024.18.6.0.02_160000]
+[hip:gpu][hip:0] AMD HIP BACKEND, AMD Radeon PRO W6800 gfx1030 [HIP 60140.9]
 ```

 ### II. Build llama.cpp
@@ -266,6 +303,7 @@ cmake --build build --config Release -j -v
 ```

 #### Nvidia GPU
+
 ```sh
 # Export relevant ENV variables
 export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LD_LIBRARY_PATH
@@ -274,16 +312,37 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithCublas/include:$CPLUS_INCLUDE_
 export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR

 # Build LLAMA with Nvidia BLAS acceleration through SYCL
+# Setting GGML_SYCL_DEVICE_ARCH is optional but can improve performance
+GGML_SYCL_DEVICE_ARCH=sm_80 # Example architecture

 # Option 1: Use FP32 (recommended for better performance in most cases)
-cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx

 # Option 2: Use FP16
-cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
+cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON

 # build all binary
 cmake --build build --config Release -j -v
+```

+#### AMD GPU
+
+```sh
+# Export relevant ENV variables
+export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LD_LIBRARY_PATH
+export LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LIBRARY_PATH
+export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithrocBLAS/include:$CPLUS_INCLUDE_DIR
+
+# Build LLAMA with rocBLAS acceleration through SYCL
+
+## AMD
+# Use FP32, FP16 is not supported
+# Find your GGML_SYCL_DEVICE_ARCH with rocminfo, under the key 'Name:'
+GGML_SYCL_DEVICE_ARCH=gfx90a # Example architecture
+cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=AMD -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+
+# build all binary
+cmake --build build --config Release -j -v
 ```

 ### III. Run the inference
@@ -323,7 +382,7 @@ found 2 SYCL devices:

 |Chosen Device ID|Setting|
 |-|-|
-|0|`export ONEAPI_DEVICE_SELECTOR="level_zero:1"` or no action|
+|0|`export ONEAPI_DEVICE_SELECTOR="level_zero:0"` or no action|
 |1|`export ONEAPI_DEVICE_SELECTOR="level_zero:1"`|
 |0 & 1|`export ONEAPI_DEVICE_SELECTOR="level_zero:0;level_zero:1"`|

@@ -336,12 +395,12 @@ Choose one of following methods to run.
 - Use device 0:

 ```sh
-./examples/sycl/run_llama2.sh 0
+./examples/sycl/run-llama2.sh 0
 ```
 - Use multiple devices:

 ```sh
-./examples/sycl/run_llama2.sh
+./examples/sycl/run-llama2.sh
 ```

 2. Command line
@@ -586,11 +645,12 @@ use 1 SYCL GPUs: [0] with Max compute units:512

 #### Build

-| Name               | Value                             | Function                                    |
-|--------------------|-----------------------------------|---------------------------------------------|
-| GGML_SYCL          | ON (mandatory)                    | Enable build with SYCL code path.<br>FP32 path - recommended for better perforemance than FP16 on quantized model|
-| GGML_SYCL_TARGET   | INTEL *(default)* \| NVIDIA       | Set the SYCL target device type.            |
-| GGML_SYCL_F16      | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path.      |
+| Name               | Value                                 | Function                                    |
+|--------------------|---------------------------------------|---------------------------------------------|
+| GGML_SYCL          | ON (mandatory)                        | Enable build with SYCL code path.<br>FP32 path - recommended for better perforemance than FP16 on quantized model|
+| GGML_SYCL_TARGET   | INTEL *(default)* \| NVIDIA \| AMD    | Set the SYCL target device type.            |
+| GGML_SYCL_DEVICE_ARCH | Optional (except for AMD)          | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. |
+| GGML_SYCL_F16      | OFF *(default)* \|ON *(optional)*     | Enable FP16 build with SYCL code path.      |
 | CMAKE_C_COMPILER   | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path.      |
 | CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)*   | Set `icpx/icx` compiler for SYCL code path. |

@@ -636,6 +696,14 @@ use 1 SYCL GPUs: [0] with Max compute units:512

  It's same for other projects including llama.cpp SYCL backend.

+- Meet issue: `Native API failed. Native API returns: -6 (PI_ERROR_OUT_OF_HOST_MEMORY) -6 (PI_ERROR_OUT_OF_HOST_MEMORY) -999 (UNKNOWN PI error)` or `failed to allocate SYCL0 buffer`
+
+  Device Memory is not enough.
+
+  |Reason|Solution|
+  |-|-|
+  |Default Context is too big. It leads to more memory usage.|Set `-c 8192` or smaller value.|
+  |Model is big and require more memory than device's.|Choose smaller quantized model, like Q5 -> Q4;<br>Use more than one devices to load model.|

 ### **GitHub contribution**:
 Please add the **[SYCL]** prefix/tag in issues/PRs titles to help the SYCL-team check/address them without delay.
--- a/docs/build.md
+++ b/docs/build.md
@@ -186,18 +186,16 @@ The following compilation options are also available to tweak performance:

 | Option                        | Legal values           | Default | Description                                                                                                                                                                                                                                                                             |
 |-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| GGML_CUDA_FORCE_DMMV          | Boolean                | false   | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
-| GGML_CUDA_DMMV_X              | Positive integer >= 32 | 32      | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants.                                         |
-| GGML_CUDA_MMV_Y               | Positive integer       | 1       | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended.                                                                                                                                         |
 | GGML_CUDA_FORCE_MMQ           | Boolean                | false   | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower.                       |
 | GGML_CUDA_FORCE_CUBLAS        | Boolean                | false   | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models                                                                                                                                                                                       |
 | GGML_CUDA_F16                 | Boolean                | false   | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs.                                                           |
-| GGML_CUDA_KQUANTS_ITER        | 1 or 2                 | 2       | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs.                                                                                                                     |
 | GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer       | 128     | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial.                                                                         |
 | GGML_CUDA_FA_ALL_QUANTS       | Boolean                | false   | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer.                                                                                                  |

 ### MUSA

+This provides GPU acceleration using the MUSA cores of your Moore Threads MTT GPU. Make sure to have the MUSA SDK installed. You can download it from here: [MUSA SDK](https://developer.mthreads.com/sdk/download/musa).
+
 - Using `make`:
  ```bash
  make GGML_MUSA=1
@@ -209,6 +207,12 @@ The following compilation options are also available to tweak performance:
  cmake --build build --config Release
  ```

+The environment variable [`MUSA_VISIBLE_DEVICES`](https://docs.mthreads.com/musa-sdk/musa-sdk-doc-online/programming_guide/Z%E9%99%84%E5%BD%95/) can be used to specify which GPU(s) will be used.
+
+The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted.
+
+Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet.
+
 ### hipBLAS

 This provides BLAS acceleration on HIP-supported AMD GPUs.
@@ -222,7 +226,7 @@ You can download it from your Linux distro's package manager or from here: [ROCm
 - Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
  ```bash
  HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
-      cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
+      cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
      && cmake --build build --config Release -- -j 16
  ```
  On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`.
@@ -239,7 +243,7 @@ You can download it from your Linux distro's package manager or from here: [ROCm
  ```bash
  HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \
  HIP_DEVICE_LIB_PATH=<directory-you-just-found> \
-      cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
+      cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
      && cmake --build build -- -j 16
  ```

@@ -251,7 +255,7 @@ You can download it from your Linux distro's package manager or from here: [ROCm
 - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
  ```bash
  set PATH=%HIP_PATH%\bin;%PATH%
-  cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DGGML_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
+  cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DGGML_HIP=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
  cmake --build build
  ```
  Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
@@ -260,13 +264,6 @@ You can download it from your Linux distro's package manager or from here: [ROCm

 The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
 If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
-The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):
-
-| Option                 | Legal values           | Default | Description                                                                                                                                                                                                                                    |
-|------------------------|------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| GGML_CUDA_DMMV_X       | Positive integer >= 32 | 32      | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
-| GGML_CUDA_MMV_Y        | Positive integer       | 1       | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants.                                                                       |
-| GGML_CUDA_KQUANTS_ITER | 1 or 2                 | 2       | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs.                                                                             |

 ### Vulkan

@@ -274,9 +271,9 @@ The following compilation options are also available to tweak performance (yes,

 #### w64devkit

-Download and extract [w64devkit](https://github.com/skeeto/w64devkit/releases).
+Download and extract [`w64devkit`](https://github.com/skeeto/w64devkit/releases).

-Download and install the [Vulkan SDK](https://vulkan.lunarg.com/sdk/home#windows). When selecting components, only the Vulkan SDK Core is required.
+Download and install the [`Vulkan SDK`](https://vulkan.lunarg.com/sdk/home#windows) with the default settings.

 Launch `w64devkit.exe` and run the following commands to copy Vulkan dependencies:
 ```sh
@@ -294,6 +291,29 @@ EOF
 ```
 Switch into the `llama.cpp` directory and run `make GGML_VULKAN=1`.

+#### Git Bash MINGW64
+
+Download and install [`Git-SCM`](https://git-scm.com/downloads/win) with the default settings
+
+Download and install [`Visual Studio Community Edition`](https://visualstudio.microsoft.com/) and make sure you select `C++`
+
+Download and install [`CMake`](https://cmake.org/download/) with the default settings
+
+Download and install the [`Vulkan SDK`](https://vulkan.lunarg.com/sdk/home#windows) with the default settings.
+
+Go into your `llama.cpp` directory and right click, select `Open Git Bash Here` and then run the following commands
+
+```
+cmake -B build -DGGML_VULKAN=ON
+cmake --build build --config Release
+```
+
+Now you can load the model in conversation mode using `Vulkan`
+
+```
+build/bin/release/llama-cli -m "[PATH TO MODEL]" -ngl 100 -c 16384 -t 10 -n -2 -cnv
+```
+
 #### MSYS2
 Install [MSYS2](https://www.msys2.org/) and then run the following commands in a UCRT terminal to install dependencies.
  ```sh
@@ -367,7 +387,7 @@ cmake --build build --config release

 You can test with:

-`./build/llama-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -ngl 32`
+`./build/bin/llama-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -ngl 32`

 If the fllowing info is output on screen, you are using `llama.cpp by CANN backend`:
 ```bash
@@ -380,3 +400,9 @@ For detailed info, such as model/device supports, CANN install, please refer to
 ### Android

 To read documentation for how to build on Android, [click here](./android.md)
+
+### Arm CPU optimized mulmat kernels
+
+Llama.cpp includes a set of optimized mulmat kernels for the Arm architecture, leveraging Arm® Neon™, int8mm and SVE instructions. These kernels are enabled at build time through the appropriate compiler cpu-type flags, such as `-DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve`. Note that these optimized kernels require the model to be quantized into one of the formats: `Q4_0_4_4` (Arm Neon), `Q4_0_4_8` (int8mm) or `Q4_0_8_8` (SVE). The SVE mulmat kernel specifically requires a vector width of 256 bits. When running on devices with a different vector width, it is recommended to use the `Q4_0_4_8` (int8mm) or `Q4_0_4_4` (Arm Neon) formats for better performance. Refer to [examples/quantize/README.md](../examples/quantize/README.md) for more information on the quantization formats.
+
+To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_LLAMAFILE=OFF` (`cmake`).
--- a/docs/docker.md
+++ b/docs/docker.md
@@ -19,8 +19,11 @@ Additionally, there the following images, similar to the above:
 - `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
 - `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
 - `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
+- `ghcr.io/ggerganov/llama.cpp:full-musa`: Same as `full` but compiled with MUSA support. (platforms: `linux/amd64`)
+- `ghcr.io/ggerganov/llama.cpp:light-musa`: Same as `light` but compiled with MUSA support. (platforms: `linux/amd64`)
+- `ghcr.io/ggerganov/llama.cpp:server-musa`: Same as `server` but compiled with MUSA support. (platforms: `linux/amd64`)

-The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now).
+The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA, ROCm or MUSA library, you'll need to build the images locally for now).

 ## Usage

@@ -66,8 +69,8 @@ You may want to pass in some different `ARGS`, depending on the CUDA environment

 The defaults are:

- `CUDA_VERSION` set to `11.7.1`
- `CUDA_DOCKER_ARCH` set to `all`
+- `CUDA_VERSION` set to `12.6.0`
+- `CUDA_DOCKER_ARCH` set to the cmake build default, which includes all the supported architectures

 The resulting images, are essentially the same as the non-CUDA images:

@@ -84,3 +87,37 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run
 docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
 docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
 ```
+
+## Docker With MUSA
+
+Assuming one has the [mt-container-toolkit](https://developer.mthreads.com/musa/native) properly installed on Linux, `muBLAS` should be accessible inside the container.
+
+## Building Docker locally
+
+```bash
+docker build -t local/llama.cpp:full-musa -f .devops/full-musa.Dockerfile .
+docker build -t local/llama.cpp:light-musa -f .devops/llama-cli-musa.Dockerfile .
+docker build -t local/llama.cpp:server-musa -f .devops/llama-server-musa.Dockerfile .
+```
+
+You may want to pass in some different `ARGS`, depending on the MUSA environment supported by your container host, as well as the GPU architecture.
+
+The defaults are:
+
+- `MUSA_VERSION` set to `rc3.1.0`
+
+The resulting images, are essentially the same as the non-MUSA images:
+
+1. `local/llama.cpp:full-musa`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
+2. `local/llama.cpp:light-musa`: This image only includes the main executable file.
+3. `local/llama.cpp:server-musa`: This image only includes the server executable file.
+
+## Usage
+
+After building locally, Usage is similar to the non-MUSA examples, but you'll need to set `mthreads` as default Docker runtime. This can be done by executing `(cd /usr/bin/musa && sudo ./docker setup $PWD)` and verifying the changes by executing `docker info | grep mthreads` on the host machine. You will also want to use the `--n-gpu-layers` flag.
+
+```bash
+docker run -v /path/to/models:/models local/llama.cpp:full-musa --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
+docker run -v /path/to/models:/models local/llama.cpp:light-musa -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
+docker run -v /path/to/models:/models local/llama.cpp:server-musa -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
+```
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -13,10 +13,8 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 if (EMSCRIPTEN)
 else()
    add_subdirectory(cvector-generator)
-    add_subdirectory(baby-llama)
    add_subdirectory(batched-bench)
    add_subdirectory(batched)
-    add_subdirectory(benchmark)
    add_subdirectory(convert-llama2c-to-ggml)
    add_subdirectory(embedding)
    add_subdirectory(eval-callback)
@@ -50,6 +48,7 @@ else()
    endif()
    add_subdirectory(save-load-state)
    add_subdirectory(simple)
+    add_subdirectory(simple-chat)
    add_subdirectory(speculative)
    add_subdirectory(tokenize)
 endif()
--- a/examples/baby-llama/baby-llama.cpp
+++ b/examples/baby-llama/baby-llama.cpp
--- a/examples/batched-bench/README.md
+++ b/examples/batched-bench/README.md
@@ -49,3 +49,12 @@ There are 2 modes of operation:
 |   128 |    256 |    8 |   3072 |    0.751 |  1363.92 |   15.110 |   135.54 |   15.861 |   193.69 |
 |   128 |    256 |   16 |   6144 |    1.569 |  1304.93 |   18.073 |   226.64 |   19.642 |   312.80 |
 |   128 |    256 |   32 |  12288 |    3.409 |  1201.35 |   19.223 |   426.15 |   22.633 |   542.93 |
+
+### JSONL output
+
+Pass `--output-format jsonl` to output JSONL instead of Markdown, á la
+
+```json lines
+{"n_kv_max": 2048, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "is_pp_shared": 0, "n_gpu_layers": 99, "n_threads": 8, "n_threads_batch": 8, "pp": 128, "tg": 128, "pl": 1, "n_kv": 256, "t_pp": 0.233810, "speed_pp": 547.453064, "t_tg": 3.503684, "speed_tg": 36.532974, "t": 3.737494, "speed": 68.495094}
+{"n_kv_max": 2048, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "is_pp_shared": 0, "n_gpu_layers": 99, "n_threads": 8, "n_threads_batch": 8, "pp": 128, "tg": 128, "pl": 2, "n_kv": 512, "t_pp": 0.422602, "speed_pp": 605.770935, "t_tg": 11.106112, "speed_tg": 23.050371, "t": 11.528713, "speed": 44.410854}
+```
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -1,49 +1,28 @@
+#include "arg.h"
 #include "common.h"
+#include "log.h"
 #include "llama.h"

 #include <algorithm>
-#include <cmath>
 #include <cstdio>
 #include <string>
 #include <vector>

-// mutates the input string
-static std::vector<int> parse_list(char * p) {
-    std::vector<int> ret;
-
-    char * q = p;
-
-    while (*p) {
-        if (*p == ',') {
-            *p = '\0';
-            ret.push_back(std::atoi(q));
-            q = p + 1;
-        }
-
-        ++p;
-    }
-
-    ret.push_back(std::atoi(q));
-
-    return ret;
-}
-
-static void print_usage(int argc, char ** argv, const gpt_params & params) {
-    gpt_params_print_usage(argc, argv, params);
-
-    LOG_TEE("\nexample usage:\n");
-    LOG_TEE("\n    %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
-    LOG_TEE("\n");
+static void print_usage(int, char ** argv) {
+    LOG("\nexample usage:\n");
+    LOG("\n    %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
+    LOG("\n");
 }

 int main(int argc, char ** argv) {
-    gpt_params params;
+    common_params params;

-    if (!gpt_params_parse(argc, argv, params)) {
-        print_usage(argc, argv, params);
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) {
        return 1;
    }

+    common_init();
+
    int is_pp_shared = params.is_pp_shared;

    std::vector<int> n_pp = params.n_pp;
@@ -57,7 +36,7 @@ int main(int argc, char ** argv) {

    // initialize the model

-    llama_model_params model_params = llama_model_params_from_gpt_params(params);
+    llama_model_params model_params = common_model_params_to_llama(params);

    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);

@@ -66,7 +45,7 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
+    llama_context_params ctx_params = common_context_params_to_llama(params);

    // ensure enough sequences are available
    ctx_params.n_seq_max = n_pl.empty() ? 1 : *std::max_element(n_pl.begin(), n_pl.end());
@@ -95,12 +74,11 @@ int main(int argc, char ** argv) {
                batch.n_seq_id + i,
                batch.seq_id   + i,
                batch.logits   + i,
-                0, 0, 0, // unused
            };

            const int ret = llama_decode(ctx, batch_view);
            if (ret != 0) {
-                LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
+                LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
                return false;
            }

@@ -113,21 +91,22 @@ int main(int argc, char ** argv) {
    // warm up
    {
        for (int i = 0; i < 16; ++i) {
-            llama_batch_add(batch, 0, i, { 0 }, false);
+            common_batch_add(batch, 0, i, { 0 }, false);
        }

        if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
-            LOG_TEE("%s: llama_decode() failed\n", __func__);
+            LOG_ERR("%s: llama_decode() failed\n", __func__);
            return 1;
        }
    }

-    LOG_TEE("\n");
-    LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
-    LOG_TEE("\n");
-
-    LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP",     "TG",     "B",    "N_KV",     "T_PP s",   "S_PP t/s", "T_TG s",   "S_TG t/s", "T s",      "S t/s");
-    LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
+    if (!params.batched_bench_output_jsonl) {
+        LOG("\n");
+        LOG("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
+        LOG("\n");
+        LOG("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
+        LOG("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
+    }

    for (        int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) {
        for (    int i_tg = 0; i_tg < (int) n_tg.size(); ++i_tg) {
@@ -142,11 +121,11 @@ int main(int argc, char ** argv) {
                    continue;
                }

-                llama_batch_clear(batch);
+                common_batch_clear(batch);

                for (int i = 0; i < pp; ++i) {
                    for (int j = 0; j < (is_pp_shared ? 1 : pl); ++j) {
-                        llama_batch_add(batch, 0, i, { j }, false);
+                        common_batch_add(batch, 0, i, { j }, false);
                    }
                }
                batch.logits[batch.n_tokens - 1] = true;
@@ -156,7 +135,7 @@ int main(int argc, char ** argv) {
                llama_kv_cache_clear(ctx);

                if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
-                    LOG_TEE("%s: llama_decode() failed\n", __func__);
+                    LOG_ERR("%s: llama_decode() failed\n", __func__);
                    return 1;
                }

@@ -171,14 +150,14 @@ int main(int argc, char ** argv) {
                const auto t_tg_start = ggml_time_us();

                for (int i = 0; i < tg; ++i) {
-                    llama_batch_clear(batch);
+                    common_batch_clear(batch);

                    for (int j = 0; j < pl; ++j) {
-                        llama_batch_add(batch, 0, pp + i, { j }, true);
+                        common_batch_add(batch, 0, pp + i, { j }, true);
                    }

                    if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
-                        LOG_TEE("%s: llama_decode() failed\n", __func__);
+                        LOG_ERR("%s: llama_decode() failed\n", __func__);
                        return 1;
                    }
                }
@@ -195,12 +174,22 @@ int main(int argc, char ** argv) {
                const float speed_tg = pl*tg / t_tg;
                const float speed    = n_kv / t;

-                LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
+                if(params.batched_bench_output_jsonl) {
+                    LOG(
+                        "{\"n_kv_max\": %d, \"n_batch\": %d, \"n_ubatch\": %d, \"flash_attn\": %d, \"is_pp_shared\": %d, \"n_gpu_layers\": %d, \"n_threads\": %u, \"n_threads_batch\": %u, "
+                        "\"pp\": %d, \"tg\": %d, \"pl\": %d, \"n_kv\": %d, \"t_pp\": %f, \"speed_pp\": %f, \"t_tg\": %f, \"speed_tg\": %f, \"t\": %f, \"speed\": %f}\n",
+                        n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch,
+                        pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed
+                    );
+                } else {
+                    LOG("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
+                }
            }
        }
    }

-    llama_print_timings(ctx);
+    LOG("\n");
+    llama_perf_context_print(ctx);

    llama_batch_free(batch);

@@ -209,7 +198,7 @@ int main(int argc, char ** argv) {

    llama_backend_free();

-    fprintf(stderr, "\n\n");
+    LOG("\n\n");

    return 0;
 }
--- a/examples/batched.swift/Sources/main.swift
+++ b/examples/batched.swift/Sources/main.swift
@@ -27,7 +27,6 @@ guard let model = llama_load_model_from_file(modelPath.cString(using: .utf8), mo
    print("Failed to load model")
    exit(1)
 }
-
 defer {
    llama_free_model(model)
 }
@@ -37,7 +36,6 @@ var tokens = tokenize(text: prompt, add_bos: true)
 let n_kv_req = UInt32(tokens.count) + UInt32((n_len - Int(tokens.count)) * n_parallel)

 var context_params = llama_context_default_params()
-context_params.seed = 1234
 context_params.n_ctx = n_kv_req
 context_params.n_batch = UInt32(max(n_len, n_parallel))
 context_params.n_threads = 8
@@ -48,11 +46,26 @@ guard context != nil else {
    print("Failed to initialize context")
    exit(1)
 }
-
 defer {
    llama_free(context)
 }

+var sparams = llama_sampler_chain_default_params()
+
+let smpl = llama_sampler_chain_init(sparams)
+guard smpl != nil else {
+    print("Failed to initialize sampling")
+    exit(1)
+}
+defer {
+    llama_sampler_free(smpl)
+}
+
+llama_sampler_chain_add(smpl, llama_sampler_init_top_k(40));
+llama_sampler_chain_add(smpl, llama_sampler_init_top_p(0.9, 1));
+llama_sampler_chain_add(smpl, llama_sampler_init_temp (0.4));
+llama_sampler_chain_add(smpl, llama_sampler_init_dist (1234));
+
 let n_ctx = llama_n_ctx(context)

 print("\nn_len = \(n_len), n_ctx = \(n_ctx), n_batch = \(context_params.n_batch), n_parallel = \(n_parallel), n_kv_req = \(n_kv_req)\n")
@@ -125,32 +138,7 @@ while n_cur <= n_len {
            continue
        }

-        var n_vocab = llama_n_vocab(model)
-        var logits = llama_get_logits_ith(context, i_batch[i])
-
-        var candidates: [llama_token_data] = .init(repeating: llama_token_data(), count: Int(n_vocab))
-
-        for token_id in 0 ..< n_vocab {
-            candidates.append(llama_token_data(id: token_id, logit: logits![Int(token_id)], p: 0.0))
-        }
-
-        var candidates_p: llama_token_data_array = .init(
-            data: &candidates,
-            size: candidates.count,
-            sorted: false
-        )
-
-        let top_k: Int32 = 40
-        let top_p: Float = 0.9
-        let temp: Float = 0.4
-
-        llama_sample_top_k(context, &candidates_p, top_k, 1)
-        llama_sample_top_p(context, &candidates_p, top_p, 1)
-        llama_sample_temp(context, &candidates_p, temp)
-
-        let new_token_id = llama_sample_token(context, &candidates_p)
-
-        // const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
+        let new_token_id = llama_sampler_sample(smpl, context, i_batch[i])

        // is it an end of stream? -> mark the stream as finished
        if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
@@ -210,9 +198,10 @@ if n_parallel > 1 {

 let t_main_end = ggml_time_us()

-print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n")
+print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n\n")

-llama_print_timings(context)
+llama_perf_sampler_print(smpl)
+llama_perf_context_print(context)

 private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
    let utf8Count = text.utf8.count
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -1,31 +1,30 @@
+#include "arg.h"
 #include "common.h"
+#include "log.h"
 #include "llama.h"

 #include <algorithm>
-#include <cmath>
 #include <cstdio>
 #include <string>
 #include <vector>

-static void print_usage(int argc, char ** argv, const gpt_params & params) {
-    gpt_params_print_usage(argc, argv, params);
-
-    LOG_TEE("\nexample usage:\n");
-    LOG_TEE("\n    %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
-    LOG_TEE("\n");
+static void print_usage(int, char ** argv) {
+    LOG("\nexample usage:\n");
+    LOG("\n    %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
+    LOG("\n");
 }

 int main(int argc, char ** argv) {
-    gpt_params params;
+    common_params params;

    params.prompt = "Hello my name is";
    params.n_predict = 32;

-    if (!gpt_params_parse(argc, argv, params)) {
-        print_usage(argc, argv, params);
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
        return 1;
    }

+    common_init();

    // number of parallel batches
    int n_parallel = params.n_parallel;
@@ -40,57 +39,64 @@ int main(int argc, char ** argv) {

    // initialize the model

-    llama_model_params model_params = llama_model_params_from_gpt_params(params);
+    llama_model_params model_params = common_model_params_to_llama(params);

    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);

    if (model == NULL) {
-        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
+        LOG_ERR("%s: error: unable to load model\n" , __func__);
        return 1;
    }

    // tokenize the prompt

    std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize(model, params.prompt, true);
+    tokens_list = common_tokenize(model, params.prompt, true);

    const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size())*n_parallel;

    // initialize the context

-    llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
+    llama_context_params ctx_params = common_context_params_to_llama(params);

    ctx_params.n_ctx   = n_kv_req;
    ctx_params.n_batch = std::max(n_predict, n_parallel);

    llama_context * ctx = llama_new_context_with_model(model, ctx_params);

+    auto sparams = llama_sampler_chain_default_params();
+
+    llama_sampler * smpl = llama_sampler_chain_init(sparams);
+
+    llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sparams.top_k));
+    llama_sampler_chain_add(smpl, llama_sampler_init_top_p(params.sparams.top_p, params.sparams.min_keep));
+    llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sparams.temp));
+    llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sparams.seed));
+
    if (ctx == NULL) {
-        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+        LOG_ERR("%s: error: failed to create the llama_context\n" , __func__);
        return 1;
    }

    const int n_ctx = llama_n_ctx(ctx);

-    LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
+    LOG_INF("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);

    // make sure the KV cache is big enough to hold all the prompt and generated tokens
    if (n_kv_req > n_ctx) {
-        LOG_TEE("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__,  n_kv_req);
-        LOG_TEE("%s:        either reduce n_parallel or increase n_ctx\n", __func__);
+        LOG_ERR("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__,  n_kv_req);
+        LOG_ERR("%s:        either reduce n_parallel or increase n_ctx\n", __func__);
        return 1;
    }

    // print the prompt token-by-token

-    fprintf(stderr, "\n");
+    LOG("\n");

    for (auto id : tokens_list) {
-        fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
+        LOG("%s", common_token_to_piece(ctx, id).c_str());
    }

-    fflush(stderr);
-
    // create a llama_batch
    // we use this object to submit token data for decoding
    llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t) n_parallel), 0, n_parallel);
@@ -102,13 +108,13 @@ int main(int argc, char ** argv) {

    // evaluate the initial prompt
    for (size_t i = 0; i < tokens_list.size(); ++i) {
-        llama_batch_add(batch, tokens_list[i], i, seq_ids, false);
+        common_batch_add(batch, tokens_list[i], i, seq_ids, false);
    }
    GGML_ASSERT(batch.n_tokens == (int) tokens_list.size());

    if (llama_model_has_encoder(model)) {
        if (llama_encode(ctx, batch)) {
-            LOG_TEE("%s : failed to eval\n", __func__);
+            LOG_ERR("%s : failed to eval\n", __func__);
            return 1;
        }

@@ -117,15 +123,15 @@ int main(int argc, char ** argv) {
            decoder_start_token_id = llama_token_bos(model);
        }

-        llama_batch_clear(batch);
-        llama_batch_add(batch, decoder_start_token_id, 0, seq_ids, false);
+        common_batch_clear(batch);
+        common_batch_add(batch, decoder_start_token_id, 0, seq_ids, false);
    }

    // llama_decode will output logits only for the last token of the prompt
    batch.logits[batch.n_tokens - 1] = true;

    if (llama_decode(ctx, batch) != 0) {
-        LOG_TEE("%s: llama_decode() failed\n", __func__);
+        LOG_ERR("%s: llama_decode() failed\n", __func__);
        return 1;
    }

@@ -136,7 +142,7 @@ int main(int argc, char ** argv) {
    //}

    if (n_parallel > 1) {
-        LOG_TEE("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);
+        LOG("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);
    }

    // main loop
@@ -155,7 +161,7 @@ int main(int argc, char ** argv) {

    while (n_cur <= n_predict) {
        // prepare the next batch
-        llama_batch_clear(batch);
+        common_batch_clear(batch);

        // sample the next token for each parallel sequence / stream
        for (int32_t i = 0; i < n_parallel; ++i) {
@@ -164,36 +170,14 @@ int main(int argc, char ** argv) {
                continue;
            }

-            auto   n_vocab = llama_n_vocab(model);
-            auto * logits  = llama_get_logits_ith(ctx, i_batch[i]);
-
-            std::vector<llama_token_data> candidates;
-            candidates.reserve(n_vocab);
-
-            for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-                candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
-            }
-
-            llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-
-            const int   top_k = 40;
-            const float top_p = 0.9f;
-            const float temp  = 0.4f;
-
-            llama_sample_top_k(ctx, &candidates_p, top_k, 1);
-            llama_sample_top_p(ctx, &candidates_p, top_p, 1);
-            llama_sample_temp (ctx, &candidates_p, temp);
-
-            const llama_token new_token_id = llama_sample_token(ctx, &candidates_p);
-
-            //const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
+            const llama_token new_token_id = llama_sampler_sample(smpl, ctx, i_batch[i]);

            // is it an end of generation? -> mark the stream as finished
            if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
                i_batch[i] = -1;
-                LOG_TEE("\n");
+                LOG("\n");
                if (n_parallel > 1) {
-                    LOG_TEE("%s: stream %d finished at n_cur = %d", __func__, i, n_cur);
+                    LOG_INF("%s: stream %d finished at n_cur = %d", __func__, i, n_cur);
                }

                continue;
@@ -201,16 +185,15 @@ int main(int argc, char ** argv) {

            // if there is only one stream, we print immediately to stdout
            if (n_parallel == 1) {
-                LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str());
-                fflush(stdout);
+                LOG("%s", common_token_to_piece(ctx, new_token_id).c_str());
            }

-            streams[i] += llama_token_to_piece(ctx, new_token_id);
+            streams[i] += common_token_to_piece(ctx, new_token_id);

            i_batch[i] = batch.n_tokens;

            // push this new token for next evaluation
-            llama_batch_add(batch, new_token_id, n_cur, { i }, true);
+            common_batch_add(batch, new_token_id, n_cur, { i }, true);

            n_decode += 1;
        }
@@ -224,32 +207,33 @@ int main(int argc, char ** argv) {

        // evaluate the current batch with the transformer model
        if (llama_decode(ctx, batch)) {
-            fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
+            LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
            return 1;
        }
    }

-    LOG_TEE("\n");
-
    if (n_parallel > 1) {
-        LOG_TEE("\n");
+        LOG("\n");

        for (int32_t i = 0; i < n_parallel; ++i) {
-            LOG_TEE("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str());
+            LOG("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str());
        }
    }

    const auto t_main_end = ggml_time_us();

-    LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
+    LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
            __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));

-    llama_print_timings(ctx);
+    LOG("\n");
+    llama_perf_sampler_print(smpl);
+    llama_perf_context_print(ctx);

    fprintf(stderr, "\n");

    llama_batch_free(batch);

+    llama_sampler_free(smpl);
    llama_free(ctx);
    llama_free_model(model);

--- a/examples/benchmark/CMakeLists.txt
+++ b/examples/benchmark/CMakeLists.txt
@@ -1,6 +0,0 @@
-set(TARGET llama-bench-matmult)
-add_executable(${TARGET} benchmark-matmult.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
-target_include_directories(${TARGET} PRIVATE ../../common)
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/benchmark/benchmark-matmult.cpp
+++ b/examples/benchmark/benchmark-matmult.cpp
@@ -1,275 +0,0 @@
-#include "common.h"
-#include "ggml.h"
-
-#include <locale.h>
-#include <assert.h>
-#include <math.h>
-#include <cstring>
-#include <cstdio>
-#include <cinttypes>
-#include <unordered_map>
-#include <queue>
-#include <string.h>
-#include <cassert>
-#include <fstream>
-#include <string>
-#include <iterator>
-#include <algorithm>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
-
-    if (plan.work_size > 0) {
-        buf.resize(plan.work_size);
-        plan.work_data = buf.data();
-    }
-
-    ggml_graph_compute(graph, &plan);
-}
-
-static float tensor_sum_elements(const ggml_tensor * tensor) {
-    double sum = 0;
-    if (tensor->type == GGML_TYPE_F32) {
-        for (int j = 0; j < tensor->ne[1]; j++) {
-            for (int k = 0; k < tensor->ne[0]; k++) {
-                sum += ((float *) tensor->data)[j*tensor->ne[0] + k];
-            }
-        }
-    }
-    return sum;
-}
-
-static void tensor_dump(const ggml_tensor * tensor, const char * name) {
-    printf("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi) - ", name,
-        tensor->type, ggml_type_name(tensor->type),
-        tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]);
-    float sum = tensor_sum_elements(tensor);
-    printf("Sum of tensor %s is %6.2f\n", name, sum);
-}
-
-#define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor)
-
-struct benchmark_params_struct {
-    int32_t n_threads     = 1;
-    int32_t n_iterations  = 10;
-};
-
-static void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) {
-    fprintf(stderr, "usage: %s [options]\n", argv[0]);
-    fprintf(stderr, "\n");
-    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h, --help            show this help message and exit\n");
-    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
-    fprintf(stderr, "  -i N, --iter N     number of iterations to use during computation (default: %d)\n", params.n_iterations);
-    fprintf(stderr, "\n");
-}
-
-int main(int argc, char ** argv)  {
-    struct benchmark_params_struct benchmark_params;
-
-    bool invalid_param = false;
-    std::string arg;
-    for (int i = 1; i < argc; i++) {
-        arg = argv[i];
-
-        if (arg == "-t" || arg == "--threads") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            benchmark_params.n_threads = std::stoi(argv[i]);
-        } else if (arg == "-i" || arg == "--iter") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            benchmark_params.n_iterations = std::stoi(argv[i]);
-        }  else if (arg == "-h" || arg == "--help") {
-            print_usage(argc, argv, benchmark_params);
-            exit(0);
-        }
-    }
-    if (invalid_param) {
-        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
-        print_usage(argc, argv, benchmark_params);
-        exit(1);
-    }
-
-    print_build_info();
-    printf("Starting Test\n");
-
-    // create the ggml context
-    struct ggml_context * ctx;
-    //const int sizex = 4096;
-    //const int sizey = 11008;
-
-#undef VERBOSE_DEBUGGING
-#ifndef VERBOSE_DEBUGGING
-    const int sizey = 4096;
-    const int sizex = 11008;
-    const int sizez = 128;
-#else
-    /* Working - let's increase size */
-    const int sizey = 1;
-    const int sizex = (8*32);
-    const int sizez = 1;
-
-    /*const int sizey = 1;
-    const int sizex = 3*(8*32);
-    const int sizez = 1;*/
-#endif
-
-    //printf("Memsize required = %i\n", sizex*sizex);
-
-    // TODO: perform the bench for all types or for a user specified type
-    const ggml_type qtype = GGML_TYPE_Q4_1;
-
-    size_t ctx_size = 0;
-    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
-    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
-    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizez);
-    ctx_size += ggml_row_size(qtype,         sizex*sizey);
-    ctx_size += ggml_row_size(qtype,         sizex*sizey);
-    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
-    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
-    ctx_size += 1024*1024*16;
-
-    printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024));
-
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ ctx_size,
-        /*.mem_buffer =*/ NULL,
-        /* no_alloc   =*/ 0
-    };
-
-    ctx = ggml_init(params);
-    if (!ctx) {
-        fprintf(stderr, "%s: ggml_init() failed\n", __func__);
-        return 1;
-    }
-
-
-    printf("Creating new tensors\n");
-    // printf("Creating new tensor m1\n");
-    struct ggml_tensor * m11 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
-    ggml_set_f32(m11, 1.0f);
-
-    // printf("Creating new tensor m1\n");
-    struct ggml_tensor * m12 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
-    ggml_set_f32(m12, 1.5f);
-
-    // printf("Creating new tensor m2\n");
-    struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez);
-    ggml_set_f32(m2, 2.0f);
-
-    printf("\n------ Test 1 - Matrix Mult via F32 code\n");
-    // printf("Creating new tensor m11xm2\n");
-    struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);
-
-    // printf("Creating compute graph\n");
-    struct ggml_cgraph * gf = ggml_new_graph(ctx);
-    ggml_build_forward_expand(gf, m11xm2);
-
-    printf("n_threads=%i\n", benchmark_params.n_threads);
-
-    TENSOR_DUMP(m11);
-    TENSOR_DUMP(m2);
-
-    std::vector<uint8_t> work_buffer;
-
-    ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads);
-
-    TENSOR_DUMP(gf->nodes[0]);
-
-    printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));
-
-    int32_t nelements = sizex*sizey;
-
-    // Set up a the benchmark matrices
-    // printf("Creating new tensor q11 & Running quantize\n");
-    struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
-    ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements/m11->ne[0], m11->ne[0], nullptr);
-
-    // Set up a the compute graph
-    // printf("Creating new tensor q31\n");
-    struct ggml_tensor * q31 = ggml_mul_mat(ctx, q11, m2);
-
-    // printf("Creating compute graph\n");
-    struct ggml_cgraph * gf31 = ggml_new_graph(ctx);
-    ggml_build_forward_expand(gf31, q31);
-
-    // Set up a second graph computation to make sure we override the CPU cache lines
-    // printf("Creating new tensor q12 & Running quantize\n");
-    struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
-    ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements/m12->ne[0], m12->ne[0], nullptr);
-
-    // printf("Creating new tensor q32\n");
-    struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
-
-    //printf("Creating compute graph\n");
-    struct ggml_cgraph * gf32 = ggml_new_graph(ctx);
-    ggml_build_forward_expand(gf32, q32);
-    printf("n_threads=%i\n", benchmark_params.n_threads);
-
-    const int dimx = sizex;
-    const int dimy = sizey;
-    const int dimz = sizez;
-    long long int flops_per_dot_product = dimy + dimy;
-    long long int flops_per_matrix = flops_per_dot_product * dimx * dimz; ;
-    printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - about %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);
-
-
-    // Let's use the F32 result from above as a reference for the quantized multiplication
-    float sum_of_F32_reference = tensor_sum_elements(gf->nodes[0]);
-
-    printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
-    printf("=====================================================================================\n");
-
-    double  gflops_sum = 0;
-    for (int i=0;i<benchmark_params.n_iterations ;i++) {
-
-        long long int start = ggml_time_us();
-        //printf("Running ggml_graph_compute\n");
-        ggml_graph_compute_helper(work_buffer, gf31, benchmark_params.n_threads);
-
-        long long int stop = ggml_time_us();
-        long long int usec = stop-start;
-        double gflops = (double)(flops_per_matrix)/usec/1000.0;
-        gflops_sum += gflops;
-        printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%10.2f\n",
-            i,
-            benchmark_params.n_threads,
-            sizex, sizey, sizez, flops_per_matrix,
-            usec,gflops);
-
-#ifdef VERBOSE_DEBUGGING
-        TENSOR_DUMP("res",gf31.nodes[0])
-#endif
-
-        // Check that the matrix multiplication result is in the right ballpark
-        // We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different
-        float sum_of_Q4_result = tensor_sum_elements(gf31->nodes[0]);
-        float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference);
-        float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; //  Let's accept an epsilon of 10^-6
-
-        if (delta > allowed_delta)  {
-            printf("\nABORT - ERROR in Matrix Multiplication result - expected %6.2f, got %6.2f (delta %6.2f > allowed_delta %6.2f)\n",
-                sum_of_F32_reference,
-                sum_of_Q4_result,
-                delta,
-                allowed_delta
-            );
-            exit(0);
-        }
-
-        // Running a different graph computation to make sure we override the CPU cache lines
-        ggml_graph_compute_helper(work_buffer, gf32, benchmark_params.n_threads);
-    }
-    printf("\n");
-    printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations));
-    printf("=====================================================================================\n");
-}
--- a/examples/chat-persistent.sh
+++ b/examples/chat-persistent.sh
@@ -23,8 +23,9 @@ CUR_PROMPT_CACHE="${CHAT_SAVE_DIR}/current-cache.bin"
 NEXT_PROMPT_FILE="${CHAT_SAVE_DIR}/next-prompt.txt"
 NEXT_PROMPT_CACHE="${CHAT_SAVE_DIR}/next-cache.bin"

-SESSION_SIZE_MSG_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+'
-SAMPLE_TIME_MSG_PATTERN='sample time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+'
+SESSION_AND_SAMPLE_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+'\
+'|'\
+'sampling time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+'
 SED_DELETE_MESSAGES="/^(${USER_NAME}:|${AI_NAME}:|\\.\\.\\.)/,\$d"

 CTX_SIZE=2048
@@ -129,15 +130,12 @@ while read -e line; do

    printf ' '

-    # HACK get num tokens from debug message
-    # TODO get both messages in one go
-    if  ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" ||
-        ! sample_time_msg="$(tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then
+    if ! session_and_sample_msg=$(tail -n30 "$LOG" | grep -oE "$SESSION_AND_SAMPLE_PATTERN"); then
        echo >&2 "Couldn't get number of tokens from ./llama-cli output!"
        exit 1
    fi

-    n_tokens=$(($(cut -d/ -f2 <<<"$session_size_msg") + $(cut -d/ -f2 <<<"$sample_time_msg")))
+    n_tokens=$(awk '{sum+=$1} END {print sum}' <<< "$(cut -d/ -f2 <<< "$session_and_sample_msg")")

    if ((n_tokens > CTX_ROTATE_POINT)); then
        tail -c+$((n_prompt_len_pre + 1)) "$CUR_PROMPT_FILE" >>"$NEXT_PROMPT_FILE"
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -9,6 +9,7 @@
 #include <climits>
 #include <cstring>
 #include <cstdarg>
+#include <cinttypes>
 #include <ctime>
 #include <random>
 #include <stdexcept>
@@ -105,43 +106,43 @@ static void alloc_weights(TransformerWeights * w, const Config * p, bool shared_
    const int n_multiqueries = p->n_kv_heads <= 0 || p->n_kv_heads >= p->n_heads ? 1 : p->n_heads / p->n_kv_heads;
    try {
        w->token_embedding_table.resize(p->vocab_size * p->dim);
-        LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
+        LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);

        w->rms_att_weight.resize(p->n_layers * p->dim);
-        LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);
+        LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);

        w->rms_ffn_weight.resize(p->n_layers * p->dim);
-        LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);
+        LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);

        w->wq.resize(p->n_layers * p->dim * p->dim);
-        LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
+        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);

        w->wk.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
-        LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
+        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);

        w->wv.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
-        LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
+        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);

        w->wo.resize(p->n_layers * p->dim * p->dim);
-        LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
+        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);

        w->w1.resize(p->n_layers * p->hidden_dim * p->dim);
-        LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
+        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);

        w->w2.resize(p->n_layers * p->hidden_dim * p->dim);
-        LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);
+        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);

        w->w3.resize(p->n_layers * p->hidden_dim * p->dim);
-        LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
+        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);

        w->rms_final_weight.resize(p->dim);
-        LOG("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
+        LOG_INF("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);

        if (shared_weights) {
            w->wcls = {};
        } else {
            w->wcls.resize(p->vocab_size * p->dim);
-            LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
+            LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
        }
    }
    catch (std::length_error &) {
@@ -173,7 +174,7 @@ static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FIL
    fseek(f, 0, SEEK_END);
    auto end = ftell(f);
    if (curr != end) {
-        LOG("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end =  %ld)\n", __func__, curr, end);
+        LOG_ERR("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end =  %ld)\n", __func__, curr, end);
        return 1;
    }

@@ -181,26 +182,26 @@ static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FIL
 }

 static void print_sample_weights(TransformerWeights *w){
-    LOG("----- Quick print of first of the weight vales of all the variables\n");
-    LOG("%f\n", w->token_embedding_table[0]);
-    LOG("%f\n", w->rms_att_weight[0]);
-    LOG("%f\n", w->rms_ffn_weight[0]);
+    LOG_INF("----- Quick print of first of the weight vales of all the variables\n");
+    LOG_INF("%f\n", w->token_embedding_table[0]);
+    LOG_INF("%f\n", w->rms_att_weight[0]);
+    LOG_INF("%f\n", w->rms_ffn_weight[0]);

-    LOG("%f\n", w->wq[0]);
-    LOG("%f\n", w->wk[0]);
-    LOG("%f\n", w->wv[0]);
-    LOG("%f\n", w->wo[0]);
-    LOG("%f\n", w->w1[0]);
-    LOG("%f\n", w->w2[0]);
-    LOG("%f\n", w->w3[0]);
-    LOG("%f\n", w->rms_att_weight[0]);
-    if (!w->wcls.empty()) LOG("%f\n", w->wcls[0]);
+    LOG_INF("%f\n", w->wq[0]);
+    LOG_INF("%f\n", w->wk[0]);
+    LOG_INF("%f\n", w->wv[0]);
+    LOG_INF("%f\n", w->wo[0]);
+    LOG_INF("%f\n", w->w1[0]);
+    LOG_INF("%f\n", w->w2[0]);
+    LOG_INF("%f\n", w->w3[0]);
+    LOG_INF("%f\n", w->rms_att_weight[0]);
+    if (!w->wcls.empty()) LOG_INF("%f\n", w->wcls[0]);
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////

 //////////////////////////////////////// ggml structs and functions required to load models, configs and save the model.

-struct llama_vocab {
+struct my_llama_vocab {
    using id    = int32_t;
    using token = std::string;
    using ttype = llama_token_type;
@@ -318,20 +319,20 @@ struct train_params {
 };

 static void print_params(struct my_llama_hparams * params) {
-    LOG("%s: n_vocab:   %u\n", __func__, params->n_vocab);
-    LOG("%s: n_ctx:     %u\n", __func__, params->n_ctx);
-    LOG("%s: n_embd:    %u\n", __func__, params->n_embd);
-    LOG("%s: n_mult:    %u\n", __func__, params->n_mult);
-    LOG("%s: n_head:    %u\n", __func__, params->n_head);
-    LOG("%s: n_head_kv: %u\n", __func__, params->n_head_kv);
-    LOG("%s: n_ff:      %u\n", __func__, params->n_ff);
-    LOG("%s: n_layer:   %u\n", __func__, params->n_layer);
-    LOG("%s: n_rot:     %u\n", __func__, params->n_rot);
+    LOG_INF("%s: n_vocab:   %u\n", __func__, params->n_vocab);
+    LOG_INF("%s: n_ctx:     %u\n", __func__, params->n_ctx);
+    LOG_INF("%s: n_embd:    %u\n", __func__, params->n_embd);
+    LOG_INF("%s: n_mult:    %u\n", __func__, params->n_mult);
+    LOG_INF("%s: n_head:    %u\n", __func__, params->n_head);
+    LOG_INF("%s: n_head_kv: %u\n", __func__, params->n_head_kv);
+    LOG_INF("%s: n_ff:      %u\n", __func__, params->n_ff);
+    LOG_INF("%s: n_layer:   %u\n", __func__, params->n_layer);
+    LOG_INF("%s: n_rot:     %u\n", __func__, params->n_rot);
 }

 static void print_tensor_info(const struct ggml_context * ctx) {
    for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-        LOG("%s: Allocating ", __func__);
+        LOG_INF("%s: Allocating ", __func__);
        int64_t total = 1;
        int i = 0;
        for (; i < ggml_n_dims(t); ++i) {
@@ -524,9 +525,9 @@ static std::string llama_escape_whitespaces(const std::string & text) {
    return out.str();
 }

-static void load_vocab(const char * filename, const Config * config, struct llama_vocab * vocab) {
+static void load_vocab(const char * filename, const Config * config, struct my_llama_vocab * vocab) {
    if (is_ggml_file(filename)) {
-        LOG("%s: Loading vocabulary from gguf file %s\n", __func__, filename);
+        LOG_INF("%s: Loading vocabulary from gguf file %s\n", __func__, filename);
        struct ggml_context * ctx_data = NULL;

        struct gguf_init_params params = {
@@ -574,7 +575,7 @@ static void load_vocab(const char * filename, const Config * config, struct llam
        gguf_free(ctx);
    } else {
        // assume llama2.c vocabulary
-        LOG("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
+        LOG_INF("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
        llama_file file(filename, "rb");
        if (!file.fp) {
            die_fmt("%s: %s", strerror(errno), filename);
@@ -582,13 +583,13 @@ static void load_vocab(const char * filename, const Config * config, struct llam
        const int  n_vocab = config->vocab_size;
        /* uint32_t max_token_length =  */ file.read_u32(); // unused
        vocab->id_to_token.resize(n_vocab);
-        for (llama_vocab::id id=0; id<n_vocab; ++id) {
+        for (my_llama_vocab::id id=0; id<n_vocab; ++id) {
            float_t score = file.read_f32();
            uint32_t len = file.read_u32();
            std::string text = file.read_string(len);

            unsigned char byte_val;
-            llama_vocab::ttype type = LLAMA_TOKEN_TYPE_NORMAL;
+            my_llama_vocab::ttype type = LLAMA_TOKEN_TYPE_NORMAL;
            if (id == UNKNOWN_TOKEN_ID) {
                text = "<unk>";
                type = LLAMA_TOKEN_TYPE_UNKNOWN;
@@ -630,7 +631,7 @@ static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const floa
 }

 static void save_as_llama_model(
-    struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename
+    struct my_llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename
 ) {
    // convert AK weights into GG weights one by one.
    // w->token_embedding_table -> model->tok_embeddings
@@ -670,7 +671,7 @@ static void save_as_llama_model(
    std::vector<const char*> tokens;
    std::vector<float> scores;
    std::vector<llama_token_type> token_types;
-    for (const llama_vocab::token_data & token_data : vocab->id_to_token) {
+    for (const my_llama_vocab::token_data & token_data : vocab->id_to_token) {
        tokens.push_back(token_data.text.c_str());
        scores.push_back(token_data.score);
        token_types.push_back(token_data.type);
@@ -871,23 +872,25 @@ static std::string basename(const std::string &path) {
 }

 int main(int argc, char ** argv) {
+    common_init();
+
    struct train_params params = get_default_train_params();
    if (!params_parse(argc, argv, &params)) {
        return 1;
    }
-    log_set_target(stdout);
+
    Config config;
    TransformerWeights weights = {};
    {
-        LOG("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model);
+        LOG_INF("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model);
        FILE * file = fopen(params.fn_llama2c_model, "rb");
        if (!file) {
-            LOG("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model);
+            LOG_ERR("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model);
            return 1;
        }
        // read in the config header
        if (fread(&config, sizeof(Config), 1, file) != 1) {
-            LOG("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model);
+            LOG_ERR("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model);
            return 1;
        }
        auto shared_weights = config.vocab_size > 0;
@@ -896,13 +899,13 @@ int main(int argc, char ** argv) {
        // read in the Transformer weights
        alloc_weights(&weights, &config, shared_weights);
        if (checkpoint_init_weights(&weights, &config, file, shared_weights)) {
-            LOG("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model);
+            LOG_ERR("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model);
            return 1;
        }
        fclose(file);
    }

-    struct llama_vocab vocab;
+    struct my_llama_vocab vocab;
    load_vocab(params.fn_vocab_model, &config, &vocab);

    struct my_llama_model model;
@@ -929,7 +932,7 @@ int main(int argc, char ** argv) {
    model.name = basename(params.fn_llama2c_model);
    save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model);

-    LOG("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model);
+    LOG_INF("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model);

    ggml_free(model.ctx);
    return 0;
--- a/examples/convert_legacy_llama.py
+++ b/examples/convert_legacy_llama.py
@@ -840,6 +840,8 @@ class OutputFile:
                        self.gguf.add_base_model_version(key, base_model_entry["version"])
                    if "organization" in base_model_entry:
                        self.gguf.add_base_model_organization(key, base_model_entry["organization"])
+                    if "description" in base_model_entry:
+                        self.gguf.add_base_model_description(key, base_model_entry["description"])
                    if "url" in base_model_entry:
                        self.gguf.add_base_model_url(key, base_model_entry["url"])
                    if "doi" in base_model_entry:
@@ -849,12 +851,32 @@ class OutputFile:
                    if "repo_url" in base_model_entry:
                        self.gguf.add_base_model_repo_url(key, base_model_entry["repo_url"])

+            if metadata.datasets is not None:
+                self.gguf.add_dataset_count(len(metadata.datasets))
+                for key, dataset_entry in enumerate(metadata.datasets):
+                    if "name" in dataset_entry:
+                        self.gguf.add_dataset_name(key, dataset_entry["name"])
+                    if "author" in dataset_entry:
+                        self.gguf.add_dataset_author(key, dataset_entry["author"])
+                    if "version" in dataset_entry:
+                        self.gguf.add_dataset_version(key, dataset_entry["version"])
+                    if "organization" in dataset_entry:
+                        self.gguf.add_dataset_organization(key, dataset_entry["organization"])
+                    if "description" in dataset_entry:
+                        self.gguf.add_dataset_description(key, dataset_entry["description"])
+                    if "url" in dataset_entry:
+                        self.gguf.add_dataset_url(key, dataset_entry["url"])
+                    if "doi" in dataset_entry:
+                        self.gguf.add_dataset_doi(key, dataset_entry["doi"])
+                    if "uuid" in dataset_entry:
+                        self.gguf.add_dataset_uuid(key, dataset_entry["uuid"])
+                    if "repo_url" in dataset_entry:
+                        self.gguf.add_dataset_repo_url(key, dataset_entry["repo_url"])
+
            if metadata.tags is not None:
                self.gguf.add_tags(metadata.tags)
            if metadata.languages is not None:
                self.gguf.add_languages(metadata.languages)
-            if metadata.datasets is not None:
-                self.gguf.add_datasets(metadata.datasets)

    def add_meta_arch(self, params: Params) -> None:
        # Metadata About The Neural Architecture Itself
--- a/examples/cvector-generator/cvector-generator.cpp
+++ b/examples/cvector-generator/cvector-generator.cpp
@@ -1,3 +1,4 @@
+#include "arg.h"
 #include "common.h"
 #include "llama.h"
 #include "ggml.h"
@@ -12,14 +13,15 @@
 #include "ggml-metal.h"
 #endif

+#include <algorithm>
+#include <climits>
 #include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <iostream>
 #include <string>
 #include <tuple>
 #include <vector>
-#include <algorithm>
-#include <iostream>
-#include <fstream>
-#include <climits>


 //////////////////////////////////////////////////
@@ -29,15 +31,13 @@ template <class Iter>
 static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
    std::string ret;
    for (; begin != end; ++begin) {
-        ret += llama_token_to_piece(ctx, *begin);
+        ret += common_token_to_piece(ctx, *begin);
    }

    return ret;
 }

-static void print_usage(int argc, char ** argv, const gpt_params & params) {
-    gpt_params_print_usage(argc, argv, params);
-
+static void print_usage(int, char ** argv) {
    printf("\nexample usage:\n");
    printf("\n    CPU only:   %s -m ./llama-3.Q4_K_M.gguf\n", argv[0]);
    printf("\n    with GPU:   %s -m ./llama-3.Q4_K_M.gguf -ngl 99\n", argv[0]);
@@ -272,8 +272,8 @@ struct tokenized_prompt {

    tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
        const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
-        tokens_pos = ::llama_tokenize(ctx, pos, add_bos, true);
-        tokens_neg = ::llama_tokenize(ctx, neg, add_bos, true);
+        tokens_pos = common_tokenize(ctx, pos, add_bos, true);
+        tokens_neg = common_tokenize(ctx, neg, add_bos, true);
        max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
        padding_seq(ctx, tokens_pos, max_seq_len);
        padding_seq(ctx, tokens_neg, max_seq_len);
@@ -281,7 +281,7 @@ struct tokenized_prompt {

    void padding_seq(llama_context * ctx, std::vector<llama_token> & tokens, size_t len) {
        // TODO: customize padding token
-        std::vector<llama_token> pad_tokens = ::llama_tokenize(ctx, " ", false);
+        std::vector<llama_token> pad_tokens = common_tokenize(ctx, " ", false);
        llama_token pad_tok = pad_tokens.back();
        while (tokens.size() < len) {
            tokens.push_back(pad_tok);
@@ -339,7 +339,7 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {

 static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
    llama_kv_cache_clear(ctx);
-    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
+    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
        fprintf(stderr, "%s : failed to eval\n", __func__);
        return false;
    }
@@ -370,7 +370,7 @@ static void export_gguf(const std::vector<struct ggml_tensor *> & v_ctrl, const
 * Load prompt files and completion file.
 * Then format each pair of prompt + completion to make an entry.
 */
-static int prepare_entries(gpt_params & params, train_context & ctx_train) {
+static int prepare_entries(common_params & params, train_context & ctx_train) {
    // load prompts
    std::vector<std::string> positive_prompts = ctrlvec_load_prompt_file(params.cvector_positive_file, true);
    std::vector<std::string> negative_prompts = ctrlvec_load_prompt_file(params.cvector_negative_file, true);
@@ -388,10 +388,9 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) {
 }

 int main(int argc, char ** argv) {
-    gpt_params params;
+    common_params params;

-    if (!gpt_params_parse(argc, argv, params)) {
-        print_usage(argc, argv, params);
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) {
        return 1;
    }

@@ -414,7 +413,7 @@ int main(int argc, char ** argv) {
    llama_numa_init(params.numa);

    // load the model to get hparams
-    llama_init_result llama_init = llama_init_from_gpt_params(params);
+    common_init_result llama_init = common_init_from_params(params);

    llama_model * model = llama_init.model;
    llama_context * ctx = llama_init.context;
@@ -486,8 +485,8 @@ int main(int argc, char ** argv) {
    if (use_pca) {
        // run PCA
        PCA::pca_params pca_params;
-        pca_params.n_threads = params.n_threads;
-        pca_params.n_batch = params.n_pca_batch;
+        pca_params.n_threads    = params.cpuparams.n_threads;
+        pca_params.n_batch      = params.n_pca_batch;
        pca_params.n_iterations = params.n_pca_iterations;
        PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
    } else {
--- a/examples/cvector-generator/pca.hpp
+++ b/examples/cvector-generator/pca.hpp
@@ -12,12 +12,9 @@

 #include <cstdio>
 #include <ctime>
+#include <random>
 #include <string>
-#include <tuple>
 #include <vector>
-#include <algorithm>
-#include <iostream>
-#include <fstream>

 #define DEBUG_POS 5

@@ -207,13 +204,6 @@ static ggml_status compute_piter(
        ggml_backend_cpu_set_n_threads(model.backend, params.n_threads);
    }

-// TODO: enable GPU support when support for GGML_OP_SQRT is added
-//#ifdef GGML_USE_METAL
-//    if (ggml_backend_is_metal(model.backend)) {
-//        ggml_backend_metal_set_n_cb(model.backend, params.n_threads);
-//    }
-//#endif
-
    ggml_status res = ggml_backend_graph_compute(model.backend, gf);
    if (res == GGML_STATUS_SUCCESS) {
        auto extract_i = [](std::string prefix, std::string str) -> int {
@@ -229,8 +219,8 @@ static ggml_status compute_piter(
        result.eigenvectors.resize(params.n_batch);
        result.distances.resize(params.n_batch);
        // get output nodes
-        for (int i = 0; i < gf->n_nodes; ++i) {
-            auto node = gf->nodes[i];
+        for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
+            auto node = ggml_graph_node(gf, i);
            int iter = -1;
            // find b_tensor (without copying data from device)
            if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) {
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -1,4 +1,6 @@
+#include "arg.h"
 #include "common.h"
+#include "log.h"
 #include "llama.h"

 #include <ctime>
@@ -26,7 +28,7 @@ static std::vector<std::string> split_lines(const std::string & s, const std::st
 static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
    size_t n_tokens = tokens.size();
    for (size_t i = 0; i < n_tokens; i++) {
-        llama_batch_add(batch, tokens[i], i, { seq_id }, true);
+        common_batch_add(batch, tokens[i], i, { seq_id }, true);
    }
 }

@@ -38,16 +40,16 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
    llama_kv_cache_clear(ctx);

    // run model
-    fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
+    LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
    if (llama_model_has_encoder(model) && !llama_model_has_decoder(model)) {
        // encoder-only model
        if (llama_encode(ctx, batch) < 0) {
-            fprintf(stderr, "%s : failed to encode\n", __func__);
+            LOG_ERR("%s : failed to encode\n", __func__);
        }
    } else if (!llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
        // decoder-only model
        if (llama_decode(ctx, batch) < 0) {
-            fprintf(stderr, "%s : failed to decode\n", __func__);
+            LOG_ERR("%s : failed to decode\n", __func__);
        }
    }

@@ -72,42 +74,33 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
        }

        float * out = output + embd_pos * n_embd;
-        llama_embd_normalize(embd, out, n_embd, embd_norm);
+        common_embd_normalize(embd, out, n_embd, embd_norm);
    }
 }

 int main(int argc, char ** argv) {
-    gpt_params params;
+    common_params params;

-    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) {
        return 1;
    }

+    common_init();
+
    params.embedding = true;
    // For non-causal models, batch size must be equal to ubatch size
    params.n_ubatch = params.n_batch;

-    print_build_info();
-
-    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = time(NULL);
-    }
-
-    fprintf(stderr, "%s: seed  = %u\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
-
    llama_backend_init();
    llama_numa_init(params.numa);

    // load the model
-    llama_init_result llama_init = llama_init_from_gpt_params(params);
+    common_init_result llama_init = common_init_from_params(params);

    llama_model * model = llama_init.model;
    llama_context * ctx = llama_init.context;
    if (model == NULL) {
-        fprintf(stderr, "%s: error: unable to load model\n", __func__);
+        LOG_ERR("%s: unable to load model\n", __func__);
        return 1;
    }

@@ -117,19 +110,19 @@ int main(int argc, char ** argv) {
    const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);

    if (llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
-        fprintf(stderr, "%s: error: computing embeddings in encoder-decoder models is not supported\n", __func__);
+        LOG_ERR("%s: computing embeddings in encoder-decoder models is not supported\n", __func__);
        return 1;
    }

    if (n_ctx > n_ctx_train) {
-        fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
+        LOG_WRN("%s: warning: model was trained on only %d context tokens (%d specified)\n",
                __func__, n_ctx_train, n_ctx);
    }

    // print system information
    {
-        fprintf(stderr, "\n");
-        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
+        LOG_INF("\n");
+        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
    }

    // split the prompt into lines
@@ -142,9 +135,9 @@ int main(int argc, char ** argv) {
    // tokenize the prompts and trim
    std::vector<std::vector<int32_t>> inputs;
    for (const auto & prompt : prompts) {
-        auto inp = ::llama_tokenize(ctx, prompt, true, false);
+        auto inp = common_tokenize(ctx, prompt, true, true);
        if (inp.size() > n_batch) {
-            fprintf(stderr, "%s: error: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
+            LOG_ERR("%s: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
                    __func__, (long long int) inp.size(), (long long int) n_batch);
            return 1;
        }
@@ -155,20 +148,20 @@ int main(int argc, char ** argv) {
    // it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true'
    for (auto & inp : inputs) {
        if (inp.empty() || inp.back() != llama_token_sep(model)) {
-            fprintf(stderr, "%s: warning: last token in the prompt is not SEP\n", __func__);
-            fprintf(stderr, "%s:          'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
+            LOG_WRN("%s: last token in the prompt is not SEP\n", __func__);
+            LOG_WRN("%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
        }
    }

    // tokenization stats
    if (params.verbose_prompt) {
        for (int i = 0; i < (int) inputs.size(); i++) {
-            fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str());
-            fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size());
+            LOG_INF("%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str());
+            LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size());
            for (int j = 0; j < (int) inputs[i].size(); j++) {
-                fprintf(stderr, "%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str());
+                LOG("%6d -> '%s'\n", inputs[i][j], common_token_to_piece(ctx, inputs[i][j]).c_str());
            }
-            fprintf(stderr, "\n\n");
+            LOG("\n\n");
        }
    }

@@ -206,7 +199,7 @@ int main(int argc, char ** argv) {
            batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
            e += pooling_type == LLAMA_POOLING_TYPE_NONE ? batch.n_tokens : s;
            s = 0;
-            llama_batch_clear(batch);
+            common_batch_clear(batch);
        }

        // add to batch
@@ -219,57 +212,62 @@ int main(int argc, char ** argv) {
    batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);

    if (params.embd_out.empty()) {
-        fprintf(stdout, "\n");
+        LOG("\n");

        if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
            for (int j = 0; j < n_embd_count; j++) {
-                fprintf(stdout, "embedding %d: ", j);
+                LOG("embedding %d: ", j);
                for (int i = 0; i < std::min(3, n_embd); i++) {
                    if (params.embd_normalize == 0) {
-                        fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
+                        LOG("%6.0f ", emb[j * n_embd + i]);
                    } else {
-                        fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
+                        LOG("%9.6f ", emb[j * n_embd + i]);
                    }
                }
-                fprintf(stdout, " ... ");
+                LOG(" ... ");
                for (int i = n_embd - 3; i < n_embd; i++) {
                    if (params.embd_normalize == 0) {
-                        fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
+                        LOG("%6.0f ", emb[j * n_embd + i]);
                    } else {
-                        fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
+                        LOG("%9.6f ", emb[j * n_embd + i]);
                    }
                }
-                fprintf(stdout, "\n");
+                LOG("\n");
+            }
+        } else if (pooling_type == LLAMA_POOLING_TYPE_RANK) {
+            for (int j = 0; j < n_embd_count; j++) {
+                // NOTE: if you change this log - update the tests in ci/run.sh
+                LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd]);
            }
        } else {
            // print the first part of the embeddings or for a single prompt, the full embedding
            for (int j = 0; j < n_prompts; j++) {
-                fprintf(stdout, "embedding %d: ", j);
+                LOG("embedding %d: ", j);
                for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
                    if (params.embd_normalize == 0) {
-                        fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
+                        LOG("%6.0f ", emb[j * n_embd + i]);
                    } else {
-                        fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
+                        LOG("%9.6f ", emb[j * n_embd + i]);
                    }
                }
-                fprintf(stdout, "\n");
+                LOG("\n");
            }

            // print cosine similarity matrix
            if (n_prompts > 1) {
-                fprintf(stdout, "\n");
-                printf("cosine similarity matrix:\n\n");
+                LOG("\n");
+                LOG("cosine similarity matrix:\n\n");
                for (int i = 0; i < n_prompts; i++) {
-                    fprintf(stdout, "%6.6s ", prompts[i].c_str());
+                    LOG("%6.6s ", prompts[i].c_str());
                }
-                fprintf(stdout, "\n");
+                LOG("\n");
                for (int i = 0; i < n_prompts; i++) {
                    for (int j = 0; j < n_prompts; j++) {
-                        float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
-                        fprintf(stdout, "%6.2f ", sim);
+                        float sim = common_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+                        LOG("%6.2f ", sim);
                    }
-                    fprintf(stdout, "%1.10s", prompts[i].c_str());
-                    fprintf(stdout, "\n");
+                    LOG("%1.10s", prompts[i].c_str());
+                    LOG("\n");
                }
            }
        }
@@ -278,43 +276,45 @@ int main(int argc, char ** argv) {
    if (params.embd_out == "json" || params.embd_out == "json+" || params.embd_out == "array") {
        const bool notArray = params.embd_out != "array";

-        fprintf(stdout, notArray ? "{\n  \"object\": \"list\",\n  \"data\": [\n" : "[");
+        LOG(notArray ? "{\n  \"object\": \"list\",\n  \"data\": [\n" : "[");
        for (int j = 0;;) { // at least one iteration (one prompt)
-            if (notArray) fprintf(stdout, "    {\n      \"object\": \"embedding\",\n      \"index\": %d,\n      \"embedding\": ",j);
-            fprintf(stdout, "[");
+            if (notArray) LOG("    {\n      \"object\": \"embedding\",\n      \"index\": %d,\n      \"embedding\": ",j);
+            LOG("[");
            for (int i = 0;;) { // at least one iteration (n_embd > 0)
-                fprintf(stdout, params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
+                LOG(params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
                i++;
-                if (i < n_embd) fprintf(stdout, ","); else break;
+                if (i < n_embd) LOG(","); else break;
            }
-            fprintf(stdout, notArray ? "]\n    }" : "]");
+            LOG(notArray ? "]\n    }" : "]");
            j++;
-            if (j < n_embd_count) fprintf(stdout, notArray ? ",\n" : ","); else break;
+            if (j < n_embd_count) LOG(notArray ? ",\n" : ","); else break;
        }
-        fprintf(stdout, notArray ? "\n  ]" : "]\n");
+        LOG(notArray ? "\n  ]" : "]\n");

        if (params.embd_out == "json+" && n_prompts > 1) {
-            fprintf(stdout, ",\n  \"cosineSimilarity\": [\n");
+            LOG(",\n  \"cosineSimilarity\": [\n");
            for (int i = 0;;) { // at least two iteration (n_embd_count > 1)
-                fprintf(stdout, "    [");
+                LOG("    [");
                for (int j = 0;;) { // at least two iteration (n_embd_count > 1)
-                    float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
-                    fprintf(stdout, "%6.2f", sim);
+                    float sim = common_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+                    LOG("%6.2f", sim);
                    j++;
-                    if (j < n_embd_count) fprintf(stdout, ", "); else break;
+                    if (j < n_embd_count) LOG(", "); else break;
                }
-                fprintf(stdout, " ]");
+                LOG(" ]");
                i++;
-                if (i < n_embd_count) fprintf(stdout, ",\n"); else break;
+                if (i < n_embd_count) LOG(",\n"); else break;
            }
-            fprintf(stdout, "\n  ]");
+            LOG("\n  ]");
        }

-        if (notArray) fprintf(stdout, "\n}\n");
+        if (notArray) LOG("\n}\n");
    }

+    LOG("\n");
+    llama_perf_context_print(ctx);
+
    // clean up
-    llama_print_timings(ctx);
    llama_batch_free(batch);
    llama_free(ctx);
    llama_free_model(model);
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@@ -1,11 +1,11 @@
+#include "arg.h"
 #include "common.h"
+#include "log.h"
 #include "llama.h"
 #include "ggml.h"

 #include <cstdio>
-#include <random>
 #include <string>
-#include <tuple>
 #include <vector>

 /**
@@ -31,22 +31,22 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
    GGML_ASSERT(n > 0);
    float sum = 0;
    for (int64_t i3 = 0; i3 < ne[3]; i3++) {
-        printf("                                     [\n");
+        LOG("                                     [\n");
        for (int64_t i2 = 0; i2 < ne[2]; i2++) {
            if (i2 == n && ne[2] > 2*n) {
-                printf("                                      ..., \n");
+                LOG("                                      ..., \n");
                i2 = ne[2] - n;
            }
-            printf("                                      [\n");
+            LOG("                                      [\n");
            for (int64_t i1 = 0; i1 < ne[1]; i1++) {
                if (i1 == n && ne[1] > 2*n) {
-                    printf("                                       ..., \n");
+                    LOG("                                       ..., \n");
                    i1 = ne[1] - n;
                }
-                printf("                                       [");
+                LOG("                                       [");
                for (int64_t i0 = 0; i0 < ne[0]; i0++) {
                    if (i0 == n && ne[0] > 2*n) {
-                        printf("..., ");
+                        LOG("..., ");
                        i0 = ne[0] - n;
                    }
                    size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
@@ -64,16 +64,16 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
                    } else {
                        GGML_ABORT("fatal error");
                    }
-                    printf("%12.4f", v);
+                    LOG("%12.4f", v);
                    sum += v;
-                    if (i0 < ne[0] - 1) printf(", ");
+                    if (i0 < ne[0] - 1) LOG(", ");
                }
-                printf("],\n");
+                LOG("],\n");
            }
-            printf("                                      ],\n");
+            LOG("                                      ],\n");
        }
-        printf("                                     ]\n");
-        printf("                                     sum = %f\n", sum);
+        LOG("                                     ]\n");
+        LOG("                                     sum = %f\n", sum);
    }
 }

@@ -102,11 +102,11 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
        snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
    }

-    printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
-           t->name, ggml_type_name(t->type), ggml_op_desc(t),
-           src0->name, ggml_ne_string(src0).c_str(),
-           src1 ? src1_str : "",
-           ggml_ne_string(t).c_str());
+    LOG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
+         t->name, ggml_type_name(t->type), ggml_op_desc(t),
+         src0->name, ggml_ne_string(src0).c_str(),
+         src1 ? src1_str : "",
+         ggml_ne_string(t).c_str());


    // copy the data from the GPU memory if needed
@@ -126,13 +126,13 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
    return true;
 }

-static bool run(llama_context * ctx, const gpt_params & params) {
+static bool run(llama_context * ctx, const common_params & params) {
    const bool add_bos = llama_add_bos_token(llama_get_model(ctx));

-    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
+    std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos);

-    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
-        fprintf(stderr, "%s : failed to eval\n", __func__);
+    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
+        LOG_ERR("%s : failed to eval\n", __func__);
        return false;
    }

@@ -142,16 +142,13 @@ static bool run(llama_context * ctx, const gpt_params & params) {
 int main(int argc, char ** argv) {
    callback_data cb_data;

-    gpt_params params;
+    common_params params;

-    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
        return 1;
    }

-    print_build_info();
-
-    std::mt19937 rng(params.seed);
+    common_init();

    llama_backend_init();
    llama_numa_init(params.numa);
@@ -163,19 +160,20 @@ int main(int argc, char ** argv) {
    params.warmup = false;

    // init
-    llama_init_result llama_init = llama_init_from_gpt_params(params);
+    common_init_result llama_init = common_init_from_params(params);

    llama_model * model = llama_init.model;
    llama_context * ctx = llama_init.context;
    if (model == nullptr || ctx == nullptr) {
-        fprintf(stderr, "%s : failed to init\n", __func__);
+        LOG_ERR("%s : failed to init\n", __func__);
        return 1;
    }

    // print system information
    {
-        fprintf(stderr, "\n");
-        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
+        LOG_INF("\n");
+        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
+        LOG_INF("\n");
    }

    bool OK = run(ctx, params);
@@ -183,7 +181,8 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    llama_print_timings(ctx);
+    LOG("\n");
+    llama_perf_context_print(ctx);

    llama_free(ctx);
    llama_free_model(model);
--- a/examples/export-lora/export-lora.cpp
+++ b/examples/export-lora/export-lora.cpp
@@ -1,3 +1,4 @@
+#include "arg.h"
 #include "common.h"
 #include "ggml.h"
 #include "ggml-alloc.h"
@@ -127,7 +128,7 @@ struct lora_merge_ctx {

    lora_merge_ctx(
            std::string & base_fname,
-            std::vector<llama_lora_adapter_info> & lora_files,
+            std::vector<common_lora_adapter_info> & lora_files,
            std::string & outfile,
            int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) {
        fout.exceptions(std::ofstream::failbit); // fail fast on write errors
@@ -313,9 +314,9 @@ struct lora_merge_ctx {
            // optionally dequantize it
            printf("%s :   + dequantize base tensor from %s to F32\n", __func__, ggml_type_name(base->type));
            auto nels = ggml_nelements(inp_base);
-            ggml_type_traits_t qtype = ggml_internal_get_type_traits(base->type);
+            const auto * qtype = ggml_get_type_traits(base->type);
            std::vector<uint8_t> dequant_buf(nels * sizeof(float));
-            qtype.to_float(read_buf.data(), (float *)dequant_buf.data(), nels);
+            qtype->to_float(read_buf.data(), (float *)dequant_buf.data(), nels);
            ggml_backend_tensor_set(inp_base, dequant_buf.data(), 0, dequant_buf.size());
        } else {
            ggml_backend_tensor_set(inp_base, read_buf.data(), 0, ggml_nbytes(inp_base));
@@ -369,7 +370,7 @@ struct lora_merge_ctx {

        // write data to output file
        {
-            auto result = gf->nodes[gf->n_nodes - 1];
+            auto * result = ggml_graph_node(gf, -1);
            size_t len = ggml_nbytes(result);
            if (read_buf.size() < len) {
                read_buf.resize(len);
@@ -391,9 +392,7 @@ struct lora_merge_ctx {
    }
 };

-static void print_usage(int argc, char ** argv, const gpt_params & params) {
-    gpt_params_print_usage(argc, argv, params);
-
+static void print_usage(int, char ** argv) {
    printf("\nexample usage:\n");
    printf("\n  %s -m base-model.gguf --lora lora-file.gguf -o merged-model-f16.gguf\n", argv[0]);
    printf("\nNOTE: output model is F16\n");
@@ -401,16 +400,15 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) {
 }

 int main(int argc, char ** argv) {
-    gpt_params params;
+    common_params params;

-    if (!gpt_params_parse(argc, argv, params)) {
-        print_usage(argc, argv, params);
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) {
        return 1;
    }

-    g_verbose = (params.verbosity == 1);
+    g_verbose = (params.verbosity > 1);
    try {
-        lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.n_threads);
+        lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.cpuparams.n_threads);
        ctx.run_merge();
    } catch (const std::exception & err) {
        fprintf(stderr, "%s\n", err.what());
--- a/examples/gbnf-validator/gbnf-validator.cpp
+++ b/examples/gbnf-validator/gbnf-validator.cpp
@@ -1,9 +1,5 @@
-#define LLAMA_API_INTERNAL
-
-#include "grammar-parser.h"
-#include "ggml.h"
-#include "llama.h"
 #include "unicode.h"
+#include "llama-grammar.h"

 #include <cstdio>
 #include <cstdlib>
@@ -12,29 +8,28 @@
 #include <string>
 #include <vector>

-static bool llama_sample_grammar_string(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) {
-    auto decoded = decode_utf8(input_str, {});
-    const auto & code_points = decoded.first;
+static bool llama_grammar_validate(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) {
+    const auto cpts = unicode_cpts_from_utf8(input_str);

    const llama_grammar_rules  & rules      = llama_grammar_get_rules (grammar);
-          llama_grammar_stacks & cur_stacks = llama_grammar_get_stacks(grammar);
+          llama_grammar_stacks & stacks_cur = llama_grammar_get_stacks(grammar);

    size_t pos = 0;
-    for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
-        const llama_grammar_stacks prev_stacks = llama_grammar_get_stacks(grammar); // copy
+    for (const auto & cpt : cpts) {
+        const llama_grammar_stacks stacks_prev = llama_grammar_get_stacks(grammar); // copy

-        llama_grammar_accept(rules, prev_stacks, *it, cur_stacks);
+        llama_grammar_accept(rules, stacks_prev, cpt, stacks_cur);

-        if (cur_stacks.empty()) {
+        if (stacks_cur.empty()) {
            error_pos = pos;
-            error_msg = "Unexpected character '" + unicode_cpt_to_utf8(*it) + "'";
-            cur_stacks = prev_stacks;
+            error_msg = "Unexpected character '" + unicode_cpt_to_utf8(cpt) + "'";
+            stacks_cur = stacks_prev;
            return false;
        }
        ++pos;
    }

-    for (const auto & stack : cur_stacks) {
+    for (const auto & stack : stacks_cur) {
        if (stack.empty()) {
            return true;
        }
@@ -85,27 +80,7 @@ int main(int argc, char** argv) {
        grammar_str = buffer.str();
    }

-    // Parse the GBNF grammar
-    auto parsed_grammar = grammar_parser::parse(grammar_str.c_str());
-
-    // will be empty (default) if there are parse errors
-    if (parsed_grammar.rules.empty()) {
-        fprintf(stdout, "%s: failed to parse grammar\n", __func__);
-        return 1;
-    }
-
-    // Ensure that there is a "root" node.
-    if (parsed_grammar.symbol_ids.find("root") == parsed_grammar.symbol_ids.end()) {
-        fprintf(stdout, "%s: grammar does not contain a 'root' symbol\n", __func__);
-        return 1;
-    }
-
-    std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
-
-    // Create the LLAMA grammar
-    auto grammar = llama_grammar_init(
-            grammar_rules.data(),
-            grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
+    llama_grammar * grammar = llama_grammar_init_impl(nullptr, grammar_str.c_str(), "root");
    if (grammar == nullptr) {
        throw std::runtime_error("Failed to initialize llama_grammar");
    }
@@ -122,7 +97,7 @@ int main(int argc, char** argv) {
    // Validate the input string against the grammar
    size_t error_pos;
    std::string error_msg;
-    bool is_valid = llama_sample_grammar_string(grammar, input_str, error_pos, error_msg);
+    bool is_valid = llama_grammar_validate(grammar, input_str, error_pos, error_msg);

    if (is_valid) {
        fprintf(stdout, "Input string is valid according to the grammar.\n");
@@ -131,7 +106,7 @@ int main(int argc, char** argv) {
    }

    // Clean up
-    llama_grammar_free(grammar);
+    llama_grammar_free_impl(grammar);

    return 0;
 }
--- a/examples/baby-llama/CMakeLists.txt
+++ b/examples/baby-llama/CMakeLists.txt
@@ -1,5 +1,5 @@
-set(TARGET llama-baby-llama)
-add_executable(${TARGET} baby-llama.cpp)
+set(TARGET llama-gen-docs)
+add_executable(${TARGET} gen-docs.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/gen-docs/gen-docs.cpp
+++ b/examples/gen-docs/gen-docs.cpp
@@ -0,0 +1,83 @@
+#include "arg.h"
+#include "common.h"
+
+#include <fstream>
+#include <string>
+
+// Export usage message (-h) to markdown format
+
+static void write_table_header(std::ofstream & file) {
+    file << "| Argument | Explanation |\n";
+    file << "| -------- | ----------- |\n";
+}
+
+static void write_table_entry(std::ofstream & file, const common_arg & opt) {
+    file << "| `";
+    // args
+    for (const auto & arg : opt.args) {
+    if (arg == opt.args.front()) {
+            file << arg;
+            if (opt.args.size() > 1) file << ", ";
+        } else {
+            file << arg << (arg != opt.args.back() ? ", " : "");
+        }
+    }
+    // value hint
+    if (opt.value_hint) {
+        std::string md_value_hint(opt.value_hint);
+        string_replace_all(md_value_hint, "|", "\\|");
+        file << " " << md_value_hint;
+    }
+    if (opt.value_hint_2) {
+        std::string md_value_hint_2(opt.value_hint_2);
+        string_replace_all(md_value_hint_2, "|", "\\|");
+        file << " " << md_value_hint_2;
+    }
+    // help text
+    std::string md_help(opt.help);
+    string_replace_all(md_help, "\n", "<br/>");
+    string_replace_all(md_help, "|", "\\|");
+    file << "` | " << md_help << " |\n";
+}
+
+static void write_table(std::ofstream & file, std::vector<common_arg *> & opts) {
+    write_table_header(file);
+    for (const auto & opt : opts) {
+        write_table_entry(file, *opt);
+    }
+}
+
+static void export_md(std::string fname, llama_example ex) {
+    std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc);
+
+    common_params params;
+    auto ctx_arg = common_params_parser_init(params, ex);
+
+    std::vector<common_arg *> common_options;
+    std::vector<common_arg *> sparam_options;
+    std::vector<common_arg *> specific_options;
+    for (auto & opt : ctx_arg.options) {
+        // in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example
+        if (opt.is_sparam) {
+            sparam_options.push_back(&opt);
+        } else if (opt.in_example(ctx_arg.ex)) {
+            specific_options.push_back(&opt);
+        } else {
+            common_options.push_back(&opt);
+        }
+    }
+
+    file << "**Common params**\n\n";
+    write_table(file, common_options);
+    file << "\n\n**Sampling params**\n\n";
+    write_table(file, sparam_options);
+    file << "\n\n**Example-specific params**\n\n";
+    write_table(file, specific_options);
+}
+
+int main(int, char **) {
+    export_md("autogen-main.md", LLAMA_EXAMPLE_MAIN);
+    export_md("autogen-server.md", LLAMA_EXAMPLE_SERVER);
+
+    return 0;
+}
--- a/examples/gguf-split/gguf-split.cpp
+++ b/examples/gguf-split/gguf-split.cpp
@@ -22,12 +22,20 @@
 #endif

 enum split_operation : uint8_t {
-    SPLIT_OP_SPLIT,
-    SPLIT_OP_MERGE,
+    OP_NONE,
+    OP_SPLIT,
+    OP_MERGE,
+};
+
+enum split_mode : uint8_t {
+    MODE_NONE,
+    MODE_TENSOR,
+    MODE_SIZE,
 };

 struct split_params {
-    split_operation operation = SPLIT_OP_SPLIT;
+    split_operation operation = OP_NONE;
+    split_mode mode = MODE_NONE;
    size_t n_bytes_split = 0;
    int n_split_tensors = 128;
    std::string input;
@@ -87,59 +95,52 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p
        }

        bool arg_found = false;
-        bool is_op_set = false;
-        bool is_mode_set = false;
        if (arg == "-h" || arg == "--help") {
            split_print_usage(argv[0]);
            exit(0);
-        }
-        if (arg == "--version") {
+        } else if (arg == "--version") {
            fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
            fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
            exit(0);
-        }
-        if (arg == "--dry-run") {
+        } else if (arg == "--dry-run") {
            arg_found = true;
            params.dry_run = true;
-        }
-        if (arg == "--no-tensor-first-split") {
+        } else if (arg == "--no-tensor-first-split") {
            arg_found = true;
            params.no_tensor_first_split = true;
-        }
-
-        if (is_op_set) {
-            throw std::invalid_argument("error: either --split or --merge can be specified, but not both");
-        }
-        if (arg == "--merge") {
+        } else if (arg == "--merge") {
            arg_found = true;
-            is_op_set = true;
-            params.operation = SPLIT_OP_MERGE;
-        }
-        if (arg == "--split") {
+            if (params.operation != OP_NONE && params.operation != OP_MERGE) {
+                throw std::invalid_argument("error: either --split or --merge can be specified, but not both");
+            }
+            params.operation = OP_MERGE;
+        } else if (arg == "--split") {
            arg_found = true;
-            is_op_set = true;
-            params.operation = SPLIT_OP_SPLIT;
-        }
-
-        if (is_mode_set) {
-            throw std::invalid_argument("error: either --split-max-tensors or --split-max-size can be specified, but not both");
-        }
-        if (arg == "--split-max-tensors") {
+            if (params.operation != OP_NONE && params.operation != OP_SPLIT) {
+                throw std::invalid_argument("error: either --split or --merge can be specified, but not both");
+            }
+            params.operation = OP_SPLIT;
+        } else if (arg == "--split-max-tensors") {
            if (++arg_idx >= argc) {
                invalid_param = true;
                break;
            }
            arg_found = true;
-            is_mode_set = true;
+            if (params.mode != MODE_NONE && params.mode != MODE_TENSOR) {
+                throw std::invalid_argument("error: either --split-max-tensors or --split-max-size can be specified, but not both");
+            }
+            params.mode = MODE_TENSOR;
            params.n_split_tensors = atoi(argv[arg_idx]);
-        }
-        if (arg == "--split-max-size") {
+        } else if (arg == "--split-max-size") {
            if (++arg_idx >= argc) {
                invalid_param = true;
                break;
            }
            arg_found = true;
-            is_mode_set = true;
+            if (params.mode != MODE_NONE && params.mode != MODE_SIZE) {
+                throw std::invalid_argument("error: either --split-max-tensors or --split-max-size can be specified, but not both");
+            }
+            params.mode = MODE_SIZE;
            params.n_bytes_split = split_str_to_n_bytes(argv[arg_idx]);
        }

@@ -148,11 +149,20 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p
        }
    }

+    // the operation is split if not specified
+    if (params.operation == OP_NONE) {
+        params.operation = OP_SPLIT;
+    }
+    // the split mode is by tensor if not specified
+    if (params.mode == MODE_NONE) {
+        params.mode = MODE_TENSOR;
+    }
+
    if (invalid_param) {
        throw std::invalid_argument("error: invalid parameter for argument: " + arg);
    }

-    if (argc - arg_idx < 2) {
+    if (argc - arg_idx != 2) {
        throw std::invalid_argument("error: bad arguments");
    }

@@ -265,13 +275,15 @@ struct split_strategy {
    }

    bool should_split(int i_tensor, size_t next_size) {
-        if (params.n_bytes_split > 0) {
+        if (params.mode == MODE_SIZE) {
            // split by max size per file
            return next_size > params.n_bytes_split;
-        } else {
+        } else if (params.mode == MODE_TENSOR) {
            // split by number of tensors per file
            return i_tensor > 0 && i_tensor < n_tensors && i_tensor % params.n_split_tensors == 0;
        }
+        // should never happen
+        GGML_ABORT("invalid mode");
    }

    void print_info() {
@@ -389,10 +401,17 @@ static void gguf_merge(const split_params & split_params) {
    int n_split = 1;
    int total_tensors = 0;

-    auto * ctx_out = gguf_init_empty();
+    // avoid overwriting existing output file
+    if (std::ifstream(split_params.output.c_str())) {
+        fprintf(stderr, "%s: output file %s already exists\n", __func__, split_params.output.c_str());
+        exit(EXIT_FAILURE);
+    }
+
    std::ofstream fout(split_params.output.c_str(), std::ios::binary);
    fout.exceptions(std::ofstream::failbit); // fail fast on write errors

+    auto * ctx_out = gguf_init_empty();
+
    std::vector<uint8_t> read_data;
    std::vector<ggml_context *> ctx_metas;
    std::vector<gguf_context *> ctx_ggufs;
@@ -552,9 +571,9 @@ int main(int argc, const char ** argv) {
    split_params_parse(argc, argv, params);

    switch (params.operation) {
-        case SPLIT_OP_SPLIT: gguf_split(params);
+        case OP_SPLIT: gguf_split(params);
            break;
-        case SPLIT_OP_MERGE: gguf_merge(params);
+        case OP_MERGE: gguf_merge(params);
            break;
        default: split_print_usage(argv[0]);
            exit(EXIT_FAILURE);
--- a/examples/gritlm/gritlm.cpp
+++ b/examples/gritlm/gritlm.cpp
@@ -1,3 +1,4 @@
+#include "arg.h"
 #include "common.h"
 #include "llama.h"

@@ -9,25 +10,25 @@
 static std::vector<std::vector<float>> encode(llama_context * ctx, const std::vector<std::string> & sentences, const std::string & instruction) {
    std::vector<std::vector<float>> result;

-    const llama_model * mdl = llama_get_model(ctx);
+    const llama_model * model = llama_get_model(ctx);

    llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1);

    for (uint64_t i = 0; i < sentences.size(); i++) {
-        llama_batch_clear(batch);
+        common_batch_clear(batch);

        const std::string input_string = instruction + sentences[i];

-        std::vector<llama_token> inputs = llama_tokenize(mdl, input_string, true, false);
+        std::vector<llama_token> inputs = common_tokenize(model, input_string, true, false);

        const int32_t n_toks = inputs.size();

        // GritLM seems to have EOS = ""
        // https://github.com/ContextualAI/gritlm/blob/92025b16534712b31b3c4aaaf069350e222bd5f8/gritlm/gritlm.py#L18
-        // inputs.push_back(llama_token_eos(mdl));
+        // inputs.push_back(llama_token_eos(model));

        // we want to ignore instruction tokens for mean pooling
-        const int32_t n_inst = llama_tokenize(mdl, instruction, true, false).size();
+        const int32_t n_inst = common_tokenize(model, instruction, true, false).size();

 #ifdef GRIT_DEBUG
        // debug tokens - should be matching as referenced in the GritLM sample
@@ -39,7 +40,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve

        // add input to batch (this increments n_tokens)
        for (int32_t j = 0; j < n_toks; j++) {
-            llama_batch_add(batch, inputs[j], j, { 0 }, j >= n_inst);
+            common_batch_add(batch, inputs[j], j, { 0 }, j >= n_inst);
        }

        // clear previous kv_cache values (irrelevant for embeddings)
@@ -51,7 +52,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
        llama_decode(ctx, batch);

        // get embedding dimensions
-        uint64_t n_embd = llama_n_embd(mdl);
+        uint64_t n_embd = llama_n_embd(model);

        // allocate embedding output
        std::vector<float> emb_unorm(n_embd, 0.0f);
@@ -74,7 +75,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
        }

        std::vector<float> emb_norm(emb_unorm.size());
-        llama_embd_normalize(emb_unorm.data(), emb_norm.data(), n_embd);
+        common_embd_normalize(emb_unorm.data(), emb_norm.data(), n_embd);
        result.push_back(emb_norm);

 #ifdef GRIT_DEBUG
@@ -92,11 +93,11 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
    return result;
 }

-static std::string generate(llama_context * ctx, const std::string & prompt, bool stream) {
+static std::string generate(llama_context * ctx, llama_sampler * smpl, const std::string & prompt, bool stream) {
    std::string result;

-    const llama_model * mdl = llama_get_model(ctx);
-    llama_token eos_token = llama_token_eos(mdl);
+    const llama_model * model = llama_get_model(ctx);
+    llama_token eos_token = llama_token_eos(model);

    llama_kv_cache_clear(ctx);
    llama_set_embeddings(ctx, false);
@@ -104,33 +105,29 @@ static std::string generate(llama_context * ctx, const std::string & prompt, boo

    llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);

-    std::vector<llama_token> inputs = llama_tokenize(mdl, prompt, false, true);
+    std::vector<llama_token> inputs = common_tokenize(model, prompt, false, true);
    int32_t i_current_token = 0;

    while (true) {
-        llama_batch_clear(bat);
-        auto n_inputs = (int32_t)inputs.size();
-        for (int32_t i = 0; i < n_inputs; i++) {
-            llama_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1);
+        common_batch_clear(bat);
+        {
+            const int32_t n_inputs = inputs.size();
+
+            for (int32_t i = 0; i < n_inputs; i++) {
+                common_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1);
+            }
        }
        inputs.clear();

        llama_decode(ctx, bat);
-        auto logits = llama_get_logits_ith(ctx, bat.n_tokens - 1);

-        auto candidates = std::vector<llama_token_data>(llama_n_vocab(mdl));
-        auto n_candidates = (int32_t)candidates.size();
-        for (int32_t token = 0; token < n_candidates; token++) {
-            candidates[token] = llama_token_data{ token, logits[token], 0.0f };
-        }
-        auto candidates_p = llama_token_data_array{ candidates.data(), candidates.size(), false };
+        llama_token token = llama_sampler_sample(smpl, ctx, bat.n_tokens - 1);

-        llama_token token = llama_sample_token_greedy(ctx, &candidates_p);
        if (token == eos_token) {
            break;
        }

-        std::string piece = llama_token_to_piece(ctx, token);
+        std::string piece = common_token_to_piece(ctx, token);
        if (stream) {
            std::printf("%s", piece.c_str());
            std::fflush(stdout);
@@ -155,22 +152,31 @@ static std::string gritlm_instruction(const std::string & instruction) {
 }

 int main(int argc, char * argv[]) {
-    gpt_params params;
+    common_params params;

-    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
        return 1;
    }

-    llama_model_params mparams = llama_model_params_from_gpt_params(params);
-    llama_context_params cparams = llama_context_params_from_gpt_params(params);
+    common_init();
+
+    llama_model_params mparams = common_model_params_to_llama(params);
+    llama_context_params cparams = common_context_params_to_llama(params);

    llama_backend_init();

-    llama_model * mdl = llama_load_model_from_file(params.model.c_str(), mparams);
+    llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);

    // create generation context
-    llama_context * ctx = llama_new_context_with_model(mdl, cparams);
+    llama_context * ctx = llama_new_context_with_model(model, cparams);
+
+    auto sparams = llama_sampler_chain_default_params();
+
+    sparams.no_perf = false;
+
+    llama_sampler * smpl = llama_sampler_chain_init(sparams);
+
+    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());

    // ### Embedding/Representation ###
    // samples taken from: https://github.com/ContextualAI/gritlm#basic
@@ -191,12 +197,12 @@ int main(int argc, char * argv[]) {
        const std::vector<std::vector<float>> d_rep = encode(ctx, documents, gritlm_instruction(""));
        const std::vector<std::vector<float>> q_rep = encode(ctx, queries,   gritlm_instruction(instruction));

-        const int n_embd = llama_n_embd(mdl);
+        const int n_embd = llama_n_embd(model);

-        const float cosine_sim_q0_d0 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd);
-        const float cosine_sim_q0_d1 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd);
-        const float cosine_sim_q1_d0 = llama_embd_similarity_cos(q_rep[1].data(), d_rep[0].data(), n_embd);
-        const float cosine_sim_q1_d1 = llama_embd_similarity_cos(q_rep[1].data(), d_rep[1].data(), n_embd);
+        const float cosine_sim_q0_d0 = common_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd);
+        const float cosine_sim_q0_d1 = common_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd);
+        const float cosine_sim_q1_d0 = common_embd_similarity_cos(q_rep[1].data(), d_rep[0].data(), n_embd);
+        const float cosine_sim_q1_d1 = common_embd_similarity_cos(q_rep[1].data(), d_rep[1].data(), n_embd);

        std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[0].c_str(), cosine_sim_q0_d0);
        std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[1].c_str(), cosine_sim_q0_d1);
@@ -208,11 +214,12 @@ int main(int argc, char * argv[]) {
    // GritLM models are not finetuned with system prompts, as you can just include system-like instructions together with your user instruction
    {
        const std::string prompt = "<|user|>\nPlease write me a poem about my recent hike of Mt. Fuji at midnight in the style of Shakespeare.\n<|assistant|>\n";
-        std::string response = generate(ctx, prompt, true);
+        std::string response = generate(ctx, smpl, prompt, true);
    }

+    llama_sampler_free(smpl);
    llama_free(ctx);
-    llama_free_model(mdl);
+    llama_free_model(model);
    llama_backend_free();

    return 0;
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -1,4 +1,6 @@
+#include "arg.h"
 #include "common.h"
+#include "log.h"
 #include "llama.h"

 #include <cmath>
@@ -17,15 +19,13 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

-static void print_usage(int argc, char ** argv, const gpt_params & params) {
-    gpt_params_print_usage(argc, argv, params);
-
-    LOG_TEE("\nexample usage:\n");
-    LOG_TEE("\n    %s \\\n"
-            "       -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n"
+static void print_usage(int, char ** argv) {
+    LOG("\nexample usage:\n");
+    LOG("\n    %s \\\n"
+            "       -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] \\\n"
            "       [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n"
            "       [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]);
-    LOG_TEE("\n");
+    LOG("\n");
 }

 struct Stats {
@@ -37,13 +37,13 @@ struct Stats {
 class IMatrixCollector {
 public:
    IMatrixCollector() = default;
-    void set_params(gpt_params params) { m_params = std::move(params); }
+    void set_params(common_params params) { m_params = std::move(params); }
    bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
    void save_imatrix(int ncall = -1) const;
    bool load_imatrix(const char * file_name);
 private:
    std::unordered_map<std::string, Stats> m_stats;
-    gpt_params                             m_params;
+    common_params                          m_params;
    std::mutex                             m_mutex;
    int                                    m_last_call = 0;
    std::vector<float>                     m_src1_data;
@@ -126,12 +126,10 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
            e.counts.resize(src1->ne[0]*n_as, 0);
        }
        else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
-            fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
+            LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
            exit(1); //GGML_ABORT("fatal error");
        }
-        if (m_params.verbosity > 1) {
-            printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
-        }
+        LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
        // loop over all possible experts, regardless if they are used or not in the batch
        for (int ex = 0; ex < n_as; ++ex) {
            size_t e_start = ex*src1->ne[0];
@@ -152,7 +150,8 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
                        e.values[e_start + j] += x[j]*x[j];
                        e.counts[e_start + j]++;
                        if (!std::isfinite(e.values[e_start + j])) {
-                            fprintf(stderr, "%f detected in %s\n", e.values[e_start + j], wname.c_str());
+                            LOG("\n");
+                            LOG_ERR("%f detected in %s\n", e.values[e_start + j], wname.c_str());
                            exit(1);
                        }
                    }
@@ -175,20 +174,18 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
            e.counts.resize(src1->ne[0], 0);
        }
        else if (e.values.size() != (size_t)src1->ne[0]) {
-            fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
+            LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
            exit(1); //GGML_ABORT("fatal error");
        }
        ++e.ncall;
-        if (m_params.verbosity > 1) {
-            printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
-        }
+        LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
        for (int row = 0; row < (int)src1->ne[1]; ++row) {
            const float * x = data + row * src1->ne[0];
            for (int j = 0; j < (int)src1->ne[0]; ++j) {
                e.values[j] += x[j]*x[j];
                e.counts[j]++;
                if (!std::isfinite(e.values[j])) {
-                    fprintf(stderr, "%f detected in %s\n", e.values[j], wname.c_str());
+                    LOG_ERR("%f detected in %s\n", e.values[j], wname.c_str());
                    exit(1);
                }
            }
@@ -240,17 +237,17 @@ void IMatrixCollector::save_imatrix(int ncall) const {
        }

        if (n_zeros != 0 && is_first) {
-            fprintf(stderr, "\n");
+            LOG_INF("\n");
            is_first = false;
        }

        if (n_zeros == n_all) {
-            fprintf(stderr, "%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str());
+            LOG_WRN("%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str());
            continue;
        }

        if (n_zeros > 0) {
-            fprintf(stderr, "%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
+            LOG_WRN("%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
            continue;
        }

@@ -259,7 +256,7 @@ void IMatrixCollector::save_imatrix(int ncall) const {
    }

    if (to_store.size() < m_stats.size()) {
-        fprintf(stderr, "%s: warning: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
+        LOG_WRN("%s: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
    }

    std::ofstream out(fname, std::ios::binary);
@@ -291,21 +288,20 @@ void IMatrixCollector::save_imatrix(int ncall) const {
        out.write(m_params.prompt_file.c_str(), len);
    }

-    if (m_params.verbosity > 0) {
-        fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str());
-    }
+    LOGV(1, "\n");
+    LOG_DBGV(1, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str());
 }

 bool IMatrixCollector::load_imatrix(const char * fname) {
    std::ifstream in(fname, std::ios::binary);
    if (!in) {
-        printf("%s: failed to open %s\n",__func__, fname);
+        LOG_ERR("%s: failed to open %s\n",__func__, fname);
        return false;
    }
    int n_entries;
    in.read((char*)&n_entries, sizeof(n_entries));
    if (in.fail() || n_entries < 1) {
-        printf("%s: no data in file %s\n", __func__, fname);
+        LOG_ERR("%s: no data in file %s\n", __func__, fname);
        return false;
    }
    for (int i = 0; i < n_entries; ++i) {
@@ -313,7 +309,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) {
        std::vector<char> name_as_vec(len+1);
        in.read((char *)name_as_vec.data(), len);
        if (in.fail()) {
-            printf("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname);
+            LOG_ERR("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname);
            return false;
        }
        name_as_vec[len] = 0;
@@ -324,7 +320,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) {
        int nval;
        in.read((char *)&nval, sizeof(nval));
        if (in.fail() || nval < 1) {
-            printf("%s: failed reading number of values for entry %d\n",__func__,i);
+            LOG_ERR("%s: failed reading number of values for entry %d\n",__func__,i);
            m_stats = {};
            return false;
        }
@@ -337,7 +333,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) {
        std::vector<float> tmp(nval);
        in.read((char*)tmp.data(), nval*sizeof(float));
        if (in.fail()) {
-            printf("%s: failed reading data for entry %d\n",__func__,i);
+            LOG_ERR("%s: failed reading data for entry %d\n",__func__,i);
            m_stats = {};
            return false;
        }
@@ -432,32 +428,31 @@ static void process_logits(
    }
 }

-static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
+static bool compute_imatrix(llama_context * ctx, const common_params & params) {
    const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
    GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
    const int n_ctx = llama_n_ctx(ctx);

    auto tim1 = std::chrono::high_resolution_clock::now();
-    fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
+    LOG_INF("%s: tokenizing the input ..\n", __func__);

-    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
+    std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, true);

    auto tim2 = std::chrono::high_resolution_clock::now();
-    fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
+    LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());

    if (params.i_chunk > 0) {
        if (size_t((params.i_chunk + 2)*n_ctx) >= tokens.size()) {
-            fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk);
+            LOG_ERR("%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk);
            return false;
        }
-        fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx);
+        LOG_INF("%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx);
        tokens.erase(tokens.begin(), tokens.begin() + params.i_chunk*n_ctx);
    }

    if (int(tokens.size()) < 2*n_ctx) {
-        fprintf(stderr, "%s: you need at least %d tokens for a context of %d tokens\n",__func__,2*n_ctx,
-                n_ctx);
-        fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
+        LOG_ERR("%s: you need at least %d tokens for a context of %d tokens\n", __func__, 2*n_ctx, n_ctx);
+        LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n", __func__, tokens.size());
        return false;
    }

@@ -479,7 +474,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
    double nll = 0.0;
    double nll2 = 0.0;

-    fprintf(stderr, "%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch);
+    LOG_INF("%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch);

    std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);

@@ -501,6 +496,8 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
        // clear the KV cache
        llama_kv_cache_clear(ctx);

+        llama_batch batch = llama_batch_init(n_batch, 0, 1);
+
        for (int j = 0; j < num_batches; ++j) {
            const int batch_start = start + j * n_batch;
            const int batch_size  = std::min(end - batch_start, n_batch);
@@ -513,9 +510,14 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
                tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
            }

-            // TODO: use batch.logits to save computations instead of relying on logits_all == true
-            if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
-                fprintf(stderr, "%s : failed to eval\n", __func__);
+            common_batch_clear(batch);
+            for (int i = 0; i < batch_size; i++) {
+                common_batch_add(batch, tokens[batch_start + i], j*n_batch + i, {0}, true);
+            }
+
+            if (llama_decode(ctx, batch)) {
+                LOG_ERR("%s : failed to eval\n", __func__);
+                llama_batch_free(batch);
                return false;
            }

@@ -528,33 +530,35 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
            }
        }

+        llama_batch_free(batch);
+
        const auto t_end = std::chrono::high_resolution_clock::now();

        if (i == 0) {
            const float t_total = std::chrono::duration<float>(t_end - t_start).count();
-            fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
+            LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
            int total_seconds = (int)(t_total * n_chunk);
            if (total_seconds >= 60*60) {
-                fprintf(stderr, "%d hours ", total_seconds / (60*60));
+                LOG("%d hours ", total_seconds / (60*60));
                total_seconds = total_seconds % (60*60);
            }
-            fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
+            LOG("%.2f minutes\n", total_seconds / 60.0);
        }

        if (params.compute_ppl) {
            const int first = n_ctx/2;
-            const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
+            const auto * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
            process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
                    workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
            count += n_ctx - first - 1;

-            printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
+            LOG("[%d]%.4lf,", i + 1, std::exp(nll / count));
            fflush(stdout);

            logits.clear();
        }
    }
-    printf("\n");
+    LOG("\n");

    if (params.compute_ppl) {
        nll2 /= count;
@@ -563,9 +567,9 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
        nll2 -= nll * nll;
        if (nll2 > 0) {
            nll2 = sqrt(nll2/(count-1));
-            printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
+            LOG("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
        } else {
-            printf("Unexpected negative standard deviation of log(prob)\n");
+            LOG("Unexpected negative standard deviation of log(prob)\n");
        }
    }

@@ -573,31 +577,32 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
 }

 int main(int argc, char ** argv) {
-    gpt_params params;
+    common_params params;

    params.n_ctx = 512;
    params.logits_all = true;
-    params.verbosity = 1;
+    params.escape = false;

-    if (!gpt_params_parse(argc, argv, params)) {
-        print_usage(argc, argv, params);
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) {
        return 1;
    }

+    common_init();
+
    params.n_batch = std::min(params.n_batch, params.n_ctx);

    g_collector.set_params(params);

    for (const auto & in_file : params.in_files) {
-        printf("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
+        LOG_INF("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
        if (!g_collector.load_imatrix(in_file.c_str())) {
-            fprintf(stderr, "%s : failed to load %s\n", __func__, in_file.c_str());
+            LOG_ERR("%s : failed to load %s\n", __func__, in_file.c_str());
            return 1;
        }
    }

    if (params.in_files.size() > 1) {
-        printf("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str());
+        LOG_INF("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str());
        g_collector.save_imatrix();
    }

@@ -611,25 +616,25 @@ int main(int argc, char ** argv) {
    params.warmup = false;

    // init
-    llama_init_result llama_init = llama_init_from_gpt_params(params);
+    common_init_result llama_init = common_init_from_params(params);

    llama_model * model = llama_init.model;
    llama_context * ctx = llama_init.context;
    if (model == nullptr || ctx == nullptr) {
-        fprintf(stderr, "%s : failed to init\n", __func__);
+        LOG_ERR("%s : failed to init\n", __func__);
        return 1;
    }

    const int n_ctx_train = llama_n_ctx_train(model);
    if (params.n_ctx > n_ctx_train) {
-        fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
+        LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n",
                __func__, n_ctx_train, params.n_ctx);
    }

    // print system information
    {
-        fprintf(stderr, "\n");
-        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
+        LOG_INF("\n");
+        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
    }

    if (!compute_imatrix(ctx, params)) {
@@ -638,7 +643,8 @@ int main(int argc, char ** argv) {

    g_collector.save_imatrix();

-    llama_print_timings(ctx);
+    LOG("\n");
+    llama_perf_context_print(ctx);

    llama_free(ctx);
    llama_free_model(model);
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -1,8 +1,9 @@
+#include "arg.h"
 #include "common.h"
-
 #include "console.h"
+#include "sampling.h"
+#include "log.h"
 #include "llama.h"
-#include "grammar-parser.h"

 #include <cassert>
 #include <cinttypes>
@@ -34,57 +35,14 @@

 static llama_context           ** g_ctx;
 static llama_model             ** g_model;
-static gpt_params               * g_params;
+static common_sampler          ** g_smpl;
+static common_params            * g_params;
 static std::vector<llama_token> * g_input_tokens;
 static std::ostringstream       * g_output_ss;
 static std::vector<llama_token> * g_output_tokens;

 static bool is_interacting = false;

-static void write_logfile(
-    const llama_context * ctx, const gpt_params & params, const llama_model * model,
-    const std::vector<llama_token> & input_tokens, const std::string & output,
-    const std::vector<llama_token> & output_tokens
-) {
-    if (params.logdir.empty()) {
-        return;
-    }
-
-    const std::string timestamp = string_get_sortable_timestamp();
-
-    const bool success = fs_create_directory_with_parents(params.logdir);
-    if (!success) {
-        fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
-                __func__, params.logdir.c_str());
-        return;
-    }
-
-    const std::string logfile_path = params.logdir + timestamp + ".yml";
-    FILE * logfile = fopen(logfile_path.c_str(), "w");
-
-    if (logfile == NULL) {
-        fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
-        return;
-    }
-
-    fprintf(logfile, "binary: infill\n");
-    char model_desc[128];
-    llama_model_desc(model, model_desc, sizeof(model_desc));
-    yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc);
-
-    fprintf(logfile, "\n");
-    fprintf(logfile, "######################\n");
-    fprintf(logfile, "# Generation Results #\n");
-    fprintf(logfile, "######################\n");
-    fprintf(logfile, "\n");
-
-    yaml_dump_string_multiline(logfile, "output", output.c_str());
-    yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
-
-    llama_dump_timing_info_yaml(logfile, ctx);
-    fclose(logfile);
-}
-
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
 static void sigint_handler(int signo) {
    if (signo == SIGINT) {
@@ -92,9 +50,13 @@ static void sigint_handler(int signo) {
            is_interacting = true;
        } else {
            console::cleanup();
-            printf("\n");
-            llama_print_timings(*g_ctx);
-            write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
+            LOG("\n");
+            common_perf_print(*g_ctx, *g_smpl);
+
+            // make sure all logs are flushed
+            LOG("Interrupted by user\n");
+            common_log_pause(common_log_main());
+
            _exit(130);
        }
    }
@@ -102,121 +64,107 @@ static void sigint_handler(int signo) {
 #endif

 int main(int argc, char ** argv) {
-    gpt_params params;
-    llama_sampling_params & sparams = params.sparams;
+    common_params params;
    g_params = &params;

-    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) {
        return 1;
    }

-#ifndef LOG_DISABLE_LOGS
-    log_set_target(log_filename_generator("infill", "log"));
-    LOG_TEE("Log start\n");
-    log_dump_cmdline(argc, argv);
-#endif // LOG_DISABLE_LOGS
+    common_init();
+
+    auto & sparams = params.sparams;

    console::init(params.simple_io, params.use_color);
    atexit([]() { console::cleanup(); });

    if (params.logits_all) {
-        printf("\n************\n");
-        printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
-        printf("************\n\n");
+        LOG_ERR("\n************\n");
+        LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
+        LOG_ERR("************\n\n");

        return 0;
    }

    if (params.embedding) {
-        printf("\n************\n");
-        printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
-        printf("************\n\n");
+        LOG_ERR("\n************\n");
+        LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
+        LOG_ERR("************\n\n");

        return 0;
    }

    if (params.n_ctx != 0 && params.n_ctx < 8) {
-        LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
+        LOG_WRN("%s: minimum context size is 8, using minimum size.\n", __func__);
        params.n_ctx = 8;
    }
+
    if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) {
-        printf("\n************\n");
-        printf("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
-        printf("************\n\n");
+        LOG_ERR("\n************\n");
+        LOG_ERR("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
+        LOG_ERR("************\n\n");

        return 0;
    }

    if (params.rope_freq_base != 0.0) {
-        LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
+        LOG_WRN("%s: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
    }

    if (params.rope_freq_scale != 0.0) {
-        LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
+        LOG_WRN("%s: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
    }

-    LOG_TEE("%s: build = %d (%s)\n",      __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
-    LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
-
-    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = time(NULL);
-    }
-
-    LOG_TEE("%s: seed  = %u\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
-
-    LOG("%s: llama backend init\n", __func__);
+    LOG_INF("%s: llama backend init\n", __func__);
    llama_backend_init();
    llama_numa_init(params.numa);

-    llama_model * model;
-    llama_context * ctx;
+    llama_model * model = nullptr;
+    llama_context * ctx = nullptr;
+    common_sampler * smpl = nullptr;

    g_model = &model;
    g_ctx = &ctx;
+    g_smpl = &smpl;

    // load the model and apply lora adapter, if any
-    LOG("%s: load the model and apply lora adapter, if any\n", __func__);
-    llama_init_result llama_init = llama_init_from_gpt_params(params);
+    LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
+    common_init_result llama_init = common_init_from_params(params);

    model = llama_init.model;
    ctx = llama_init.context;

    if (model == NULL) {
-        LOG_TEE("%s: error: unable to load model\n", __func__);
+        LOG_ERR("%s: unable to load model\n", __func__);
        return 1;
    }

    const int n_ctx_train = llama_n_ctx_train(model);
    const int n_ctx = llama_n_ctx(ctx);
-    LOG("n_ctx: %d\n", n_ctx);
+    LOG_DBG("n_ctx: %d\n", n_ctx);

    if (n_ctx > n_ctx_train) {
-        LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
-                __func__, n_ctx_train, n_ctx);
+        LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
    }

    // print system information
    {
-        LOG_TEE("\n");
-        LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
+        LOG_INF("\n");
+        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
    }
    const bool add_bos = llama_add_bos_token(model);
    GGML_ASSERT(!llama_add_eos_token(model));
-    LOG("add_bos: %d\n", add_bos);

    std::vector<llama_token> embd_inp;
    std::vector<llama_token> embd_end;
-    std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
-    std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
+    std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
+    std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);

-    GGML_ASSERT(llama_token_prefix(model) >= 0);
-    GGML_ASSERT(llama_token_suffix(model) >= 0);
+    GGML_ASSERT(llama_token_fim_pre(model) >= 0);
+    GGML_ASSERT(llama_token_fim_suf(model) >= 0);

-    inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
-    inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
+    inp_pfx.insert(inp_pfx.begin(), llama_token_fim_pre(model));
+    inp_sfx.insert(inp_sfx.begin(), llama_token_fim_suf(model));

    embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
    embd_end = params.spm_infill ? inp_pfx : inp_sfx;
@@ -225,23 +173,24 @@ int main(int argc, char ** argv) {
    }
    embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());

-    const llama_token middle_token = llama_token_middle(model);
+    const llama_token middle_token = llama_token_fim_mid(model);
    if (middle_token >= 0) {
        embd_inp.push_back(middle_token);
    }

-    LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix));
-    LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix));
-    LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
+    LOG_DBG("add_bos: %d\n", add_bos);
+    LOG_DBG("prefix: \"%s\"\n", params.input_prefix.c_str());
+    LOG_DBG("suffix: \"%s\"\n", params.input_suffix.c_str());
+    LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str());

    // Should not run without any tokens
    if (embd_inp.empty()) {
        embd_inp.push_back(llama_token_bos(model));
-        LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
+        LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
    }

    if ((int) embd_inp.size() > n_ctx - 4) {
-        LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
+        LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
        return 1;
    }

@@ -250,9 +199,8 @@ int main(int argc, char ** argv) {
        params.n_keep = (int)embd_inp.size();
    }

-    LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
-    LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
-
+    LOG_INF("inp_pfx: %s\n", string_from(ctx, inp_pfx).c_str());
+    LOG_INF("inp_sfx: %s\n", string_from(ctx, inp_sfx).c_str());

    // enable interactive mode if interactive start is specified
    if (params.interactive_first) {
@@ -260,21 +208,21 @@ int main(int argc, char ** argv) {
    }

    if (params.verbose_prompt) {
-        LOG_TEE("\n");
-        LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
-        LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
+        LOG_INF("\n");
+        LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
+        LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
        for (int i = 0; i < (int) embd_inp.size(); i++) {
-            LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
+            LOG_INF("%6d -> '%s'\n", embd_inp[i], common_token_to_piece(ctx, embd_inp[i]).c_str());
        }

        if (params.n_keep > 0) {
-        LOG_TEE("%s: static prompt based on n_keep: '", __func__);
+        LOG_INF("%s: static prompt based on n_keep: '", __func__);
            for (int i = 0; i < params.n_keep; i++) {
-                LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
+                LOG_CNT("%s", common_token_to_piece(ctx, embd_inp[i]).c_str());
            }
-            LOG_TEE("'\n");
+            LOG_CNT("'\n");
        }
-        LOG_TEE("\n");
+        LOG_INF("\n");
    }

    if (params.interactive) {
@@ -291,30 +239,30 @@ int main(int argc, char ** argv) {
        SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
 #endif

-        LOG_TEE("%s: interactive mode on.\n", __func__);
+        LOG_INF("%s: interactive mode on.\n", __func__);

        if (params.input_prefix_bos) {
-            LOG_TEE("Input prefix with BOS\n");
+            LOG_INF("Input prefix with BOS\n");
        }

        if (!params.input_prefix.empty()) {
-            LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
+            LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str());
        }

        if (!params.input_suffix.empty()) {
-            LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
+            LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str());
        }
    }
-    LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
-    LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
-    LOG_TEE("\n\n");
+    smpl = common_sampler_init(model, sparams);

-    LOG_TEE("\n#####  Infill mode  #####\n\n");
-    if (params.infill) {
-        printf("\n************\n");
-        printf("no need to specify '--infill', always running infill\n");
-        printf("************\n\n");
-    }
+    LOG_INF("sampler seed: %u\n",     common_sampler_get_seed(smpl));
+    LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
+    LOG_INF("sampler chain: %s\n",    common_sampler_print(smpl).c_str());
+
+    LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
+
+    LOG_INF("\n");
+    LOG_INF("\n#####  Infill mode  #####\n\n");
    if (params.interactive) {
        const char *control_message;
        if (params.multiline_input) {
@@ -325,11 +273,11 @@ int main(int argc, char ** argv) {
                              " - To return control without starting a new line, end your input with '/'.\n"
                              " - If you want to submit another line, end your input with '\\'.\n";
        }
-        LOG_TEE("== Running in interactive mode. ==\n");
+        LOG_INF("== Running in interactive mode. ==\n");
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
-        LOG_TEE(       " - Press Ctrl+C to interject at any time.\n");
+        LOG_INF(       " - Press Ctrl+C to interject at any time.\n");
 #endif
-        LOG_TEE(       "%s\n", control_message);
+        LOG_INF(       "%s\n", control_message);

        is_interacting = params.interactive_first;
    }
@@ -349,8 +297,6 @@ int main(int argc, char ** argv) {

    std::vector<llama_token> embd;

-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
-
    while (n_remain != 0 || params.interactive) {
        // predict
        if (!embd.empty()) {
@@ -364,9 +310,8 @@ int main(int argc, char ** argv) {
                embd.resize(max_embd_size);

                console::set_display(console::error);
-                printf("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
+                LOG_WRN("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
                console::set_display(console::reset);
-                fflush(stdout);
            }

            // infinite text generation via context swapping
@@ -375,14 +320,14 @@ int main(int argc, char ** argv) {
            // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
            if (n_past + (int) embd.size() > n_ctx) {
                if (params.n_predict == -2) {
-                    LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
+                    LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
                    break;
                }

                const int n_left    = n_past - params.n_keep - 1;
                const int n_discard = n_left/2;

-                LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
+                LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
                    n_past, n_left, n_ctx, params.n_keep, n_discard);

                llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1            , params.n_keep + n_discard + 1);
@@ -390,9 +335,9 @@ int main(int argc, char ** argv) {

                n_past -= n_discard;

-                LOG("after swap: n_past = %d\n", n_past);
+                LOG_DBG("after swap: n_past = %d\n", n_past);

-                LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
+                LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());

            }

@@ -404,16 +349,16 @@ int main(int argc, char ** argv) {
                    n_eval = params.n_batch;
                }

-                LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
+                LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());

-                if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
-                    LOG_TEE("%s : failed to eval\n", __func__);
+                if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval))) {
+                    LOG_ERR("%s : failed to eval\n", __func__);
                    return 1;
                }

                n_past += n_eval;

-                LOG("n_past = %d\n", n_past);
+                LOG_DBG("n_past = %d\n", n_past);
            }

        }
@@ -421,11 +366,11 @@ int main(int argc, char ** argv) {
        embd.clear();

        if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
-            const llama_token id = llama_sampling_sample(ctx_sampling, ctx, nullptr);
+            const llama_token id = common_sampler_sample(smpl, ctx, -1);

-            llama_sampling_accept(ctx_sampling, ctx, id, true);
+            common_sampler_accept(smpl, id, true);

-            LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
+            // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());

            embd.push_back(id);

@@ -435,16 +380,16 @@ int main(int argc, char ** argv) {
            // decrement remaining sampling budget
            --n_remain;

-            LOG("n_remain: %d\n", n_remain);
+            LOG_DBG("n_remain: %d\n", n_remain);
        } else {
            // some user input remains from prompt or interaction, forward it to processing
-            LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
+            LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
            while ((int) embd_inp.size() > n_consumed) {
                embd.push_back(embd_inp[n_consumed]);

                // push the prompt in the sampling context in order to apply repetition penalties later
                // for the prompt, we don't apply grammar rules
-                llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false);
+                common_sampler_accept(smpl, embd_inp[n_consumed], false);

                ++n_consumed;
                if ((int) embd.size() >= params.n_batch) {
@@ -456,8 +401,8 @@ int main(int argc, char ** argv) {
        // display text
        if (input_echo) {
            for (auto id : embd) {
-                const std::string token_str = llama_token_to_piece(ctx, id);
-                printf("%s", token_str.c_str());
+                const std::string token_str = common_token_to_piece(ctx, id);
+                LOG("%s", token_str.c_str());

                if (embd.size() > 1) {
                    input_tokens.push_back(id);
@@ -466,7 +411,6 @@ int main(int argc, char ** argv) {
                    output_ss << token_str;
                }
            }
-            fflush(stdout);
        }
        // reset color to default if we there is no pending user input
        if (input_echo && (int) embd_inp.size() == n_consumed) {
@@ -476,13 +420,12 @@ int main(int argc, char ** argv) {
        // if not currently processing queued inputs;
        if ((int) embd_inp.size() <= n_consumed) {
            // deal with eot token in infill mode
-            if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){
+            if ((common_sampler_last(smpl) == llama_token_eot(model) || is_interacting) && params.interactive){
                if (is_interacting && !params.interactive_first) {
                    // print an eot token
-                    printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
+                    LOG("%s", common_token_to_piece(ctx, llama_token_eot(model)).c_str());
                }
-                fflush(stdout);
-                printf("\n");
+                LOG("\n");
                console::set_display(console::user_input);
                std::string buffer;
                std::string line;
@@ -517,11 +460,11 @@ int main(int argc, char ** argv) {
                }

                // tokenize new prefix and suffix
-                std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
-                std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
+                std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
+                std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);

-                inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
-                inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
+                inp_pfx.insert(inp_pfx.begin(), llama_token_fim_pre(model));
+                inp_sfx.insert(inp_sfx.begin(), llama_token_fim_suf(model));

                embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
                embd_end = params.spm_infill ? inp_pfx : inp_sfx;
@@ -538,35 +481,33 @@ int main(int argc, char ** argv) {
                n_remain = params.n_predict;
                n_past = 0;
                n_consumed = 0;
-                // LOG_TEE("took new input\n");
                is_interacting = false;
            }
            // deal with end of generation tokens in interactive mode
-            else if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) {
-                LOG("found EOS token\n");
+            else if (llama_token_is_eog(model, common_sampler_last(smpl))) {
+                LOG_DBG("found EOS token\n");

                if (params.interactive) {

                    is_interacting = true;
-                    printf("\n");
+                    LOG("\n");
                    console::set_display(console::user_input);
-                    fflush(stdout);
               }
            }

            if (n_past > 0 && is_interacting && !params.interactive) {
-                LOG("waiting for user input\n");
+                LOG_DBG("waiting for user input\n");

                if (params.input_prefix_bos) {
-                    LOG("adding input prefix BOS token\n");
+                    LOG_DBG("adding input prefix BOS token\n");
                    embd_inp.push_back(llama_token_bos(model));
                }

                std::string buffer;
                if (!params.input_prefix.empty()) {
-                    LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
+                    LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
                    buffer += params.input_prefix;
-                    printf("%s", buffer.c_str());
+                    LOG("%s", buffer.c_str());
                }

                std::string line;
@@ -584,30 +525,30 @@ int main(int argc, char ** argv) {
                if (buffer.length() > 1) {
                    // append input suffix if any
                    if (!params.input_suffix.empty()) {
-                        LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
+                        LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
                        buffer += params.input_suffix;
-                        printf("%s", params.input_suffix.c_str());
+                        LOG("%s", params.input_suffix.c_str());
                    }

-                    LOG("buffer: '%s'\n", buffer.c_str());
+                    LOG_DBG("buffer: '%s'\n", buffer.c_str());

                    const size_t original_size = embd_inp.size();

-                    const auto line_inp = ::llama_tokenize(ctx, buffer, false);
-                    LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
+                    const auto line_inp = common_tokenize(ctx, buffer, false);
+                    LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());

                    embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());

                    for (size_t i = original_size; i < embd_inp.size(); ++i) {
                        const llama_token token = embd_inp[i];
                        output_tokens.push_back(token);
-                        output_ss << llama_token_to_piece(ctx, token);
+                        output_ss << common_token_to_piece(ctx, token);
                    }

                    n_remain -= line_inp.size();
-                    LOG("n_remain: %d\n", n_remain);
+                    LOG_DBG("n_remain: %d\n", n_remain);
                } else {
-                    LOG("empty line, passing control back\n");
+                    LOG_DBG("empty line, passing control back\n");
                }

                input_echo = false; // do not echo this again
@@ -615,7 +556,7 @@ int main(int argc, char ** argv) {

            if (n_past > 0) {
                if (is_interacting) {
-                    llama_sampling_reset(ctx_sampling);
+                    common_sampler_reset(smpl);
                }
                is_interacting = false;
            }
@@ -634,22 +575,17 @@ int main(int argc, char ** argv) {
        }
    }
    if (!params.interactive && n_remain <= 0) {
-        printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
-        fflush(stdout);
+        LOG("%s", common_token_to_piece(ctx, llama_token_eot(model)).c_str());
    }

-    llama_print_timings(ctx);
-    write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
+    LOG("\n");
+    common_perf_print(ctx, smpl);

    llama_free(ctx);
    llama_free_model(model);

-    llama_sampling_free(ctx_sampling);
+    common_sampler_free(smpl);
    llama_backend_free();

-#ifndef LOG_DISABLE_LOGS
-    LOG_TEE("Log end\n");
-#endif // LOG_DISABLE_LOGS
-
    return 0;
 }
--- a/examples/json_schema_to_grammar.py
+++ b/examples/json_schema_to_grammar.py
@@ -540,7 +540,7 @@ class SchemaConverter:
        return self._add_rule(
            name,
            to_rule(transform()) if self._raw_pattern \
-                else "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space")
+                else "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space")


    def _resolve_ref(self, ref):
--- a/examples/llama-bench/README.md
+++ b/examples/llama-bench/README.md
@@ -14,7 +14,8 @@ Performance testing tool for llama.cpp.
    1. [Markdown](#markdown)
    2. [CSV](#csv)
    3. [JSON](#json)
-    4. [SQL](#sql)
+    4. [JSONL](#jsonl)
+    5. [SQL](#sql)

 ## Syntax

@@ -23,27 +24,34 @@ usage: ./llama-bench [options]

 options:
  -h, --help
-  -m, --model <filename>              (default: models/7B/ggml-model-q4_0.gguf)
-  -p, --n-prompt <n>                  (default: 512)
-  -n, --n-gen <n>                     (default: 128)
-  -pg <pp,tg>                         (default: 512,128)
-  -b, --batch-size <n>                (default: 2048)
-  -ub, --ubatch-size <n>              (default: 512)
-  -ctk, --cache-type-k <t>            (default: f16)
-  -ctv, --cache-type-v <t>            (default: f16)
-  -t, --threads <n>                   (default: 16)
-  -ngl, --n-gpu-layers <n>            (default: 99)
-  -sm, --split-mode <none|layer|row>  (default: layer)
-  -mg, --main-gpu <i>                 (default: 0)
-  -nkvo, --no-kv-offload <0|1>        (default: 0)
-  -fa, --flash-attn <0|1>             (default: 0)
-  -mmp, --mmap <0|1>                  (default: 1)
-  --numa <distribute|isolate|numactl> (default: disabled)
-  -embd, --embeddings <0|1>           (default: 0)
-  -ts, --tensor-split <ts0/ts1/..>    (default: 0)
-  -r, --repetitions <n>               (default: 5)
-  -o, --output <csv|json|md|sql>      (default: md)
-  -v, --verbose                       (default: 0)
+  -m, --model <filename>                    (default: models/7B/ggml-model-q4_0.gguf)
+  -p, --n-prompt <n>                        (default: 512)
+  -n, --n-gen <n>                           (default: 128)
+  -pg <pp,tg>                               (default: )
+  -b, --batch-size <n>                      (default: 2048)
+  -ub, --ubatch-size <n>                    (default: 512)
+  -ctk, --cache-type-k <t>                  (default: f16)
+  -ctv, --cache-type-v <t>                  (default: f16)
+  -t, --threads <n>                         (default: 8)
+  -C, --cpu-mask <hex,hex>                  (default: 0x0)
+  --cpu-strict <0|1>                        (default: 0)
+  --poll <0...100>                          (default: 50)
+  -ngl, --n-gpu-layers <n>                  (default: 99)
+  -rpc, --rpc <rpc_servers>                 (default: )
+  -sm, --split-mode <none|layer|row>        (default: layer)
+  -mg, --main-gpu <i>                       (default: 0)
+  -nkvo, --no-kv-offload <0|1>              (default: 0)
+  -fa, --flash-attn <0|1>                   (default: 0)
+  -mmp, --mmap <0|1>                        (default: 1)
+  --numa <distribute|isolate|numactl>       (default: disabled)
+  -embd, --embeddings <0|1>                 (default: 0)
+  -ts, --tensor-split <ts0/ts1/..>          (default: 0)
+  -r, --repetitions <n>                     (default: 5)
+  --prio <0|1|2|3>                          (default: 0)
+  --delay <0...N> (seconds)                 (default: 0)
+  -o, --output <csv|json|jsonl|md|sql>      (default: md)
+  -oe, --output-err <csv|json|jsonl|md|sql> (default: none)
+  -v, --verbose                             (default: 0)

 Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.
 ```
@@ -238,6 +246,19 @@ $ ./llama-bench -o json
 ]
 ```

+
+### JSONL
+
+```sh
+$ ./llama-bench -o jsonl
+```
+
+```json lines
+{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":512,"n_gen":0,"test_time":"2023-09-23T12:09:57Z","avg_ns":212365953,"stddev_ns":985423,"avg_ts":2410.974041,"stddev_ts":11.163766,"samples_ns":[213837238,211635853,212328053,211329715,212698907],"samples_ts":[2394.34,2419.25,2411.36,2422.75,2407.16]}
+{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":0,"n_gen":128,"test_time":"2023-09-23T12:09:59Z","avg_ns":977425219,"stddev_ns":9268593,"avg_ts":130.965708,"stddev_ts":1.238924,"samples_ns":[984472709,974901233,989474741,970729355,967548060],"samples_ts":[130.019,131.295,129.362,131.86,132.293]}
+```
+
+
 ### SQL

 SQL output is suitable for importing into a SQLite database. The output can be piped into the `sqlite3` command line tool to add the results to a database.
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -16,16 +16,11 @@
 #include <sstream>
 #include <string>
 #include <vector>
+#include <thread>

 #include "ggml.h"
 #include "llama.h"
 #include "common.h"
-#include "ggml-cuda.h"
-#include "ggml-sycl.h"
-
-#ifdef GGML_USE_CANN
-#include "ggml-cann.h"
-#endif

 #ifdef _WIN32
 #define WIN32_LEAN_AND_MEAN
@@ -81,102 +76,38 @@ static T stdev(const std::vector<T> & v) {
 }

 static std::string get_cpu_info() {
-    std::string id;
-#ifdef __linux__
-    FILE * f = fopen("/proc/cpuinfo", "r");
-    if (f) {
-        char buf[1024];
-        while (fgets(buf, sizeof(buf), f)) {
-            if (strncmp(buf, "model name", 10) == 0) {
-                char * p = strchr(buf, ':');
-                if (p) {
-                    p++;
-                    while (std::isspace(*p)) {
-                        p++;
-                    }
-                    while (std::isspace(p[strlen(p) - 1])) {
-                        p[strlen(p) - 1] = '\0';
-                    }
-                    id = p;
-                    break;
-                }
-            }
+    std::vector<std::string> cpu_list;
+    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
+        auto * dev = ggml_backend_dev_get(i);
+        auto dev_type = ggml_backend_dev_type(dev);
+        if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU || dev_type == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
+            cpu_list.push_back(ggml_backend_dev_description(dev));
        }
-        fclose(f);
    }
-#elif defined(_WIN32)
-    HKEY hKey;
-    if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
-                     TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
-                     0,
-                     KEY_READ,
-                     &hKey) != ERROR_SUCCESS) {
-        // fail to open registry key
-        return "";
-    }
-    char cpu_brand[256];
-    DWORD cpu_brand_size = sizeof(cpu_brand);
-    if (RegQueryValueExA(hKey,
-                        TEXT("ProcessorNameString"),
-                        NULL,
-                        NULL,
-                        (LPBYTE)cpu_brand,
-                        &cpu_brand_size) == ERROR_SUCCESS) {
-        id.assign(cpu_brand, cpu_brand_size);
-    }
-    RegCloseKey(hKey);
-#endif
-    // TODO: other platforms
-    return id;
+    return join(cpu_list, ", ");
 }

 static std::string get_gpu_info() {
-    std::string id;
-#ifdef GGML_USE_CUDA
-    int count = ggml_backend_cuda_get_device_count();
-    for (int i = 0; i < count; i++) {
-        char buf[128];
-        ggml_backend_cuda_get_device_description(i, buf, sizeof(buf));
-        id += buf;
-        if (i < count - 1) {
-            id += "/";
+    std::vector<std::string> gpu_list;
+    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
+        auto * dev = ggml_backend_dev_get(i);
+        auto dev_type = ggml_backend_dev_type(dev);
+        if (dev_type == GGML_BACKEND_DEVICE_TYPE_GPU) {
+            gpu_list.push_back(ggml_backend_dev_description(dev));
        }
    }
-#endif
-#ifdef GGML_USE_SYCL
-    int count = ggml_backend_sycl_get_device_count();
-    for (int i = 0; i < count; i++) {
-        char buf[128];
-        ggml_sycl_get_device_description(i, buf, sizeof(buf));
-        id += buf;
-        if (i < count - 1) {
-            id += "/";
-        }
-    }
-#endif
-#ifdef GGML_USE_CANN
-    uint32_t count = ggml_backend_cann_get_device_count();
-    for (uint32_t i = 0; i < count; i++) {
-        char buf[128];
-        ggml_backend_cann_get_device_description(i, buf, sizeof(buf));
-        id += buf;
-        if (i < count - 1) {
-            id += "/";
-        }
-    }
-#endif
-    // TODO: other backends
-    return id;
+    return join(gpu_list, ", ");
 }

 // command line params
-enum output_formats {NONE, CSV, JSON, MARKDOWN, SQL};
+enum output_formats {NONE, CSV, JSON, JSONL, MARKDOWN, SQL};

 static const char * output_format_str(output_formats format) {
    switch (format) {
        case NONE:     return "none";
        case CSV:      return "csv";
        case JSON:     return "json";
+        case JSONL:    return "jsonl";
        case MARKDOWN: return "md";
        case SQL:      return "sql";
        default: GGML_ABORT("invalid output format");
@@ -190,6 +121,8 @@ static bool output_format_from_str(const std::string & s, output_formats & forma
        format = CSV;
    } else if (s == "json") {
        format = JSON;
+    } else if (s == "jsonl") {
+        format = JSONL;
    } else if (s == "md") {
        format = MARKDOWN;
    } else if (s == "sql") {
@@ -225,6 +158,9 @@ struct cmd_params {
    std::vector<ggml_type> type_k;
    std::vector<ggml_type> type_v;
    std::vector<int> n_threads;
+    std::vector<std::string> cpu_mask;
+    std::vector<bool> cpu_strict;
+    std::vector<int> poll;
    std::vector<int> n_gpu_layers;
    std::vector<std::string> rpc_servers;
    std::vector<llama_split_mode> split_mode;
@@ -236,7 +172,10 @@ struct cmd_params {
    std::vector<bool> embeddings;
    ggml_numa_strategy numa;
    int reps;
+    ggml_sched_priority prio;
+    int delay;
    bool verbose;
+    bool progress;
    output_formats output_format;
    output_formats output_format_stderr;
 };
@@ -251,6 +190,9 @@ static const cmd_params cmd_params_defaults = {
    /* type_k               */ {GGML_TYPE_F16},
    /* type_v               */ {GGML_TYPE_F16},
    /* n_threads            */ {cpu_get_num_math()},
+    /* cpu_mask             */ {"0x0"},
+    /* cpu_strict           */ {false},
+    /* poll                 */ {50},
    /* n_gpu_layers         */ {99},
    /* rpc_servers          */ {""},
    /* split_mode           */ {LLAMA_SPLIT_MODE_LAYER},
@@ -262,7 +204,10 @@ static const cmd_params cmd_params_defaults = {
    /* embeddings           */ {false},
    /* numa                 */ GGML_NUMA_STRATEGY_DISABLED,
    /* reps                 */ 5,
+    /* prio                 */ GGML_SCHED_PRIO_NORMAL,
+    /* delay                */ 0,
    /* verbose              */ false,
+    /* progress             */ false,
    /* output_format        */ MARKDOWN,
    /* output_format_stderr */ NONE,
 };
@@ -272,29 +217,37 @@ static void print_usage(int /* argc */, char ** argv) {
    printf("\n");
    printf("options:\n");
    printf("  -h, --help\n");
-    printf("  -m, --model <filename>              (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
-    printf("  -p, --n-prompt <n>                  (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
-    printf("  -n, --n-gen <n>                     (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
-    printf("  -pg <pp,tg>                         (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
-    printf("  -b, --batch-size <n>                (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
-    printf("  -ub, --ubatch-size <n>              (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
-    printf("  -ctk, --cache-type-k <t>            (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
-    printf("  -ctv, --cache-type-v <t>            (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
-    printf("  -t, --threads <n>                   (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
-    printf("  -ngl, --n-gpu-layers <n>            (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
-    printf("  -rpc, --rpc <rpc_servers>           (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
-    printf("  -sm, --split-mode <none|layer|row>  (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
-    printf("  -mg, --main-gpu <i>                 (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
-    printf("  -nkvo, --no-kv-offload <0|1>        (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
-    printf("  -fa, --flash-attn <0|1>             (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
-    printf("  -mmp, --mmap <0|1>                  (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
-    printf("  --numa <distribute|isolate|numactl> (default: disabled)\n");
-    printf("  -embd, --embeddings <0|1>           (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
-    printf("  -ts, --tensor-split <ts0/ts1/..>    (default: 0)\n");
-    printf("  -r, --repetitions <n>               (default: %d)\n", cmd_params_defaults.reps);
-    printf("  -o, --output <csv|json|md|sql>      (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
-    printf("  -oe, --output-err <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
-    printf("  -v, --verbose                       (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
+    printf("  -m, --model <filename>                    (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
+    printf("  -p, --n-prompt <n>                        (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
+    printf("  -n, --n-gen <n>                           (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
+    printf("  -pg <pp,tg>                               (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
+    printf("  -b, --batch-size <n>                      (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
+    printf("  -ub, --ubatch-size <n>                    (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
+    printf("  -ctk, --cache-type-k <t>                  (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
+    printf("  -ctv, --cache-type-v <t>                  (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
+    printf("  -t, --threads <n>                         (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
+    printf("  -C, --cpu-mask <hex,hex>                  (default: %s)\n", join(cmd_params_defaults.cpu_mask, ",").c_str());
+    printf("  --cpu-strict <0|1>                        (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str());
+    printf("  --poll <0...100>                          (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
+    printf("  -ngl, --n-gpu-layers <n>                  (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
+    if (llama_supports_rpc()) {
+        printf("  -rpc, --rpc <rpc_servers>                 (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
+    }
+    printf("  -sm, --split-mode <none|layer|row>        (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
+    printf("  -mg, --main-gpu <i>                       (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
+    printf("  -nkvo, --no-kv-offload <0|1>              (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
+    printf("  -fa, --flash-attn <0|1>                   (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
+    printf("  -mmp, --mmap <0|1>                        (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
+    printf("  --numa <distribute|isolate|numactl>       (default: disabled)\n");
+    printf("  -embd, --embeddings <0|1>                 (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
+    printf("  -ts, --tensor-split <ts0/ts1/..>          (default: 0)\n");
+    printf("  -r, --repetitions <n>                     (default: %d)\n", cmd_params_defaults.reps);
+    printf("  --prio <0|1|2|3>                          (default: %d)\n", cmd_params_defaults.prio);
+    printf("  --delay <0...N> (seconds)                 (default: %d)\n", cmd_params_defaults.delay);
+    printf("  -o, --output <csv|json|jsonl|md|sql>      (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
+    printf("  -oe, --output-err <csv|json|jsonl|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
+    printf("  -v, --verbose                             (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
+    printf("  --progress                                (default: %s)\n", cmd_params_defaults.progress ? "1" : "0");
    printf("\n");
    printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
 }
@@ -303,6 +256,9 @@ static ggml_type ggml_type_from_name(const std::string & s) {
    if (s == "f16") {
        return GGML_TYPE_F16;
    }
+    if (s == "bf16") {
+        return GGML_TYPE_BF16;
+    }
    if (s == "q8_0") {
        return GGML_TYPE_Q8_0;
    }
@@ -338,6 +294,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
    params.output_format_stderr = cmd_params_defaults.output_format_stderr;
    params.reps = cmd_params_defaults.reps;
    params.numa = cmd_params_defaults.numa;
+    params.prio = cmd_params_defaults.prio;
+    params.delay = cmd_params_defaults.delay;
+    params.progress = cmd_params_defaults.progress;

    for (int i = 1; i < argc; i++) {
        arg = argv[i];
@@ -409,6 +368,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                }
                types.push_back(gt);
            }
+            if (invalid_param) {
+                break;
+            }
            params.type_k.insert(params.type_k.end(), types.begin(), types.end());
        } else if (arg == "-ctv" || arg == "--cache-type-v") {
            if (++i >= argc) {
@@ -425,6 +387,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                }
                types.push_back(gt);
            }
+            if (invalid_param) {
+                break;
+            }
            params.type_v.insert(params.type_v.end(), types.begin(), types.end());
        } else if (arg == "-t" || arg == "--threads") {
            if (++i >= argc) {
@@ -433,6 +398,27 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
            }
            auto p = string_split<int>(argv[i], split_delim);
            params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
+        } else if (arg == "-C" || arg == "--cpu-mask") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<std::string>(argv[i], split_delim);
+            params.cpu_mask.insert(params.cpu_mask.end(), p.begin(), p.end());
+        } else if (arg == "--cpu-strict") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<bool>(argv[i], split_delim);
+            params.cpu_strict.insert(params.cpu_strict.end(), p.begin(), p.end());
+        } else if (arg == "--poll") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<int>(argv[i], split_delim);
+            params.poll.insert(params.poll.end(), p.begin(), p.end());
        } else if (arg == "-ngl" || arg == "--n-gpu-layers") {
            if (++i >= argc) {
                invalid_param = true;
@@ -440,7 +426,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
            }
            auto p = string_split<int>(argv[i], split_delim);
            params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
-        } else if (arg == "-rpc" || arg == "--rpc") {
+        } else if (llama_supports_rpc() && (arg == "-rpc" || arg == "--rpc")) {
            if (++i >= argc) {
                invalid_param = true;
                break;
@@ -467,6 +453,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                }
                modes.push_back(mode);
            }
+            if (invalid_param) {
+                break;
+            }
            params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end());
        } else if (arg == "-mg" || arg == "--main-gpu") {
            if (++i >= argc) {
@@ -541,6 +530,18 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                break;
            }
            params.reps = std::stoi(argv[i]);
+        } else if (arg == "--prio") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.prio = (enum ggml_sched_priority) std::stoi(argv[i]);
+        } else if (arg == "--delay") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.delay = std::stoi(argv[i]);
        } else if (arg == "-o" || arg == "--output") {
            if (++i >= argc) {
                invalid_param = true;
@@ -555,6 +556,8 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
            invalid_param = !output_format_from_str(argv[i], params.output_format_stderr);
        } else if (arg == "-v" || arg == "--verbose") {
            params.verbose = true;
+        } else if (arg == "--progress") {
+            params.progress = true;
        } else {
            invalid_param = true;
            break;
@@ -585,6 +588,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
    if (params.use_mmap.empty())     { params.use_mmap = cmd_params_defaults.use_mmap; }
    if (params.embeddings.empty())   { params.embeddings = cmd_params_defaults.embeddings; }
    if (params.n_threads.empty())    { params.n_threads = cmd_params_defaults.n_threads; }
+    if (params.cpu_mask.empty())     { params.cpu_mask  = cmd_params_defaults.cpu_mask;  }
+    if (params.cpu_strict.empty())   { params.cpu_strict = cmd_params_defaults.cpu_strict; }
+    if (params.poll.empty())         { params.poll = cmd_params_defaults.poll; }

    return params;
 }
@@ -598,6 +604,9 @@ struct cmd_params_instance {
    ggml_type type_k;
    ggml_type type_v;
    int n_threads;
+    std::string cpu_mask;
+    bool cpu_strict;
+    int poll;
    int n_gpu_layers;
    std::string rpc_servers;
    llama_split_mode split_mode;
@@ -667,7 +676,10 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
    for (const auto & tv : params.type_v)
    for (const auto & nkvo : params.no_kv_offload)
    for (const auto & fa : params.flash_attn)
-    for (const auto & nt : params.n_threads) {
+    for (const auto & nt : params.n_threads)
+    for (const auto & cm : params.cpu_mask)
+    for (const auto & cs : params.cpu_strict)
+    for (const auto & pl : params.poll) {
        for (const auto & n_prompt : params.n_prompt) {
            if (n_prompt == 0) {
                continue;
@@ -681,6 +693,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .type_k       = */ tk,
                /* .type_v       = */ tv,
                /* .n_threads    = */ nt,
+                /* .cpu_mask     = */ cm,
+                /* .cpu_strict   = */ cs,
+                /* .poll         = */ pl,
                /* .n_gpu_layers = */ nl,
                /* .rpc_servers  = */ rpc,
                /* .split_mode   = */ sm,
@@ -707,6 +722,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .type_k       = */ tk,
                /* .type_v       = */ tv,
                /* .n_threads    = */ nt,
+                /* .cpu_mask     = */ cm,
+                /* .cpu_strict   = */ cs,
+                /* .poll         = */ pl,
                /* .n_gpu_layers = */ nl,
                /* .rpc_servers  = */ rpc,
                /* .split_mode   = */ sm,
@@ -733,6 +751,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .type_k       = */ tk,
                /* .type_v       = */ tv,
                /* .n_threads    = */ nt,
+                /* .cpu_mask     = */ cm,
+                /* .cpu_strict   = */ cs,
+                /* .poll         = */ pl,
                /* .n_gpu_layers = */ nl,
                /* .rpc_servers  = */ rpc,
                /* .split_mode   = */ sm,
@@ -753,13 +774,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
 struct test {
    static const std::string build_commit;
    static const int build_number;
-    static const bool cuda;
-    static const bool vulkan;
-    static const bool kompute;
-    static const bool metal;
-    static const bool sycl;
-    static const bool gpu_blas;
-    static const bool blas;
    static const std::string cpu_info;
    static const std::string gpu_info;
    std::string model_filename;
@@ -769,7 +783,9 @@ struct test {
    int n_batch;
    int n_ubatch;
    int n_threads;
-    bool has_rpc;
+    std::string cpu_mask;
+    bool cpu_strict;
+    int poll;
    ggml_type type_k;
    ggml_type type_v;
    int n_gpu_layers;
@@ -795,7 +811,9 @@ struct test {
        n_batch = inst.n_batch;
        n_ubatch = inst.n_ubatch;
        n_threads = inst.n_threads;
-        has_rpc = !inst.rpc_servers.empty();
+        cpu_mask = inst.cpu_mask;
+        cpu_strict = inst.cpu_strict;
+        poll = inst.poll;
        type_k = inst.type_k;
        type_v = inst.type_v;
        n_gpu_layers = inst.n_gpu_layers;
@@ -840,45 +858,31 @@ struct test {
    }

    static std::string get_backend() {
-        if (cuda) {
-            return GGML_CUDA_NAME;
+        std::vector<std::string> backends;
+        for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
+            auto * reg = ggml_backend_reg_get(i);
+            std::string name = ggml_backend_reg_name(reg);
+            if (name != "CPU") {
+                backends.push_back(ggml_backend_reg_name(reg));
+            }
        }
-        if (vulkan) {
-            return "Vulkan";
-        }
-        if (kompute) {
-            return "Kompute";
-        }
-        if (metal) {
-            return "Metal";
-        }
-        if (sycl) {
-            return GGML_SYCL_NAME;
-        }
-        if (gpu_blas) {
-            return "GPU BLAS";
-        }
-        if (blas) {
-            return "BLAS";
-        }
-
-        return "CPU";
+        return backends.empty() ? "CPU" : join(backends, ",");
    }

    static const std::vector<std::string> & get_fields() {
        static const std::vector<std::string> fields = {
            "build_commit", "build_number",
-            "cuda", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas", "blas",
-            "cpu_info", "gpu_info",
+            "cpu_info", "gpu_info", "backends",
            "model_filename", "model_type", "model_size", "model_n_params",
            "n_batch", "n_ubatch",
-            "n_threads", "type_k", "type_v",
+            "n_threads", "cpu_mask", "cpu_strict", "poll",
+            "type_k", "type_v",
            "n_gpu_layers", "split_mode",
            "main_gpu", "no_kv_offload", "flash_attn",
            "tensor_split", "use_mmap", "embeddings",
            "n_prompt", "n_gen", "test_time",
            "avg_ns", "stddev_ns",
-            "avg_ts", "stddev_ts"
+            "avg_ts", "stddev_ts",
        };
        return fields;
    }
@@ -887,15 +891,15 @@ struct test {

    static field_type get_field_type(const std::string & field) {
        if (field == "build_number" || field == "n_batch" || field == "n_ubatch" ||
-            field == "n_threads" ||
+            field == "n_threads" || field == "poll" ||
            field == "model_size" || field == "model_n_params" ||
            field == "n_gpu_layers" || field == "main_gpu" ||
            field == "n_prompt" || field == "n_gen" ||
            field == "avg_ns" || field == "stddev_ns") {
            return INT;
        }
-        if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
-            field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
+        if (field == "f16_kv" || field == "no_kv_offload" ||
+            field == "cpu_strict" ||
            field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
            return BOOL;
        }
@@ -923,12 +927,11 @@ struct test {
        }
        std::vector<std::string> values = {
            build_commit, std::to_string(build_number),
-            std::to_string(cuda), std::to_string(vulkan), std::to_string(vulkan),
-            std::to_string(metal), std::to_string(sycl), std::to_string(has_rpc), std::to_string(gpu_blas), std::to_string(blas),
-            cpu_info, gpu_info,
+            cpu_info, gpu_info, get_backend(),
            model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
            std::to_string(n_batch), std::to_string(n_ubatch),
-            std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
+            std::to_string(n_threads), cpu_mask, std::to_string(cpu_strict), std::to_string(poll),
+            ggml_type_name(type_k), ggml_type_name(type_v),
            std::to_string(n_gpu_layers), split_mode_str(split_mode),
            std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn),
            tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
@@ -951,13 +954,6 @@ struct test {

 const std::string test::build_commit = LLAMA_COMMIT;
 const int         test::build_number = LLAMA_BUILD_NUMBER;
-const bool        test::cuda         = !!ggml_cpu_has_cuda();
-const bool        test::vulkan       = !!ggml_cpu_has_vulkan();
-const bool        test::kompute      = !!ggml_cpu_has_kompute();
-const bool        test::metal        = !!ggml_cpu_has_metal();
-const bool        test::gpu_blas     = !!ggml_cpu_has_gpublas();
-const bool        test::blas         = !!ggml_cpu_has_blas();
-const bool        test::sycl         = !!ggml_cpu_has_sycl();
 const std::string test::cpu_info     = get_cpu_info();
 const std::string test::gpu_info     = get_gpu_info();

@@ -996,38 +992,39 @@ struct csv_printer : public printer {
    }
 };

+
+static std::string escape_json(const std::string & value) {
+    std::string escaped;
+    for (auto c : value) {
+        if (c == '"') {
+            escaped += "\\\"";
+        } else if (c == '\\') {
+            escaped += "\\\\";
+        } else  if (c <= 0x1f) {
+            char buf[8];
+            snprintf(buf, sizeof(buf), "\\u%04x", c);
+            escaped += buf;
+        } else {
+            escaped += c;
+        }
+    }
+    return escaped;
+}
+
+static std::string format_json_value(const std::string & field, const std::string & value) {
+    switch (test::get_field_type(field)) {
+        case test::STRING:
+            return "\"" + escape_json(value) + "\"";
+        case test::BOOL:
+            return value == "0" ? "false" : "true";
+        default:
+            return value;
+    }
+}
+
 struct json_printer : public printer {
    bool first = true;

-    static std::string escape_json(const std::string & value) {
-        std::string escaped;
-        for (auto c : value) {
-            if (c == '"') {
-                escaped += "\\\"";
-            } else if (c == '\\') {
-                escaped += "\\\\";
-            } else  if (c <= 0x1f) {
-                char buf[8];
-                snprintf(buf, sizeof(buf), "\\u%04x", c);
-                escaped += buf;
-            } else {
-                escaped += c;
-            }
-        }
-        return escaped;
-    }
-
-    static std::string format_value(const std::string & field, const std::string & value) {
-        switch (test::get_field_type(field)) {
-            case test::STRING:
-                return "\"" + escape_json(value) + "\"";
-            case test::BOOL:
-                return value == "0" ? "false" : "true";
-            default:
-                return value;
-        }
-    }
-
    void print_header(const cmd_params & params) override {
        fprintf(fout, "[\n");
        (void) params;
@@ -1036,7 +1033,7 @@ struct json_printer : public printer {
    void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
        assert(fields.size() == values.size());
        for (size_t i = 0; i < fields.size(); i++) {
-            fprintf(fout, "    \"%s\": %s,\n", fields.at(i).c_str(), format_value(fields.at(i), values.at(i)).c_str());
+            fprintf(fout, "    \"%s\": %s,\n", fields.at(i).c_str(), format_json_value(fields.at(i), values.at(i)).c_str());
        }
    }

@@ -1059,6 +1056,25 @@ struct json_printer : public printer {
    }
 };

+
+struct jsonl_printer : public printer {
+    void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
+        assert(fields.size() == values.size());
+        for (size_t i = 0; i < fields.size(); i++) {
+            fprintf(fout, "\"%s\": %s, ", fields.at(i).c_str(), format_json_value(fields.at(i), values.at(i)).c_str());
+        }
+    }
+
+    void print_test(const test & t) override {
+        fprintf(fout, "{");
+        print_fields(test::get_fields(), t.get_values());
+        fprintf(fout, "\"samples_ns\": [ %s ],", join(t.samples_ns, ", ").c_str());
+        fprintf(fout, "\"samples_ts\": [ %s ]", join(t.get_ts(), ", ").c_str());
+        fprintf(fout, "}\n");
+        fflush(fout);
+    }
+};
+
 struct markdown_printer : public printer {
    std::vector<std::string> fields;

@@ -1067,7 +1083,7 @@ struct markdown_printer : public printer {
            return -30;
        }
        if (field == "t/s") {
-            return 16;
+            return 20;
        }
        if (field == "size" || field == "params") {
            return 10;
@@ -1142,13 +1158,23 @@ struct markdown_printer : public printer {
        fields.emplace_back("size");
        fields.emplace_back("params");
        fields.emplace_back("backend");
-        bool is_cpu_backend = test::get_backend() == "CPU" || test::get_backend() == "BLAS";
+        bool is_cpu_backend = test::get_backend().find("CPU") != std::string::npos ||
+                              test::get_backend().find("BLAS") != std::string::npos;
        if (!is_cpu_backend) {
            fields.emplace_back("n_gpu_layers");
        }
        if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {
            fields.emplace_back("n_threads");
        }
+        if (params.cpu_mask.size() > 1 || params.cpu_mask != cmd_params_defaults.cpu_mask) {
+            fields.emplace_back("cpu_mask");
+        }
+        if (params.cpu_strict.size() > 1 || params.cpu_strict != cmd_params_defaults.cpu_strict) {
+            fields.emplace_back("cpu_strict");
+        }
+        if (params.poll.size() > 1 || params.poll != cmd_params_defaults.poll) {
+            fields.emplace_back("poll");
+        }
        if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
            fields.emplace_back("n_batch");
        }
@@ -1223,9 +1249,6 @@ struct markdown_printer : public printer {
                value = buf;
            } else if (field == "backend") {
                value = test::get_backend();
-                if (t.has_rpc) {
-                    value += "+RPC";
-                }
            } else if (field == "test") {
                if (t.n_prompt > 0 && t.n_gen == 0) {
                    snprintf(buf, sizeof(buf), "pp%d", t.n_prompt);
@@ -1298,7 +1321,7 @@ struct sql_printer : public printer {
    }
 };

-static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) {
+static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_threads) {
    llama_set_n_threads(ctx, n_threads, n_threads);

    const llama_model * model = llama_get_model(ctx);
@@ -1314,14 +1337,14 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_bat
        for (int i = 1; i < n_tokens; i++) {
            tokens[i] = std::rand() % n_vocab;
        }
-        llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens, n_past + n_processed, 0));
+        llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens));
        n_processed += n_tokens;
    }

    llama_synchronize(ctx);
 }

-static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) {
+static void test_gen(llama_context * ctx, int n_gen, int n_threads) {
    llama_set_n_threads(ctx, n_threads, n_threads);

    const llama_model * model = llama_get_model(ctx);
@@ -1330,7 +1353,7 @@ static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads)
    llama_token token = llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab;

    for (int i = 0; i < n_gen; i++) {
-        llama_decode(ctx, llama_batch_get_one(&token, 1, n_past + i, 0));
+        llama_decode(ctx, llama_batch_get_one(&token, 1));
        llama_synchronize(ctx);
        token = std::rand() % n_vocab;
    }
@@ -1350,6 +1373,8 @@ static std::unique_ptr<printer> create_printer(output_formats format) {
            return std::unique_ptr<printer>(new csv_printer());
        case JSON:
            return std::unique_ptr<printer>(new json_printer());
+        case JSONL:
+            return std::unique_ptr<printer>(new jsonl_printer());
        case MARKDOWN:
            return std::unique_ptr<printer>(new markdown_printer());
        case SQL:
@@ -1383,6 +1408,8 @@ int main(int argc, char ** argv) {
    llama_backend_init();
    llama_numa_init(params.numa);

+    set_process_priority(params.prio);
+
    // initialize printer
    std::unique_ptr<printer> p = create_printer(params.output_format);
    std::unique_ptr<printer> p_err = create_printer(params.output_format_stderr);
@@ -1402,7 +1429,13 @@ int main(int argc, char ** argv) {
    llama_model * lmodel = nullptr;
    const cmd_params_instance * prev_inst = nullptr;

+    int params_idx = 0;
+    auto params_count = params_instances.size();
    for (const auto & inst : params_instances) {
+        params_idx ++;
+        if (params.progress) {
+            fprintf(stderr, "llama-bench: benchmark %d/%ld: starting\n", params_idx, params_count);
+        }
        // keep the same model between tests when possible
        if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
            if (lmodel) {
@@ -1428,13 +1461,41 @@ int main(int argc, char ** argv) {

        llama_kv_cache_clear(ctx);

+        // cool off before the test
+        if (params.delay) {
+            std::this_thread::sleep_for(std::chrono::seconds(params.delay));
+        }
+
+        struct ggml_threadpool_params tpp = ggml_threadpool_params_default(t.n_threads);
+        if (!parse_cpu_mask(t.cpu_mask, tpp.cpumask)) {
+            fprintf(stderr, "%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str());
+            exit(1);
+        }
+        tpp.strict_cpu = t.cpu_strict;
+        tpp.poll       = t.poll;
+        tpp.prio       = params.prio;
+
+        struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
+        if (!threadpool) {
+            fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
+            exit(1);
+        }
+
+        llama_attach_threadpool(ctx, threadpool, NULL);
+
        // warmup run
        if (t.n_prompt > 0) {
+            if (params.progress) {
+                fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup prompt run\n", params_idx, params_count);
+            }
            //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
-            test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
+            test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
        }
        if (t.n_gen > 0) {
-            test_gen(ctx, 1, 0, t.n_threads);
+            if (params.progress) {
+                fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup generation run\n", params_idx, params_count);
+            }
+            test_gen(ctx, 1, t.n_threads);
        }

        for (int i = 0; i < params.reps; i++) {
@@ -1443,10 +1504,16 @@ int main(int argc, char ** argv) {
            uint64_t t_start = get_time_ns();

            if (t.n_prompt > 0) {
-                test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
+                if (params.progress) {
+                    fprintf(stderr, "llama-bench: benchmark %d/%ld: prompt run %d/%d\n", params_idx, params_count, i + 1, params.reps);
+                }
+                test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
            }
            if (t.n_gen > 0) {
-                test_gen(ctx, t.n_gen, t.n_prompt, t.n_threads);
+                if (params.progress) {
+                    fprintf(stderr, "llama-bench: benchmark %d/%ld: generation run %d/%d\n", params_idx, params_count, i + 1, params.reps);
+                }
+                test_gen(ctx, t.n_gen, t.n_threads);
            }

            uint64_t t_ns = get_time_ns() - t_start;
@@ -1463,9 +1530,11 @@ int main(int argc, char ** argv) {
            fflush(p_err->fout);
        }

-        llama_print_timings(ctx);
+        llama_perf_context_print(ctx);

        llama_free(ctx);
+
+        ggml_threadpool_free(threadpool);
    }

    llama_free_model(lmodel);
--- a/examples/llama.android/llama/build.gradle.kts
+++ b/examples/llama.android/llama/build.gradle.kts
@@ -18,6 +18,7 @@ android {
        }
        externalNativeBuild {
            cmake {
+                arguments += "-DLLAMA_BUILD_COMMON=ON"
                arguments += "-DCMAKE_BUILD_TYPE=Release"
                cppFlags += listOf()
                arguments += listOf()
--- a/examples/llama.android/llama/src/main/cpp/llama-android.cpp
+++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp
@@ -120,8 +120,8 @@ Java_android_llama_cpp_LLamaAndroid_new_1context(JNIEnv *env, jobject, jlong jmo
    LOGi("Using %d threads", n_threads);

    llama_context_params ctx_params = llama_context_default_params();
-    ctx_params.seed  = 1234;
-    ctx_params.n_ctx = 2048;
+
+    ctx_params.n_ctx           = 2048;
    ctx_params.n_threads       = n_threads;
    ctx_params.n_threads_batch = n_threads;

@@ -186,11 +186,11 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
    for (nri = 0; nri < nr; nri++) {
        LOGi("Benchmark prompt processing (pp)");

-        llama_batch_clear(*batch);
+        common_batch_clear(*batch);

        const int n_tokens = pp;
        for (i = 0; i < n_tokens; i++) {
-            llama_batch_add(*batch, 0, i, { 0 }, false);
+            common_batch_add(*batch, 0, i, { 0 }, false);
        }

        batch->logits[batch->n_tokens - 1] = true;
@@ -210,9 +210,9 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
        const auto t_tg_start = ggml_time_us();
        for (i = 0; i < tg; i++) {

-            llama_batch_clear(*batch);
+            common_batch_clear(*batch);
            for (j = 0; j < pl; j++) {
-                llama_batch_add(*batch, 0, i, { j }, true);
+                common_batch_add(*batch, 0, i, { j }, true);
            }

            LOGi("llama_decode() text generation: %d", i);
@@ -269,12 +269,6 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
    return env->NewStringUTF(result.str().c_str());
 }

-extern "C"
-JNIEXPORT void JNICALL
-Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
-    llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
-}
-
 extern "C"
 JNIEXPORT jlong JNICALL
 Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
@@ -289,9 +283,6 @@ Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens,
        nullptr,
        nullptr,
        nullptr,
-        0,
-        0,
-        0,
    };

    if (embd) {
@@ -311,6 +302,29 @@ Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens,
    return reinterpret_cast<jlong>(batch);
 }

+extern "C"
+JNIEXPORT void JNICALL
+Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
+    llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
+}
+
+extern "C"
+JNIEXPORT jlong JNICALL
+Java_android_llama_cpp_LLamaAndroid_new_1sampler(JNIEnv *, jobject) {
+    auto sparams = llama_sampler_chain_default_params();
+    sparams.no_perf = true;
+    llama_sampler * smpl = llama_sampler_chain_init(sparams);
+    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
+
+    return reinterpret_cast<jlong>(smpl);
+}
+
+extern "C"
+JNIEXPORT void JNICALL
+Java_android_llama_cpp_LLamaAndroid_free_1sampler(JNIEnv *, jobject, jlong sampler_pointer) {
+    llama_sampler_free(reinterpret_cast<llama_sampler *>(sampler_pointer));
+}
+
 extern "C"
 JNIEXPORT void JNICALL
 Java_android_llama_cpp_LLamaAndroid_backend_1init(JNIEnv *, jobject) {
@@ -340,7 +354,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init(
    const auto context = reinterpret_cast<llama_context *>(context_pointer);
    const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);

-    const auto tokens_list = llama_tokenize(context, text, 1);
+    const auto tokens_list = common_tokenize(context, text, 1);

    auto n_ctx = llama_n_ctx(context);
    auto n_kv_req = tokens_list.size() + (n_len - tokens_list.size());
@@ -352,14 +366,14 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init(
    }

    for (auto id : tokens_list) {
-        LOGi("%s", llama_token_to_piece(context, id).c_str());
+        LOGi("%s", common_token_to_piece(context, id).c_str());
    }

-    llama_batch_clear(*batch);
+    common_batch_clear(*batch);

    // evaluate the initial prompt
    for (auto i = 0; i < tokens_list.size(); i++) {
-        llama_batch_add(*batch, tokens_list[i], i, { 0 }, false);
+        common_batch_add(*batch, tokens_list[i], i, { 0 }, false);
    }

    // llama_decode will output logits only for the last token of the prompt
@@ -381,38 +395,28 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
        jobject,
        jlong context_pointer,
        jlong batch_pointer,
+        jlong sampler_pointer,
        jint n_len,
        jobject intvar_ncur
 ) {
    const auto context = reinterpret_cast<llama_context *>(context_pointer);
-    const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
+    const auto batch   = reinterpret_cast<llama_batch   *>(batch_pointer);
+    const auto sampler = reinterpret_cast<llama_sampler *>(sampler_pointer);
    const auto model = llama_get_model(context);

    if (!la_int_var) la_int_var = env->GetObjectClass(intvar_ncur);
    if (!la_int_var_value) la_int_var_value = env->GetMethodID(la_int_var, "getValue", "()I");
    if (!la_int_var_inc) la_int_var_inc = env->GetMethodID(la_int_var, "inc", "()V");

-    auto n_vocab = llama_n_vocab(model);
-    auto logits = llama_get_logits_ith(context, batch->n_tokens - 1);
-
-    std::vector<llama_token_data> candidates;
-    candidates.reserve(n_vocab);
-
-    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-        candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
-    }
-
-    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-
    // sample the most likely token
-    const auto new_token_id = llama_sample_token_greedy(context, &candidates_p);
+    const auto new_token_id = llama_sampler_sample(sampler, context, -1);

    const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
    if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
        return nullptr;
    }

-    auto new_token_chars = llama_token_to_piece(context, new_token_id);
+    auto new_token_chars = common_token_to_piece(context, new_token_id);
    cached_token_chars += new_token_chars;

    jstring new_token = nullptr;
@@ -424,8 +428,8 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
        new_token = env->NewStringUTF("");
    }

-    llama_batch_clear(*batch);
-    llama_batch_add(*batch, new_token_id, n_cur, { 0 }, true);
+    common_batch_clear(*batch);
+    common_batch_add(*batch, new_token_id, n_cur, { 0 }, true);

    env->CallVoidMethod(intvar_ncur, la_int_var_inc);

--- a/examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt
+++ b/examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt
@@ -45,8 +45,10 @@ class LLamaAndroid {
    private external fun free_context(context: Long)
    private external fun backend_init(numa: Boolean)
    private external fun backend_free()
-    private external fun free_batch(batch: Long)
    private external fun new_batch(nTokens: Int, embd: Int, nSeqMax: Int): Long
+    private external fun free_batch(batch: Long)
+    private external fun new_sampler(): Long
+    private external fun free_sampler(sampler: Long)
    private external fun bench_model(
        context: Long,
        model: Long,
@@ -69,6 +71,7 @@ class LLamaAndroid {
    private external fun completion_loop(
        context: Long,
        batch: Long,
+        sampler: Long,
        nLen: Int,
        ncur: IntVar
    ): String?
@@ -101,8 +104,11 @@ class LLamaAndroid {
                    val batch = new_batch(512, 0, 1)
                    if (batch == 0L) throw IllegalStateException("new_batch() failed")

+                    val sampler = new_sampler()
+                    if (sampler == 0L) throw IllegalStateException("new_sampler() failed")
+
                    Log.i(tag, "Loaded model $pathToModel")
-                    threadLocalState.set(State.Loaded(model, context, batch))
+                    threadLocalState.set(State.Loaded(model, context, batch, sampler))
                }
                else -> throw IllegalStateException("Model already loaded")
            }
@@ -114,7 +120,7 @@ class LLamaAndroid {
            is State.Loaded -> {
                val ncur = IntVar(completion_init(state.context, state.batch, message, nlen))
                while (ncur.value <= nlen) {
-                    val str = completion_loop(state.context, state.batch, nlen, ncur)
+                    val str = completion_loop(state.context, state.batch, state.sampler, nlen, ncur)
                    if (str == null) {
                        break
                    }
@@ -138,6 +144,7 @@ class LLamaAndroid {
                    free_context(state.context)
                    free_model(state.model)
                    free_batch(state.batch)
+                    free_sampler(state.sampler);

                    threadLocalState.set(State.Idle)
                }
@@ -161,7 +168,7 @@ class LLamaAndroid {

        private sealed interface State {
            data object Idle: State
-            data class Loaded(val model: Long, val context: Long, val batch: Long): State
+            data class Loaded(val model: Long, val context: Long, val batch: Long, val sampler: Long): State
        }

        // Enforce only one instance of Llm.
--- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@@ -24,6 +24,7 @@ func llama_batch_add(_ batch: inout llama_batch, _ id: llama_token, _ pos: llama
 actor LlamaContext {
    private var model: OpaquePointer
    private var context: OpaquePointer
+    private var sampling: UnsafeMutablePointer<llama_sampler>
    private var batch: llama_batch
    private var tokens_list: [llama_token]
    var is_done: Bool = false
@@ -42,9 +43,14 @@ actor LlamaContext {
        self.tokens_list = []
        self.batch = llama_batch_init(512, 0, 1)
        self.temporary_invalid_cchars = []
+        let sparams = llama_sampler_chain_default_params()
+        self.sampling = llama_sampler_chain_init(sparams)
+        llama_sampler_chain_add(self.sampling, llama_sampler_init_temp(0.4))
+        llama_sampler_chain_add(self.sampling, llama_sampler_init_dist(1234))
    }

    deinit {
+        llama_sampler_free(sampling)
        llama_batch_free(batch)
        llama_free(context)
        llama_free_model(model)
@@ -69,10 +75,9 @@ actor LlamaContext {
        print("Using \(n_threads) threads")

        var ctx_params = llama_context_default_params()
-        ctx_params.seed  = 1234
        ctx_params.n_ctx = 2048
-        ctx_params.n_threads       = UInt32(n_threads)
-        ctx_params.n_threads_batch = UInt32(n_threads)
+        ctx_params.n_threads       = Int32(n_threads)
+        ctx_params.n_threads_batch = Int32(n_threads)

        let context = llama_new_context_with_model(model, ctx_params)
        guard let context else {
@@ -144,20 +149,7 @@ actor LlamaContext {
    func completion_loop() -> String {
        var new_token_id: llama_token = 0

-        let n_vocab = llama_n_vocab(model)
-        let logits = llama_get_logits_ith(context, batch.n_tokens - 1)
-
-        var candidates = Array<llama_token_data>()
-        candidates.reserveCapacity(Int(n_vocab))
-
-        for token_id in 0..<n_vocab {
-            candidates.append(llama_token_data(id: token_id, logit: logits![Int(token_id)], p: 0.0))
-        }
-        candidates.withUnsafeMutableBufferPointer() { buffer in
-            var candidates_p = llama_token_data_array(data: buffer.baseAddress, size: buffer.count, sorted: false)
-
-            new_token_id = llama_sample_token_greedy(context, &candidates_p)
-        }
+        new_token_id = llama_sampler_sample(sampling, context, batch.n_tokens - 1)

        if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
            print("\n")
--- a/examples/llama.vim
+++ b/examples/llama.vim
@@ -1,135 +1,783 @@
-" Requires an already running llama.cpp server
-" To install either copy or symlink to ~/.vim/autoload/llama.vim
-" Then start with either :call llama#doLlamaGen(),
-" or add a keybind to your vimrc such as
-" nnoremap Z :call llama#doLlamaGen()<CR>
-" Similarly, you could add an insert mode keybind with
-" inoremap <C-B> <Cmd>call llama#doLlamaGen()<CR>
+" LLM-based text completion using llama.cpp
 "
-" g:llama_api_url, g:llama_api_key and g:llama_overrides can be configured in your .vimrc
-" let g:llama_api_url = "192.168.1.10:8080"
-" llama_overrides can also be set through buffer/window scopes. For instance
-" autocmd filetype python let b:llama_overrides = {"temp": 0.2}
-" Could be added to your .vimrc to automatically set a lower temperature when
-" editing a python script
-" Additionally, an override dict can be stored at the top of a file
-" !*{"stop": ["User:"]}
-" Could be added to the start of your chatlog.txt to set the stopping token
-" These parameter dicts are merged together from lowest to highest priority:
-" server default -> g:llama_overrides -> w:llama_overrides ->
-" b:llama_overrides -> in file (!*) overrides
+" requires:
+"
+"   - neovim or vim
+"   - curl
+"   - llama.cpp server instance
+"   - FIM-compatible model
+"
+" sample config:
+"
+"   - Tab       - accept the current suggestion
+"   - Shift+Tab - accept just the first line of the suggestion
+"   - Ctrl+F    - toggle FIM completion manually
+"
+" make symlink or copy this file to ~/.config/nvim/autoload/llama.vim
+"
+" start the llama.cpp server with a FIM-compatible model. for example:
+"
+"   $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa -dt 0.1 --ubatch-size 512 --batch-size 1024 --cache-reuse 256
+"
+"   --batch-size [512, model max context]
+"
+"     adjust the batch size to control how much of the provided local context will be used during the inference
+"     lower values will use smaller part of the context around the cursor, which will result in faster processing
+"
+"   --ubatch-size [64, 2048]
+"
+"     chunks the batch into smaller chunks for faster processing
+"     depends on the specific hardware. use llama-bench to profile and determine the best size
+"
+"   --cache-reuse (ge:llama_config.n_predict, 1024]
+"
+"     this should be either 0 (disabled) or strictly larger than g:llama_config.n_predict
+"     using non-zero value enables context reuse on the server side which dramatically improves the performance at
+"     large contexts. a value of 256 should be good for all cases
+"
+" run this once to initialise llama.vim:
+"
+"   :call llama#init()
+"
+" more info: https://github.com/ggerganov/llama.cpp/pull/9787
 "
-" Sublists (like logit_bias and stop) are overridden, not merged
-" Example override:
-" !*{"logit_bias": [[13, -5], [2, false]], "temperature": 1, "top_k": 5, "top_p": 0.5, "n_predict": 256, "repeat_last_n": 256, "repeat_penalty": 1.17647}
-if !exists("g:llama_api_url")
-    let g:llama_api_url= "127.0.0.1:8080"
-endif
-if !exists("g:llama_overrides")
-   let g:llama_overrides = {}
-endif
-const s:querydata = {"n_predict": 256, "stop": [ "\n" ], "stream": v:true }
-const s:curlcommand = ['curl','--data-raw', "{\"prompt\":\"### System:\"}", '--silent', '--no-buffer', '--request', 'POST', '--url', g:llama_api_url .. '/completion', '--header', "Content-Type: application/json"]
-let s:linedict = {}

-func s:callbackHandler(bufn, channel, msg)
-   if len(a:msg) < 3
-      return
-   elseif a:msg[0] == "d"
-      let l:msg = a:msg[6:-1]
-   else
-      let l:msg = a:msg
-   endif
-   let l:decoded_msg = json_decode(l:msg)
-   let l:newtext = split(l:decoded_msg['content'], "\n", 1)
-   if len(l:newtext) > 0
-      call setbufline(a:bufn, s:linedict[a:bufn], getbufline(a:bufn, s:linedict[a:bufn])[0] .. newtext[0])
-   else
-      echo "nothing genned"
-   endif
-   if len(newtext) > 1
-      let l:failed = appendbufline(a:bufn, s:linedict[a:bufn], newtext[1:-1])
-      let s:linedict[a:bufn] = s:linedict[a:bufn] + len(newtext)-1
-   endif
-   if has_key(l:decoded_msg, "stop") && l:decoded_msg.stop
-       echo "Finished generation"
-   endif
-endfunction
+" colors (adjust to your liking)
+highlight llama_hl_hint guifg=#ff772f ctermfg=202
+highlight llama_hl_info guifg=#77ff2f ctermfg=119

-func llama#doLlamaGen()
-   if exists("b:job")
-      if job_status(b:job) == "run"
-         call job_stop(b:job)
-         return
-      endif
-   endif
+" general parameters:
+"
+"   endpoint:         llama.cpp server endpoint
+"   n_prefix:         number of lines before the cursor location to include in the local prefix
+"   n_suffix:         number of lines after  the cursor location to include in the local suffix
+"   n_predict:        max number of tokens to predict
+"   t_max_prompt_ms:  max alloted time for the prompt processing (TODO: not yet supported)
+"   t_max_predict_ms: max alloted time for the prediction
+"   show_info:        show extra info about the inference (0 - disabled, 1 - statusline, 2 - inline)
+"   auto_fim:         trigger FIM completion automatically on cursor movement
+"   max_line_suffix:  do not auto-trigger FIM completion if there are more than this number of characters to the right of the cursor
+"
+" ring buffer of chunks, accumulated with time upon:
+"
+"  - completion request
+"  - yank
+"  - entering a buffer
+"  - leaving a buffer
+"  - writing a file
+"
+" parameters for the ring-buffer with extra context:
+"
+"   ring_n_chunks:    max number of chunks to pass as extra context to the server (0 to disable)
+"   ring_chunk_size:  max size of the chunks (in number of lines)
+"                     note: adjust these numbers so that you don't overrun your context
+"                           at ring_n_chunks = 64 and ring_chunk_size = 64 you need ~32k context
+"   ring_scope:       the range around the cursor position (in number of lines) for gathering chunks after FIM
+"   ring_update_ms:   how often to process queued chunks in normal mode
+"
+let s:default_config = {
+    \ 'endpoint':         'http://127.0.0.1:8012/infill',
+    \ 'n_prefix':         256,
+    \ 'n_suffix':         64,
+    \ 'n_predict':        128,
+    \ 't_max_prompt_ms':  500,
+    \ 't_max_predict_ms': 3000,
+    \ 'show_info':        2,
+    \ 'auto_fim':         v:true,
+    \ 'max_line_suffix':  8,
+    \ 'ring_n_chunks':    64,
+    \ 'ring_chunk_size':  64,
+    \ 'ring_scope':       1024,
+    \ 'ring_update_ms':   1000,
+    \ }

-   let l:cbuffer = bufnr("%")
-   let s:linedict[l:cbuffer] = line('$')
-   let l:buflines = getbufline(l:cbuffer, 1, 1000)
-   let l:querydata = copy(s:querydata)
-   call extend(l:querydata, g:llama_overrides)
-   if exists("w:llama_overrides")
-      call extend(l:querydata, w:llama_overrides)
-   endif
-   if exists("b:llama_overrides")
-      call extend(l:querydata, b:llama_overrides)
-   endif
-   if l:buflines[0][0:1] == '!*'
-      let l:userdata = json_decode(l:buflines[0][2:-1])
-      call extend(l:querydata, l:userdata)
-      let l:buflines = l:buflines[1:-1]
-   endif
-   let l:querydata.prompt = join(l:buflines, "\n")
-   let l:curlcommand = copy(s:curlcommand)
-   if exists("g:llama_api_key")
-       call extend(l:curlcommand, ['--header', 'Authorization: Bearer ' .. g:llama_api_key])
-   endif
-   let l:curlcommand[2] = json_encode(l:querydata)
-   let b:job = job_start(l:curlcommand, {"callback": function("s:callbackHandler", [l:cbuffer])})
-endfunction
+let g:llama_config = get(g:, 'llama_config', s:default_config)

-" Echos the tokkenization of the provided string , or cursor to end of word
-" Onus is placed on the user to include the preceding space
-func llama#tokenizeWord(...)
-    if (a:0 > 0)
-        let l:input = a:1
-    else
-        exe "normal \"*ye"
-        let l:input = @*
-    endif
-    let l:querydata = {"content": l:input}
-    let l:curlcommand = copy(s:curlcommand)
-    let l:curlcommand[2] = json_encode(l:querydata)
-    let l:curlcommand[8] = g:llama_api_url .. "/tokenize"
-   let s:token_job = job_start(l:curlcommand, {"callback": function("s:tokenizeWordCallback", [l:input])})
-endfunction
-
-func s:tokenizeWordCallback(plaintext, channel, msg)
-    echo '"' .. a:plaintext ..'" - ' .. string(json_decode(a:msg).tokens)
-endfunction
-
-
-" Echos the token count of the entire buffer (or provided string)
-" Example usage :echo llama#tokenCount()
-func llama#tokenCount(...)
-    if (a:0 > 0)
-        let l:buflines = a:1
-    else
-        let l:buflines = getline(1,1000)
-        if l:buflines[0][0:1] == '!*'
-            let l:buflines = l:buflines[1:-1]
+function! s:get_indent(str)
+    let l:count = 0
+    for i in range(len(a:str))
+        if a:str[i] == "\t"
+            let l:count += &tabstop - 1
+        else
+            break
        endif
-        let l:buflines = join(l:buflines, "\n")
-    endif
-    let l:querydata = {"content": l:buflines}
-    let l:curlcommand = copy(s:curlcommand)
-    let l:curlcommand[2] = json_encode(l:querydata)
-    let l:curlcommand[8] = g:llama_api_url .. "/tokenize"
-   let s:token_job = job_start(l:curlcommand, {"callback": "s:tokenCountCallback"})
+    endfor
+    return l:count
 endfunction

-func s:tokenCountCallback(channel, msg)
-    let resp = json_decode(a:msg)
-    echo len(resp.tokens)
+function! s:rand(i0, i1) abort
+    return a:i0 + rand() % (a:i1 - a:i0 + 1)
+endfunction
+
+function! llama#init()
+    if !executable('curl')
+        echohl WarningMsg
+        echo 'llama.vim requires the "curl" command to be available'
+        echohl None
+        return
+    endif
+
+    let s:pos_x = 0 " cursor position upon start of completion
+    let s:pos_y = 0
+
+    let s:line_cur = ''
+
+    let s:line_cur_prefix = ''
+    let s:line_cur_suffix = ''
+
+    let s:ring_chunks = [] " current set of chunks used as extra context
+    let s:ring_queued = [] " chunks that are queued to be sent for processing
+    let s:ring_n_evict = 0
+
+    let s:hint_shown = v:false
+    let s:pos_y_pick = -9999 " last y where we picked a chunk
+    let s:pos_dx = 0
+    let s:content = []
+    let s:can_accept = v:false
+
+    let s:timer_fim = -1
+    let s:t_fim_start = reltime() " used to measure total FIM time
+    let s:t_last_move = reltime() " last time the cursor moved
+
+    let s:current_job = v:null
+
+    let s:ghost_text_nvim = exists('*nvim_buf_get_mark')
+    let s:ghost_text_vim = has('textprop')
+
+    if s:ghost_text_vim
+        let s:hlgroup_hint = 'llama_hl_hint'
+        let s:hlgroup_info = 'llama_hl_info'
+
+        if empty(prop_type_get(s:hlgroup_hint))
+            call prop_type_add(s:hlgroup_hint, {'highlight': s:hlgroup_hint})
+        endif
+        if empty(prop_type_get(s:hlgroup_info))
+            call prop_type_add(s:hlgroup_info, {'highlight': s:hlgroup_info})
+        endif
+    endif
+
+    augroup llama
+        autocmd!
+        autocmd InsertEnter     * inoremap <expr> <silent> <C-F> llama#fim_inline(v:false)
+        autocmd InsertLeavePre  * call llama#fim_cancel()
+
+        autocmd CursorMoved     * call s:on_move()
+        autocmd CursorMovedI    * call s:on_move()
+        autocmd CompleteChanged * call llama#fim_cancel()
+
+        if g:llama_config.auto_fim
+            autocmd CursorMovedI * call llama#fim(v:true)
+        endif
+
+        " gather chunks upon yanking
+        autocmd TextYankPost    * if v:event.operator ==# 'y' | call s:pick_chunk(v:event.regcontents, v:false, v:true) | endif
+
+        " gather chunks upon entering/leaving a buffer
+        autocmd BufEnter        * call timer_start(100, {-> s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true)})
+        autocmd BufLeave        * call                      s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true)
+
+        " gather chunk upon saving the file
+        autocmd BufWritePost    * call s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true)
+    augroup END
+
+    silent! call llama#fim_cancel()
+
+    " init background update of the ring buffer
+    if g:llama_config.ring_n_chunks > 0
+        call s:ring_update()
+    endif
+endfunction
+
+" compute how similar two chunks of text are
+" 0 - no similarity, 1 - high similarity
+" TODO: figure out something better
+function! s:chunk_sim(c0, c1)
+    let l:lines0 = len(a:c0)
+    let l:lines1 = len(a:c1)
+
+    let l:common = 0
+
+    for l:line0 in a:c0
+        for l:line1 in a:c1
+            if l:line0 == l:line1
+                let l:common += 1
+                break
+            endif
+        endfor
+    endfor
+
+    return 2.0 * l:common / (l:lines0 + l:lines1)
+endfunction
+
+" pick a random chunk of size g:llama_config.ring_chunk_size from the provided text and queue it for processing
+"
+" no_mod   - do not pick chunks from buffers with pending changes
+" do_evict - evict chunks that are very similar to the new one
+"
+function! s:pick_chunk(text, no_mod, do_evict)
+    " do not pick chunks from buffers with pending changes or buffers that are not files
+    if a:no_mod && (getbufvar(bufnr('%'), '&modified') || !buflisted(bufnr('%')) || !filereadable(expand('%')))
+        return
+    endif
+
+    " if the extra context option is disabled - do nothing
+    if g:llama_config.ring_n_chunks <= 0
+        return
+    endif
+
+    " don't pick very small chunks
+    if len(a:text) < 3
+        return
+    endif
+
+    if len(a:text) + 1 < g:llama_config.ring_chunk_size
+        let l:chunk = a:text
+    else
+        let l:l0 = s:rand(0, max([0, len(a:text) - g:llama_config.ring_chunk_size/2]))
+        let l:l1 = min([l:l0 + g:llama_config.ring_chunk_size/2, len(a:text)])
+
+        let l:chunk = a:text[l:l0:l:l1]
+    endif
+
+    let l:chunk_str = join(l:chunk, "\n") . "\n"
+
+    " check if this chunk is already added
+    let l:exist = v:false
+
+    for i in range(len(s:ring_chunks))
+        if s:ring_chunks[i].data == l:chunk
+            let l:exist = v:true
+            break
+        endif
+    endfor
+
+    for i in range(len(s:ring_queued))
+        if s:ring_queued[i].data == l:chunk
+            let l:exist = v:true
+            break
+        endif
+    endfor
+
+    if l:exist
+        return
+    endif
+
+    " evict queued chunks that are very similar to the new one
+    for i in range(len(s:ring_queued) - 1, 0, -1)
+        if s:chunk_sim(s:ring_queued[i].data, l:chunk) > 0.9
+            if a:do_evict
+                call remove(s:ring_queued, i)
+                let s:ring_n_evict += 1
+            else
+                return
+            endif
+        endif
+    endfor
+
+    " also from s:ring_chunks
+    for i in range(len(s:ring_chunks) - 1, 0, -1)
+        if s:chunk_sim(s:ring_chunks[i].data, l:chunk) > 0.9
+            if a:do_evict
+                call remove(s:ring_chunks, i)
+                let s:ring_n_evict += 1
+            else
+                return
+            endif
+        endif
+    endfor
+
+    " TODO: become parameter ?
+    if len(s:ring_queued) == 16
+        call remove(s:ring_queued, 0)
+    endif
+
+    call add(s:ring_queued, {'data': l:chunk, 'str': l:chunk_str, 'time': reltime(), 'filename': expand('%')})
+
+    "let &statusline = 'extra context: ' . len(s:ring_chunks) . ' / ' . len(s:ring_queued)
+endfunction
+
+" picks a queued chunk, sends it for processing and adds it to s:ring_chunks
+" called every g:llama_config.ring_update_ms
+function! s:ring_update()
+    call timer_start(g:llama_config.ring_update_ms, {-> s:ring_update()})
+
+    " update only if in normal mode or if the cursor hasn't moved for a while
+    if mode() !=# 'n' && reltimefloat(reltime(s:t_last_move)) < 3.0
+        return
+    endif
+
+    if len(s:ring_queued) == 0
+        return
+    endif
+
+    " move the first queued chunk to the ring buffer
+    if len(s:ring_chunks) == g:llama_config.ring_n_chunks
+        call remove(s:ring_chunks, 0)
+    endif
+
+    call add(s:ring_chunks, remove(s:ring_queued, 0))
+
+    "let &statusline = 'updated context: ' . len(s:ring_chunks) . ' / ' . len(s:ring_queued)
+
+    " send asynchronous job with the new extra context so that it is ready for the next FIM
+    let l:extra_context = []
+    for l:chunk in s:ring_chunks
+        call add(l:extra_context, {
+            \ 'text':     l:chunk.str,
+            \ 'time':     l:chunk.time,
+            \ 'filename': l:chunk.filename
+            \ })
+    endfor
+
+    " no samplers needed here
+    let l:request = json_encode({
+        \ 'input_prefix':     "",
+        \ 'input_suffix':     "",
+        \ 'input_extra':      l:extra_context,
+        \ 'prompt':           "",
+        \ 'n_predict':        1,
+        \ 'temperature':      0.0,
+        \ 'stream':           v:false,
+        \ 'samplers':         ["temperature"],
+        \ 'cache_prompt':     v:true,
+        \ 't_max_prompt_ms':  1,
+        \ 't_max_predict_ms': 1
+        \ })
+
+    let l:curl_command = [
+        \ "curl",
+        \ "--silent",
+        \ "--no-buffer",
+        \ "--request", "POST",
+        \ "--url", g:llama_config.endpoint,
+        \ "--header", "Content-Type: application/json",
+        \ "--data", l:request
+        \ ]
+
+    " no callbacks because we don't need to process the response
+    if s:ghost_text_nvim
+        call jobstart(l:curl_command, {})
+    elseif s:ghost_text_vim
+        call job_start(l:curl_command, {})
+    endif
+endfunction
+
+" necessary for 'inoremap <expr>'
+function! llama#fim_inline(is_auto) abort
+    call llama#fim(a:is_auto)
+    return ''
+endfunction
+
+" the main FIM call
+" takes local context around the cursor and sends it together with the extra context to the server for completion
+function! llama#fim(is_auto) abort
+    " we already have a suggestion for the current cursor position
+    if s:hint_shown && !a:is_auto
+        call llama#fim_cancel()
+        return
+    endif
+
+    call llama#fim_cancel()
+
+    " avoid sending repeated requests too fast
+    if reltimefloat(reltime(s:t_fim_start)) < 0.6
+        if s:timer_fim != -1
+            call timer_stop(s:timer_fim)
+            let s:timer_fim = -1
+        endif
+
+        let s:t_fim_start = reltime()
+        let s:timer_fim = timer_start(600, {-> llama#fim(v:true)})
+        return
+    endif
+
+    let s:t_fim_start = reltime()
+
+    let s:content = []
+    let s:can_accept = v:false
+
+    let s:pos_x = col('.') - 1
+    let s:pos_y = line('.')
+    let l:max_y = line('$')
+
+    let l:lines_prefix = getline(max([1, s:pos_y - g:llama_config.n_prefix]), s:pos_y - 1)
+    let l:lines_suffix = getline(s:pos_y + 1, min([l:max_y, s:pos_y + g:llama_config.n_suffix]))
+
+    let s:line_cur = getline('.')
+
+    let s:line_cur_prefix = strpart(s:line_cur, 0, s:pos_x)
+    let s:line_cur_suffix = strpart(s:line_cur, s:pos_x)
+
+    if a:is_auto && len(s:line_cur_suffix) > g:llama_config.max_line_suffix
+        return
+    endif
+
+    let l:prefix = ""
+        \ . join(l:lines_prefix, "\n")
+        \ . "\n"
+
+    let l:prompt = ""
+        \ . s:line_cur_prefix
+
+    let l:suffix = ""
+        \ . s:line_cur_suffix
+        \ . "\n"
+        \ . join(l:lines_suffix, "\n")
+        \ . "\n"
+
+    " prepare the extra context data
+    let l:extra_context = []
+    for l:chunk in s:ring_chunks
+        call add(l:extra_context, {
+            \ 'text':     l:chunk.str,
+            \ 'time':     l:chunk.time,
+            \ 'filename': l:chunk.filename
+            \ })
+    endfor
+
+    " the indentation of the current line
+    let l:indent = strlen(matchstr(s:line_cur_prefix, '^\s*'))
+
+    let l:request = json_encode({
+        \ 'input_prefix':     l:prefix,
+        \ 'input_suffix':     l:suffix,
+        \ 'input_extra':      l:extra_context,
+        \ 'prompt':           l:prompt,
+        \ 'n_predict':        g:llama_config.n_predict,
+        \ 'n_indent':         l:indent,
+        \ 'top_k':            40,
+        \ 'top_p':            0.99,
+        \ 'stream':           v:false,
+        \ 'samplers':         ["top_k", "top_p", "infill"],
+        \ 'cache_prompt':     v:true,
+        \ 't_max_prompt_ms':  g:llama_config.t_max_prompt_ms,
+        \ 't_max_predict_ms': g:llama_config.t_max_predict_ms
+        \ })
+
+    let l:curl_command = [
+        \ "curl",
+        \ "--silent",
+        \ "--no-buffer",
+        \ "--request", "POST",
+        \ "--url", g:llama_config.endpoint,
+        \ "--header", "Content-Type: application/json",
+        \ "--data", l:request
+        \ ]
+
+    if s:current_job != v:null
+        if s:ghost_text_nvim
+            call jobstop(s:current_job)
+        elseif s:ghost_text_vim
+            call job_stop(s:current_job)
+        endif
+    endif
+
+    " send the request asynchronously
+    if s:ghost_text_nvim
+        let s:current_job = jobstart(l:curl_command, {
+            \ 'on_stdout': function('s:fim_on_stdout', [s:pos_x, s:pos_y, a:is_auto]),
+            \ 'on_exit':   function('s:fim_on_exit'),
+            \ 'stdout_buffered': v:true
+            \ })
+    elseif s:ghost_text_vim
+        let s:current_job = job_start(l:curl_command, {
+            \ 'out_cb': function('s:fim_on_stdout', [s:pos_x, s:pos_y, a:is_auto]),
+            \ 'exit_cb':   function('s:fim_on_exit')
+            \ })
+    endif
+
+    " TODO: per-file location
+    let l:delta_y = abs(s:pos_y - s:pos_y_pick)
+
+    " gather some extra context nearby and process it in the background
+    " only gather chunks if the cursor has moved a lot
+    " TODO: something more clever? reranking?
+    if a:is_auto && l:delta_y > 32
+        " expand the prefix even further
+        call s:pick_chunk(getline(max([1,       s:pos_y - g:llama_config.ring_scope]), max([1,       s:pos_y - g:llama_config.n_prefix])), v:false, v:false)
+
+        " pick a suffix chunk
+        call s:pick_chunk(getline(min([l:max_y, s:pos_y + g:llama_config.n_suffix]),   min([l:max_y, s:pos_y + g:llama_config.n_suffix + g:llama_config.ring_chunk_size])), v:false, v:false)
+
+        let s:pos_y_pick = s:pos_y
+    endif
+endfunction
+
+" if first_line == v:true accept only the first line of the response
+function! llama#fim_accept(first_line)
+    " insert the suggestion at the cursor location
+    if s:can_accept && len(s:content) > 0
+        call setline(s:pos_y, s:line_cur[:(s:pos_x - 1)] . s:content[0])
+        if len(s:content) > 1
+            if !a:first_line
+                call append(s:pos_y, s:content[1:-1])
+            endif
+        endif
+
+        " move the cursor to the end of the accepted text
+        if !a:first_line && len(s:content) > 1
+            call cursor(s:pos_y + len(s:content) - 1, s:pos_x + s:pos_dx + 1)
+        else
+            call cursor(s:pos_y, s:pos_x + len(s:content[0]))
+        endif
+    endif
+
+    call llama#fim_cancel()
+endfunction
+
+function! llama#fim_cancel()
+    let s:hint_shown = v:false
+
+    " clear the virtual text
+    let l:bufnr = bufnr('%')
+
+    if s:ghost_text_nvim
+        let l:id_vt_fim = nvim_create_namespace('vt_fim')
+        call nvim_buf_clear_namespace(l:bufnr, l:id_vt_fim,  0, -1)
+    elseif s:ghost_text_vim
+        call prop_remove({'type': s:hlgroup_hint, 'all': v:true})
+        call prop_remove({'type': s:hlgroup_info, 'all': v:true})
+    endif
+
+    " remove the mappings
+    silent! iunmap <buffer> <Tab>
+    silent! iunmap <buffer> <S-Tab>
+    silent! iunmap <buffer> <Esc>
+endfunction
+
+function! s:on_move()
+    let s:t_last_move = reltime()
+
+    call llama#fim_cancel()
+endfunction
+
+" callback that processes the FIM result from the server and displays the suggestion
+function! s:fim_on_stdout(pos_x, pos_y, is_auto, job_id, data, event = v:null)
+    if s:ghost_text_nvim
+        let l:raw = join(a:data, "\n")
+    elseif s:ghost_text_vim
+        let l:raw = a:data
+    endif
+
+    if len(l:raw) == 0
+        return
+    endif
+
+    if a:pos_x != col('.') - 1 || a:pos_y != line('.')
+        return
+    endif
+
+    " show the suggestion only in insert mode
+    if mode() !=# 'i'
+        return
+    endif
+
+    let s:pos_x = a:pos_x
+    let s:pos_y = a:pos_y
+
+    let s:can_accept = v:true
+    let l:has_info   = v:false
+
+    if s:can_accept && v:shell_error
+        if !a:is_auto
+            call add(s:content, "<| curl error: is the server on? |>")
+        endif
+        let s:can_accept = v:false
+    endif
+
+    let l:n_prompt    = 0
+    let l:t_prompt_ms = 1.0
+    let l:s_prompt    = 0
+
+    let l:n_predict    = 0
+    let l:t_predict_ms = 1.0
+    let l:s_predict    = 0
+
+    " get the generated suggestion
+    if s:can_accept
+        let l:response = json_decode(l:raw)
+
+        for l:part in split(get(l:response, 'content', ''), "\n", 1)
+            call add(s:content, l:part)
+        endfor
+
+        " remove trailing new lines
+        while len(s:content) > 0 && s:content[-1] == ""
+            call remove(s:content, -1)
+        endwhile
+
+        let l:generation_settings = get(l:response, 'generation_settings', {})
+        let l:n_ctx = get(l:generation_settings, 'n_ctx', 0)
+
+        let l:n_cached  = get(l:response, 'tokens_cached', 0)
+        let l:truncated = get(l:response, 'truncated', v:false)
+
+        " if response.timings is available
+        if len(get(l:response, 'timings', {})) > 0
+            let l:has_info = v:true
+            let l:timings  = get(l:response, 'timings', {})
+
+            let l:n_prompt    = get(l:timings, 'prompt_n', 0)
+            let l:t_prompt_ms = get(l:timings, 'prompt_ms', 1)
+            let l:s_prompt    = get(l:timings, 'prompt_per_second', 0)
+
+            let l:n_predict    = get(l:timings, 'predicted_n', 0)
+            let l:t_predict_ms = get(l:timings, 'predicted_ms', 1)
+            let l:s_predict    = get(l:timings, 'predicted_per_second', 0)
+        endif
+    endif
+
+    if len(s:content) == 0
+        call add(s:content, "")
+        let s:can_accept = v:false
+    endif
+
+    if len(s:content) == 0
+        return
+    endif
+
+    " NOTE: the following is logic for discarding predictions that repeat existing text
+    "       the code is quite ugly and there is very likely a simpler and more canonical way to implement this
+    "
+    "       still, I wonder if there is some better way that avoids having to do these special hacks?
+    "       on one hand, the LLM 'sees' the contents of the file before we start editing, so it is normal that it would
+    "       start generating whatever we have given it via the extra context. but on the other hand, it's not very
+    "       helpful to re-generate the same code that is already there
+
+    " truncate the suggestion if the first line is empty
+    if len(s:content) == 1 && s:content[0] == ""
+        let s:content = [""]
+    endif
+
+    " ... and the next lines are repeated
+    if len(s:content) > 1 && s:content[0] == "" && s:content[1:] == getline(s:pos_y + 1, s:pos_y + len(s:content) - 1)
+        let s:content = [""]
+    endif
+
+    " truncate the suggestion if it repeats the suffix
+    if len(s:content) == 1 && s:content[0] == s:line_cur_suffix
+        let s:content = [""]
+    endif
+
+    " find the first non-empty line (strip whitespace)
+    let l:cmp_y = s:pos_y + 1
+    while l:cmp_y < line('$') && getline(l:cmp_y) =~? '^\s*$'
+        let l:cmp_y += 1
+    endwhile
+
+    if (s:line_cur_prefix . s:content[0]) == getline(l:cmp_y)
+        " truncate the suggestion if it repeats the next line
+        if len(s:content) == 1
+            let s:content = [""]
+        endif
+
+        " ... or if the second line of the suggestion is the prefix of line l:cmp_y + 1
+        if len(s:content) == 2 && s:content[-1] == getline(l:cmp_y + 1)[:len(s:content[-1]) - 1]
+            let s:content = [""]
+        endif
+
+        " ... or if the middle chunk of lines of the suggestion is the same as [l:cmp_y + 1, l:cmp_y + len(s:content) - 1)
+        if len(s:content) > 2 && join(s:content[1:-1], "\n") == join(getline(l:cmp_y + 1, l:cmp_y + len(s:content) - 1), "\n")
+            let s:content = [""]
+        endif
+    endif
+
+    " keep only lines that have the same or larger whitespace prefix as s:line_cur_prefix
+    "let l:indent = strlen(matchstr(s:line_cur_prefix, '^\s*'))
+    "for i in range(1, len(s:content) - 1)
+    "    if strlen(matchstr(s:content[i], '^\s*')) < l:indent
+    "        let s:content = s:content[:i - 1]
+    "        break
+    "    endif
+    "endfor
+
+    let s:pos_dx = len(s:content[-1])
+
+    let s:content[-1] .= s:line_cur_suffix
+
+    call llama#fim_cancel()
+
+    " display virtual text with the suggestion
+    let l:bufnr = bufnr('%')
+
+    if s:ghost_text_nvim
+        let l:id_vt_fim = nvim_create_namespace('vt_fim')
+    endif
+
+    " construct the info message
+    if g:llama_config.show_info > 0 && l:has_info
+        let l:prefix = '   '
+
+        if l:truncated
+            let l:info = printf("%s | WARNING: the context is full: %d / %d, increase the server context size or reduce g:llama_config.ring_n_chunks",
+                \ g:llama_config.show_info == 2 ? l:prefix : 'llama.vim',
+                \ l:n_cached, l:n_ctx
+                \ )
+        else
+            let l:info = printf("%s | c: %d / %d, r: %d / %d, e: %d, q: %d / 16 | p: %d (%.2f ms, %.2f t/s) | g: %d (%.2f ms, %.2f t/s) | t: %.2f ms",
+                \ g:llama_config.show_info == 2 ? l:prefix : 'llama.vim',
+                \ l:n_cached,  l:n_ctx, len(s:ring_chunks), g:llama_config.ring_n_chunks, s:ring_n_evict, len(s:ring_queued),
+                \ l:n_prompt,  l:t_prompt_ms,  l:s_prompt,
+                \ l:n_predict, l:t_predict_ms, l:s_predict,
+                \ 1000.0 * reltimefloat(reltime(s:t_fim_start))
+                \ )
+        endif
+
+        if g:llama_config.show_info == 1
+            " display the info in the statusline
+            let &statusline = l:info
+            let l:info = ''
+        endif
+    endif
+
+    " display the suggestion and append the info to the end of the first line
+    if s:ghost_text_nvim
+        call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, s:pos_y - 1, s:pos_x - 1, {
+            \ 'virt_text': [[s:content[0], 'llama_hl_hint'], [l:info, 'llama_hl_info']],
+            \ 'virt_text_win_col': virtcol('.') - 1
+            \ })
+
+        call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, s:pos_y - 1, 0, {
+            \ 'virt_lines': map(s:content[1:], {idx, val -> [[val, 'llama_hl_hint']]}),
+            \ 'virt_text_win_col': virtcol('.')
+            \ })
+    elseif s:ghost_text_vim
+        let l:new_suffix = s:content[0]
+        if !empty(l:new_suffix)
+            call prop_add(s:pos_y, s:pos_x + 1, {
+                        \ 'type': s:hlgroup_hint,
+                        \ 'text': l:new_suffix
+                        \ })
+        endif
+        for line in s:content[1:]
+            call prop_add(s:pos_y, 0, {
+                        \ 'type': s:hlgroup_hint,
+                        \ 'text': line,
+                        \ 'text_padding_left': s:get_indent(line),
+                        \ 'text_align': 'below'
+                        \ })
+        endfor
+        if !empty(l:info)
+            call prop_add(s:pos_y, 0, {
+                        \ 'type': s:hlgroup_info,
+                        \ 'text': l:info,
+                        \ 'text_padding_left': col('$'),
+                        \ 'text_wrap': 'truncate'
+                        \ })
+        endif
+    endif
+
+    " setup accept shortcuts
+    inoremap <buffer> <Tab>   <C-O>:call llama#fim_accept(v:false)<CR>
+    inoremap <buffer> <S-Tab> <C-O>:call llama#fim_accept(v:true)<CR>
+
+    let s:hint_shown = v:true
+endfunction
+
+function! s:fim_on_exit(job_id, exit_code, event = v:null)
+    if a:exit_code != 0
+        echom "Job failed with exit code: " . a:exit_code
+    endif
+
+    let s:current_job = v:null
 endfunction
--- a/examples/llava/MobileVLM-README.md
+++ b/examples/llava/MobileVLM-README.md
@@ -39,7 +39,7 @@ python ./examples/llava/llava_surgery.py -m path/to/MobileVLM-1.7B
 3. Use `convert_image_encoder_to_gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:

 ```sh
-python ./examples/llava/convert_image_encoder_to_gguf \
+python ./examples/llava/convert_image_encoder_to_gguf.py \
    -m path/to/clip-vit-large-patch14-336 \
    --llava-projector path/to/MobileVLM-1.7B/llava.projector \
    --output-dir path/to/MobileVLM-1.7B \
@@ -47,7 +47,7 @@ python ./examples/llava/convert_image_encoder_to_gguf \
 ```

 ```sh
-python ./examples/llava/convert_image_encoder_to_gguf \
+python ./examples/llava/convert_image_encoder_to_gguf.py \
    -m path/to/clip-vit-large-patch14-336 \
    --llava-projector path/to/MobileVLM-1.7B_V2/llava.projector \
    --output-dir path/to/MobileVLM-1.7B_V2 \
@@ -57,12 +57,12 @@ python ./examples/llava/convert_image_encoder_to_gguf \
 4. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF:

 ```sh
-python ./examples/convert_legacy_llama.py path/to/MobileVLM-1.7B
+python ./examples/convert_legacy_llama.py path/to/MobileVLM-1.7B --skip-unknown
 ```

-5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k`
+5. Use `quantize` to convert LLaMA part's DataType from `fp32` to `q4_k`
 ```sh
-./llama-quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
+./llama-quantize path/to/MobileVLM-1.7B/ggml-model-F32.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
 ```

 Now both the LLaMA part and the image encoder is in the `MobileVLM-1.7B` directory.
--- a/examples/llava/README-minicpmv2.5.md
+++ b/examples/llava/README-minicpmv2.5.md
@@ -15,8 +15,8 @@ cd llama.cpp
 Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) by us)

 ```bash
-python ./examples/minicpmv/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
-python ./examples/minicpmv/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2
+python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
+python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2
 python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model

 # quantize int4 version
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -3,8 +3,8 @@
 // I'll gradually clean and extend it
 // Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
 #include "clip.h"
-#include "log.h"
 #include "ggml.h"
+#include "ggml-cpu.h"
 #include "ggml-alloc.h"
 #include "ggml-backend.h"

@@ -40,6 +40,11 @@
 #include <cinttypes>
 #include <limits>

+#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+#define LOG_DBG(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+
 //#define CLIP_DEBUG_FUNCTIONS

 // RGB uint8 image
@@ -165,7 +170,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
 static int get_key_idx(const gguf_context * ctx, const char * key) {
    int i = gguf_find_key(ctx, key);
    if (i == -1) {
-        LOG_TEE("key %s not found in file\n", key);
+        LOG_ERR("key %s not found in file\n", key);
        throw std::runtime_error(format("Missing required key: %s", key));
    }

@@ -270,7 +275,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {

 static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") {
    size_t tensor_size = ggml_nbytes(tensor);
-    LOG_TEE("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
+    LOG_INF("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
            prefix, ggml_n_dims(tensor), tensor->name, tensor_size,
            tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type));
 }
@@ -288,7 +293,7 @@ static projector_type clip_projector_type_from_string(const std::string & name)
 static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
    std::ofstream file(filename, std::ios::binary);
    if (!file.is_open()) {
-        LOG_TEE("Failed to open file for writing: %s\n", filename.c_str());
+        LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
        return;
    }

@@ -307,7 +312,7 @@ static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::s
 static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) {
    std::ofstream file(filename, std::ios::binary);
    if (!file.is_open()) {
-        LOG_TEE("Failed to open file for writing: %s\n", filename.c_str());
+        LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
        return;
    }

@@ -568,7 +573,7 @@ struct clip_ctx {

 static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
    if (!ctx->has_vision_encoder) {
-        LOG_TEE("This gguf file seems to have no vision encoder\n");
+        LOG_ERR("This gguf file seems to have no vision encoder\n");
        return nullptr;
    }

@@ -582,7 +587,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
        if (load_image_size == nullptr) {
            load_image_size = clip_image_size_init();
        }
-        LOG_TEE("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height);
+        LOG_DBG("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height);
        image_size_width  = load_image_size->width;
        image_size_height = load_image_size->height;
        if (is_inf) {
@@ -1047,21 +1052,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        const int idx_name = gguf_find_key(ctx, KEY_NAME);
        if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug
            const std::string name = gguf_get_val_str(ctx, idx_name);
-            LOG_TEE("%s: model name:   %s\n", __func__, name.c_str());
+            LOG_INF("%s: model name:   %s\n", __func__, name.c_str());
        }
-        LOG_TEE("%s: description:  %s\n", __func__, description.c_str());
-        LOG_TEE("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
-        LOG_TEE("%s: alignment:    %zu\n", __func__, gguf_get_alignment(ctx));
-        LOG_TEE("%s: n_tensors:    %d\n", __func__, n_tensors);
-        LOG_TEE("%s: n_kv:         %d\n", __func__, n_kv);
-        LOG_TEE("%s: ftype:        %s\n", __func__, ftype_str.c_str());
-        LOG_TEE("\n");
+        LOG_INF("%s: description:  %s\n", __func__, description.c_str());
+        LOG_INF("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
+        LOG_INF("%s: alignment:    %zu\n", __func__, gguf_get_alignment(ctx));
+        LOG_INF("%s: n_tensors:    %d\n", __func__, n_tensors);
+        LOG_INF("%s: n_kv:         %d\n", __func__, n_kv);
+        LOG_INF("%s: ftype:        %s\n", __func__, ftype_str.c_str());
+        LOG_INF("\n");
    }
    const int n_tensors = gguf_get_n_tensors(ctx);

    // kv
    const int n_kv = gguf_get_n_kv(ctx);
-    LOG_TEE("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
+    LOG_INF("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
        __func__, n_kv, n_tensors, fname);
    {
        std::map<enum ggml_type, uint32_t> n_type;
@@ -1072,7 +1077,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            n_type[type]++;
        }

-        LOG_TEE("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
+        LOG_INF("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
        for (int i = 0; i < n_kv; i++) {
            const char * name           = gguf_get_key(ctx, i);
            const enum gguf_type type   = gguf_get_kv_type(ctx, i);
@@ -1088,7 +1093,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            }
            replace_all(value, "\n", "\\n");

-            LOG_TEE("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
+            LOG_INF("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
        }

        // print type counts
@@ -1097,7 +1102,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
                continue;
            }

-            LOG_TEE("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
+            LOG_INF("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
        }
    }

@@ -1112,7 +1117,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            size_t tensor_size = ggml_nbytes(cur);
            model_size += tensor_size;
            if (verbosity >= 3) {
-                LOG_TEE("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
+                LOG_INF("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
                       __func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
            }
        }
@@ -1139,27 +1144,27 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {

 #ifdef GGML_USE_CUDA
    new_clip->backend = ggml_backend_cuda_init(0);
-    LOG_TEE("%s: CLIP using CUDA backend\n", __func__);
+    LOG_INF("%s: CLIP using CUDA backend\n", __func__);
 #endif

 #ifdef GGML_USE_METAL
    new_clip->backend = ggml_backend_metal_init();
-    LOG_TEE("%s: CLIP using Metal backend\n", __func__);
+    LOG_INF("%s: CLIP using Metal backend\n", __func__);
 #endif

 #ifdef GGML_USE_CANN
    new_clip->backend = ggml_backend_cann_init(0);
-    LOG_TEE("%s: CLIP using CANN backend\n", __func__);
+    LOG_INF("%s: CLIP using CANN backend\n", __func__);
 #endif

 #ifdef GGML_USE_VULKAN
    new_clip->backend = ggml_backend_vk_init(0);
-    LOG_TEE("%s: CLIP using Vulkan backend\n", __func__);
+    LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
 #endif

    if (!new_clip->backend) {
        new_clip->backend = ggml_backend_cpu_init();
-        LOG_TEE("%s: CLIP using CPU backend\n", __func__);
+        LOG_INF("%s: CLIP using CPU backend\n", __func__);
    }

    // model size and capabilities
@@ -1194,16 +1199,16 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        new_clip->use_gelu = gguf_get_val_bool(ctx, idx);

        if (verbosity >= 1) {
-            LOG_TEE("%s: text_encoder:   %d\n", __func__, new_clip->has_text_encoder);
-            LOG_TEE("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
-            LOG_TEE("%s: llava_projector:  %d\n", __func__, new_clip->has_llava_projector);
-            LOG_TEE("%s: minicpmv_projector:  %d\n", __func__, new_clip->has_minicpmv_projector);
-            LOG_TEE("%s: model size:     %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
-            LOG_TEE("%s: metadata size:  %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
+            LOG_INF("%s: text_encoder:   %d\n", __func__, new_clip->has_text_encoder);
+            LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
+            LOG_INF("%s: llava_projector:  %d\n", __func__, new_clip->has_llava_projector);
+            LOG_INF("%s: minicpmv_projector:  %d\n", __func__, new_clip->has_minicpmv_projector);
+            LOG_INF("%s: model size:     %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
+            LOG_INF("%s: metadata size:  %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
        }
    }

-    LOG_TEE("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);
+    LOG_INF("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);

    // load tensors
    {
@@ -1216,7 +1221,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {

        new_clip->ctx_data = ggml_init(params);
        if (!new_clip->ctx_data) {
-            LOG_TEE("%s: ggml_init() failed\n", __func__);
+            LOG_ERR("%s: ggml_init() failed\n", __func__);
            clip_free(new_clip);
            gguf_free(ctx);
            return nullptr;
@@ -1224,7 +1229,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {

        auto fin = std::ifstream(fname, std::ios::binary);
        if (!fin) {
-            LOG_TEE("cannot open model file for loading tensors\n");
+            LOG_ERR("cannot open model file for loading tensors\n");
            clip_free(new_clip);
            gguf_free(ctx);
            return nullptr;
@@ -1246,7 +1251,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
            fin.seekg(offset, std::ios::beg);
            if (!fin) {
-                LOG_TEE("%s: failed to seek for tensor %s\n", __func__, name);
+                LOG_ERR("%s: failed to seek for tensor %s\n", __func__, name);
                clip_free(new_clip);
                gguf_free(ctx);
                return nullptr;
@@ -1317,23 +1322,23 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        }

        if (verbosity >= 2) {
-            LOG_TEE("\n%s: vision model hparams\n", __func__);
-            LOG_TEE("image_size         %d\n", hparams.image_size);
-            LOG_TEE("patch_size         %d\n", hparams.patch_size);
-            LOG_TEE("v_hidden_size      %d\n", hparams.hidden_size);
-            LOG_TEE("v_n_intermediate   %d\n", hparams.n_intermediate);
-            LOG_TEE("v_projection_dim   %d\n", hparams.projection_dim);
-            LOG_TEE("v_n_head           %d\n", hparams.n_head);
-            LOG_TEE("v_n_layer          %d\n", hparams.n_layer);
-            LOG_TEE("v_eps              %f\n", hparams.eps);
-            LOG_TEE("v_image_mean       %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
-            LOG_TEE("v_image_std        %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
-            LOG_TEE("v_image_grid_pinpoints: ");
+            LOG_INF("\n%s: vision model hparams\n", __func__);
+            LOG_INF("image_size         %d\n", hparams.image_size);
+            LOG_INF("patch_size         %d\n", hparams.patch_size);
+            LOG_INF("v_hidden_size      %d\n", hparams.hidden_size);
+            LOG_INF("v_n_intermediate   %d\n", hparams.n_intermediate);
+            LOG_INF("v_projection_dim   %d\n", hparams.projection_dim);
+            LOG_INF("v_n_head           %d\n", hparams.n_head);
+            LOG_INF("v_n_layer          %d\n", hparams.n_layer);
+            LOG_INF("v_eps              %f\n", hparams.eps);
+            LOG_INF("v_image_mean       %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
+            LOG_INF("v_image_std        %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
+            LOG_INF("v_image_grid_pinpoints: ");
            for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) {
-                LOG_TEE("%d ", hparams.image_grid_pinpoints[i]);
+                LOG_INF("%d ", hparams.image_grid_pinpoints[i]);
            }
-            LOG_TEE("\n");
-            LOG_TEE("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
+            LOG_INF("\n");
+            LOG_INF("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);

        }

@@ -1371,7 +1376,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            vision_model.patch_embeddings    = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
            vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
        } catch(const std::exception& /*e*/) {
-            LOG_TEE("%s: failed to load vision model tensors\n", __func__);
+            LOG_ERR("%s: failed to load vision model tensors\n", __func__);
        }

        // LLaVA projection
@@ -1400,7 +1405,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            } catch (std::runtime_error & /*e*/) { }
            try {
                vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE);
-                // LOG_TEE("%s: image_newline tensor (llava-1.6) found\n", __func__);
+                // LOG_INF("%s: image_newline tensor (llava-1.6) found\n", __func__);
            } catch (std::runtime_error & /*e*/) { }
        } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
            // MobileVLM projection
@@ -1501,7 +1506,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
        ggml_gallocr_reserve(new_clip->compute_alloc, gf);
        size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
-        LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
+        LOG_INF("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
    }

    return new_clip;
@@ -1552,7 +1557,7 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
    int nx, ny, nc;
    auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
    if (!data) {
-        LOG_TEE("%s: failed to load image '%s'\n", __func__, fname);
+        LOG_ERR("%s: failed to load image '%s'\n", __func__, fname);
        return false;
    }
    build_clip_img_from_data(data, nx, ny, img);
@@ -1564,7 +1569,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
    int nx, ny, nc;
    auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
    if (!data) {
-        LOG_TEE("%s: failed to decode image bytes\n", __func__);
+        LOG_ERR("%s: failed to decode image bytes\n", __func__);
        return false;
    }
    build_clip_img_from_data(data, nx, ny, img);
@@ -1623,7 +1628,7 @@ static void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32*
    }
 }

-inline float clip(float x, float lower, float upper) {
+inline int clip(int x, int lower, int upper) {
    return std::max(lower, std::min(x, upper));
 }

@@ -1754,7 +1759,7 @@ static std::pair<int, int> select_best_resolution(const std::pair<int, int> & or
        int downscaled_height = static_cast<int>(original_height * scale);
        int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
        int wasted_resolution = (width * height) - effective_resolution;
-        // LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
+        // LOG_INF("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
        if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
            max_effective_resolution = effective_resolution;
            min_wasted_resolution = wasted_resolution;
@@ -1827,10 +1832,6 @@ static std::pair<int, int> uhd_get_refine_size(std::pair<int, int> original_size
    return refine_size;
 }

-inline int clip(int x, int lower, int upper) {
-    return std::max(lower, std::min(x, upper));
-}
-
 static std::pair<int, int> uhd_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) {
    std::vector<int> candidate_split_grids_nums;
    for (int i : {multiple - 1, multiple, multiple + 1}) {
@@ -1876,7 +1877,7 @@ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_imag
    const int multiple = fmin(ceil(ratio), max_slice_nums);

    std::vector<std::vector<clip_image_u8 *>> images;
-    LOG_TEE("%s: multiple %d\n", __func__, multiple);
+    LOG_INF("%s: multiple %d\n", __func__, multiple);
    images.push_back(std::vector<clip_image_u8 *>());

    if (multiple <= 1) {
@@ -1891,17 +1892,17 @@ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_imag
        clip_image_u8 * source_image = clip_image_u8_init();
        bicubic_resize(*img, *source_image, best_size.first, best_size.second);
        // source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC)
-        LOG_TEE("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
+        LOG_INF("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
        images[images.size()-1].push_back(source_image);

        std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
-        LOG_TEE("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);
+        LOG_INF("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);

        auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true);
        clip_image_u8 * refine_image = clip_image_u8_init();
        bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second);

-        LOG_TEE("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);
+        LOG_INF("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);

        // split_to_patches
        int width = refine_image->nx;
@@ -1958,7 +1959,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
        int idx = 0;
        for (size_t i = 0; i < imgs.size(); ++i) {
            for (size_t j = 0; j < imgs[i].size(); ++j) {
-                LOG_TEE("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
+                LOG_DBG("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
                clip_image_f32 * res = clip_image_f32_init();
                normalize_image_u8_to_f32(imgs[i][j], res, ctx->image_mean, ctx->image_std);
                res_imgs->data[idx++] = *res;
@@ -1970,7 +1971,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli

    bool pad_to_square = true;
    if (!ctx->has_vision_encoder) {
-        LOG_TEE("This gguf file seems to have no vision encoder\n");
+        LOG_ERR("This gguf file seems to have no vision encoder\n");
        return false;
    }
    auto & params = ctx->vision_model.hparams;
@@ -2047,7 +2048,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
            }

            for (size_t i = 0; i < patches.size(); i++) {
-                // LOG_TEE("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
+                // LOG_DBG("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
                clip_image_u8_free(patches[i]);
            }

@@ -2283,7 +2284,7 @@ static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, co

 bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
    if (!ctx->has_vision_encoder) {
-        LOG_TEE("This gguf file seems to have no vision encoder\n");
+        LOG_ERR("This gguf file seems to have no vision encoder\n");
        return false;
    }

@@ -2295,7 +2296,7 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3

 bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
    if (!ctx->has_vision_encoder) {
-        LOG_TEE("This gguf file seems to have no vision encoder\n");
+        LOG_ERR("This gguf file seems to have no vision encoder\n");
        return false;
    }

@@ -2444,16 +2445,10 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
        ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
    }

-#ifdef GGML_USE_METAL
-    if (ggml_backend_is_metal(ctx->backend)) {
-        ggml_backend_metal_set_n_cb(ctx->backend, n_threads);
-    }
-#endif
-
    ggml_backend_graph_compute(ctx->backend, gf);

    // the last node is the embedding tensor
-    struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1];
+    struct ggml_tensor * embeddings = ggml_graph_node(gf, -1);

    // copy the embeddings to the location passed by the user
    ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
@@ -2525,7 +2520,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
            new_type = type;
            if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) {
                new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type
-                // LOG_TEE("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
+                // LOG_ERR("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
            }
            const size_t n_elms = ggml_nelements(cur);
            float * f32_data;
@@ -2544,7 +2539,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
                f32_data = (float *)conv_buf.data();
                break;
            default:
-                LOG_TEE("Please use an input file in f32 or f16\n");
+                LOG_ERR("Please use an input file in f32 or f16\n");
                gguf_free(ctx_out);
                return false;
            }
@@ -2571,7 +2566,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
            fout.put(0);
        }

-        LOG_TEE("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
+        LOG_INF("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
               orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
    }

@@ -2587,8 +2582,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
    gguf_free(ctx_out);

    {
-        LOG_TEE("%s: original  size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
-        LOG_TEE("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
+        LOG_INF("%s: original  size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
+        LOG_INF("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
    }

    return true;
--- a/examples/llava/convert_image_encoder_to_gguf.py
+++ b/examples/llava/convert_image_encoder_to_gguf.py
@@ -274,7 +274,7 @@ fout.add_bool("clip.use_gelu", use_gelu)


 if has_llava_projector:
-    model.vision_model.encoder.layers.pop(-1)  # pyright: ignore[reportAttributeAccessIssue]
+    model.vision_model.encoder.layers.pop(-1)
    projector = torch.load(args.llava_projector)
    for name, data in projector.items():
        name = get_tensor_name(name)
@@ -288,7 +288,7 @@ if has_llava_projector:

    print("Projector tensors added\n")

-state_dict = model.state_dict()  # pyright: ignore[reportAttributeAccessIssue]
+state_dict = model.state_dict()
 for name, data in state_dict.items():
    if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_llava_projector):
        # we don't need this
--- a/Show More
+++ b/Show More