llama : add llama_sampling API + move grammar in libllama

ggml-ci
readme : refactor API section + remove old hot topics
2026-02-26 14:23:22 +02:00 · 2024-09-03 10:31:54 +03:00 · 2024-09-03 10:00:36 +03:00 · 2024-09-02 22:08:38 +02:00 · 2024-09-02 13:53:23 -04:00 · 2024-09-02 18:11:13 +02:00
175 changed files with 18554 additions and 12407 deletions
--- a/.devops/full-cuda.Dockerfile
+++ b/.devops/full-cuda.Dockerfile
@@ -1,18 +1,16 @@
 ARG UBUNTU_VERSION=22.04
-
 # This needs to generally match the container host's environment.
-ARG CUDA_VERSION=11.7.1
-
+ARG CUDA_VERSION=12.6.0
 # Target the CUDA build image
 ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

 FROM ${BASE_CUDA_DEV_CONTAINER} AS build

-# Unless otherwise specified, we make a fat build.
-ARG CUDA_DOCKER_ARCH=all
+# CUDA architecture to build for (defaults to all supported archs)
+ARG CUDA_DOCKER_ARCH=default

 RUN apt-get update && \
-    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
+    apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1

 COPY requirements.txt   requirements.txt
 COPY requirements       requirements
@@ -24,13 +22,12 @@ WORKDIR /app

 COPY . .

-# Set nvcc architecture
-ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable CUDA
-ENV GGML_CUDA=1
-# Enable cURL
-ENV LLAMA_CURL=1
-
-RUN make -j$(nproc)
+# Use the default CUDA archs if not specified
+RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
+        export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
+    fi && \
+    cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release -j$(nproc) && \
+    cp build/bin/* .

 ENTRYPOINT ["/app/.devops/tools.sh"]
--- a/.devops/llama-cli-cann.Dockerfile
+++ b/.devops/llama-cli-cann.Dockerfile
@@ -0,0 +1,44 @@
+ARG ASCEND_VERSION=8.0.rc2.alpha003-910b-openeuler22.03-py3.8
+
+FROM cosdt/cann:$ASCEND_VERSION AS build
+
+WORKDIR /app
+
+COPY . .
+
+RUN yum install -y gcc g++ cmake make
+ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
+ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
+ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
+ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH}
+ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH}
+ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME}
+ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
+ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
+ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME}
+
+# Find libascend_hal.so via the stub directory, because the driver hasn't been mounted.
+ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
+
+RUN echo "Building with static libs" && \
+    source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
+    cmake -B build -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF  && \
+    cmake --build build --config Release --target llama-cli
+
+# TODO: use image with NNRT
+FROM cosdt/cann:$ASCEND_VERSION AS runtime
+COPY --from=build /app/build/bin/llama-cli /llama-cli
+
+ENV LC_ALL=C.utf8
+
+ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
+ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
+ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
+ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH}
+ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH}
+ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME}
+ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
+ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
+ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME}
+
+ENTRYPOINT ["/llama-cli" ]
--- a/.devops/llama-cli-cuda.Dockerfile
+++ b/.devops/llama-cli-cuda.Dockerfile
@@ -1,6 +1,6 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
-ARG CUDA_VERSION=11.7.1
+ARG CUDA_VERSION=12.6.0
 # Target the CUDA build image
 ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
 # Target the CUDA runtime image
@@ -8,28 +8,30 @@ ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_V

 FROM ${BASE_CUDA_DEV_CONTAINER} AS build

-# Unless otherwise specified, we make a fat build.
-ARG CUDA_DOCKER_ARCH=all
+# CUDA architecture to build for (defaults to all supported archs)
+ARG CUDA_DOCKER_ARCH=default

 RUN apt-get update && \
-    apt-get install -y build-essential git
+    apt-get install -y build-essential git cmake

 WORKDIR /app

 COPY . .

-# Set nvcc architecture
-ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable CUDA
-ENV GGML_CUDA=1
-
-RUN make -j$(nproc) llama-cli
+# Use the default CUDA archs if not specified
+RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
+        export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
+    fi && \
+    cmake -B build -DGGML_CUDA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release --target llama-cli -j$(nproc)

 FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime

 RUN apt-get update && \
    apt-get install -y libgomp1

-COPY --from=build /app/llama-cli /llama-cli
+COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
+COPY --from=build /app/build/src/libllama.so /libllama.so
+COPY --from=build /app/build/bin/llama-cli /llama-cli

 ENTRYPOINT [ "/llama-cli" ]
--- a/.devops/llama-server-cuda.Dockerfile
+++ b/.devops/llama-server-cuda.Dockerfile
@@ -1,6 +1,6 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
-ARG CUDA_VERSION=11.7.1
+ARG CUDA_VERSION=12.6.0
 # Target the CUDA build image
 ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
 # Target the CUDA runtime image
@@ -8,31 +8,34 @@ ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_V

 FROM ${BASE_CUDA_DEV_CONTAINER} AS build

-# Unless otherwise specified, we make a fat build.
-ARG CUDA_DOCKER_ARCH=all
+# CUDA architecture to build for (defaults to all supported archs)
+ARG CUDA_DOCKER_ARCH=default

 RUN apt-get update && \
-    apt-get install -y build-essential git libcurl4-openssl-dev
+    apt-get install -y build-essential git cmake libcurl4-openssl-dev

 WORKDIR /app

 COPY . .

-# Set nvcc architecture
-ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable CUDA
-ENV GGML_CUDA=1
-# Enable cURL
-ENV LLAMA_CURL=1
-
-RUN make -j$(nproc) llama-server
+# Use the default CUDA archs if not specified
+RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
+        export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
+    fi && \
+    cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release --target llama-server -j$(nproc)

 FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime

 RUN apt-get update && \
    apt-get install -y libcurl4-openssl-dev libgomp1 curl

-COPY --from=build /app/llama-server /llama-server
+COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
+COPY --from=build /app/build/src/libllama.so /libllama.so
+COPY --from=build /app/build/bin/llama-server /llama-server
+
+# Must be set to 0.0.0.0 so it can listen to requests from host machine
+ENV LLAMA_ARG_HOST=0.0.0.0

 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

--- a/.devops/llama-server-intel.Dockerfile
+++ b/.devops/llama-server-intel.Dockerfile
@@ -26,6 +26,8 @@ RUN apt-get update && \
 COPY --from=build /app/build/bin/llama-server /llama-server

 ENV LC_ALL=C.utf8
+# Must be set to 0.0.0.0 so it can listen to requests from host machine
+ENV LLAMA_ARG_HOST=0.0.0.0

 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

--- a/.devops/llama-server-rocm.Dockerfile
+++ b/.devops/llama-server-rocm.Dockerfile
@@ -39,6 +39,8 @@ ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
 ENV GGML_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++
+# Must be set to 0.0.0.0 so it can listen to requests from host machine
+ENV LLAMA_ARG_HOST=0.0.0.0

 # Enable cURL
 ENV LLAMA_CURL=1
--- a/.devops/llama-server-vulkan.Dockerfile
+++ b/.devops/llama-server-vulkan.Dockerfile
@@ -23,6 +23,8 @@ RUN cp /app/build/bin/llama-server /llama-server && \
    rm -rf /app

 ENV LC_ALL=C.utf8
+# Must be set to 0.0.0.0 so it can listen to requests from host machine
+ENV LLAMA_ARG_HOST=0.0.0.0

 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

--- a/.devops/llama-server.Dockerfile
+++ b/.devops/llama-server.Dockerfile
@@ -21,6 +21,8 @@ RUN apt-get update && \
 COPY --from=build /app/llama-server /llama-server

 ENV LC_ALL=C.utf8
+# Must be set to 0.0.0.0 so it can listen to requests from host machine
+ENV LLAMA_ARG_HOST=0.0.0.0

 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

--- a/.devops/nix/devshells.nix
+++ b/.devops/nix/devshells.nix
@@ -1,13 +1,52 @@
+{ inputs, ... }:
+
 {
  perSystem =
-    { config, lib, ... }:
+    {
+      config,
+      lib,
+      system,
+      ...
+    }:
    {
      devShells =
-        lib.concatMapAttrs
-          (name: package: {
-            ${name} = package.passthru.shell;
-            ${name + "-extra"} = package.passthru.shell-extra;
-          })
-          config.packages;
+        let
+          pkgs = import inputs.nixpkgs { inherit system; };
+          stdenv = pkgs.stdenv;
+          scripts = config.packages.python-scripts;
+        in
+        lib.pipe (config.packages) [
+          (lib.concatMapAttrs (
+            name: package: {
+              ${name} = pkgs.mkShell {
+                name = "${name}";
+                inputsFrom = [ package ];
+                shellHook = ''
+                  echo "Entering ${name} devShell"
+                '';
+              };
+              "${name}-extra" =
+                if (name == "python-scripts") then
+                  null
+                else
+                  pkgs.mkShell {
+                    name = "${name}-extra";
+                    inputsFrom = [
+                      package
+                      scripts
+                    ];
+                    # Extra packages that *may* be used by some scripts
+                    packages = [
+                        pkgs.python3Packages.tiktoken
+                    ];
+                    shellHook = ''
+                      echo "Entering ${name}-extra devShell"
+                      addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib stdenv.cc.cc}/lib"
+                    '';
+                  };
+            }
+          ))
+          (lib.filterAttrs (name: value: value != null))
+        ];
    };
 }
--- a/.devops/nix/nixpkgs-instances.nix
+++ b/.devops/nix/nixpkgs-instances.nix
@@ -26,16 +26,14 @@
          config.cudaSupport = true;
          config.allowUnfreePredicate =
            p:
-            builtins.all
-              (
-                license:
-                license.free
-                || builtins.elem license.shortName [
-                  "CUDA EULA"
-                  "cuDNN EULA"
-                ]
-              )
-              (p.meta.licenses or [ p.meta.license ]);
+            builtins.all (
+              license:
+              license.free
+              || builtins.elem license.shortName [
+                "CUDA EULA"
+                "cuDNN EULA"
+              ]
+            ) (p.meta.licenses or [ p.meta.license ]);
        };
        # Ensure dependencies use ROCm consistently
        pkgsRocm = import inputs.nixpkgs {
--- a/.devops/nix/package-gguf-py.nix
+++ b/.devops/nix/package-gguf-py.nix
@@ -0,0 +1,36 @@
+{
+  lib,
+  llamaVersion,
+  numpy,
+  tqdm,
+  sentencepiece,
+  pyyaml,
+  poetry-core,
+  buildPythonPackage,
+  pytestCheckHook,
+}:
+
+buildPythonPackage {
+  pname = "gguf";
+  version = llamaVersion;
+  pyproject = true;
+  nativeBuildInputs = [ poetry-core ];
+  propagatedBuildInputs = [
+    numpy
+    tqdm
+    sentencepiece
+    pyyaml
+  ];
+  src = lib.cleanSource ../../gguf-py;
+  pythonImportsCheck = [
+    "numpy"
+    "gguf"
+  ];
+  nativeCheckInputs = [ pytestCheckHook ];
+  doCheck = true;
+  meta = with lib; {
+    description = "Python package for writing binary files in the GGUF format";
+    license = licenses.mit;
+    maintainers = [ maintainers.ditsuke ];
+  };
+}
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@@ -3,31 +3,33 @@
  glibc,
  config,
  stdenv,
-  mkShell,
  runCommand,
  cmake,
  ninja,
  pkg-config,
  git,
-  python3,
  mpi,
  blas,
  cudaPackages,
+  autoAddDriverRunpath,
  darwin,
  rocmPackages,
  vulkan-headers,
  vulkan-loader,
  curl,
  shaderc,
-  useBlas ? builtins.all (x: !x) [
-    useCuda
-    useMetalKit
-    useRocm
-    useVulkan
-  ] && blas.meta.available,
+  useBlas ?
+    builtins.all (x: !x) [
+      useCuda
+      useMetalKit
+      useRocm
+      useVulkan
+    ]
+    && blas.meta.available,
  useCuda ? config.cudaSupport,
  useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin,
-  useMpi ? false, # Increases the runtime closure size by ~700M
+  # Increases the runtime closure size by ~700M
+  useMpi ? false,
  useRocm ? config.rocmSupport,
  enableCurl ? true,
  useVulkan ? false,
@@ -37,8 +39,8 @@
  # otherwise we get libstdc++ errors downstream.
  effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv,
  enableStatic ? effectiveStdenv.hostPlatform.isStatic,
-  precompileMetalShaders ? false
-}@inputs:
+  precompileMetalShaders ? false,
+}:

 let
  inherit (lib)
@@ -46,7 +48,6 @@ let
    cmakeFeature
    optionals
    strings
-    versionOlder
    ;

  stdenv = throw "Use effectiveStdenv instead";
@@ -62,54 +63,11 @@ let
  pnameSuffix =
    strings.optionalString (suffices != [ ])
      "-${strings.concatMapStringsSep "-" strings.toLower suffices}";
-  descriptionSuffix =
-    strings.optionalString (suffices != [ ])
-      ", accelerated with ${strings.concatStringsSep ", " suffices}";
+  descriptionSuffix = strings.optionalString (
+    suffices != [ ]
+  ) ", accelerated with ${strings.concatStringsSep ", " suffices}";

-  executableSuffix = effectiveStdenv.hostPlatform.extensions.executable;
-
-  # TODO: package the Python in this repository in a Nix-like way.
-  # It'd be nice to migrate to buildPythonPackage, as well as ensure this repo
-  # is PEP 517-compatible, and ensure the correct .dist-info is generated.
-  # https://peps.python.org/pep-0517/
-  #
-  # TODO: Package up each Python script or service appropriately, by making
-  # them into "entrypoints"
-  llama-python = python3.withPackages (
-    ps: [
-      ps.numpy
-      ps.sentencepiece
-    ]
-  );
-
-  # TODO(Green-Sky): find a better way to opt-into the heavy ml python runtime
-  llama-python-extra = python3.withPackages (
-    ps: [
-      ps.numpy
-      ps.sentencepiece
-      ps.tiktoken
-      ps.torchWithoutCuda
-      ps.transformers
-
-      # server bench
-      ps.matplotlib
-
-      # server tests
-      ps.openai
-      ps.behave
-      ps.prometheus-client
-
-      # for examples/pydantic-models-to-grammar-examples.py
-      ps.docstring-parser
-      ps.pydantic
-
-      # for scripts/compare-llama-bench.py
-      ps.gitpython
-      ps.tabulate
-    ]
-  );
-
-  xcrunHost = runCommand "xcrunHost" {} ''
+  xcrunHost = runCommand "xcrunHost" { } ''
    mkdir -p $out/bin
    ln -s /usr/bin/xcrun $out/bin
  '';
@@ -144,181 +102,145 @@ let
  ];
 in

-effectiveStdenv.mkDerivation (
-  finalAttrs: {
-    pname = "llama-cpp${pnameSuffix}";
-    version = llamaVersion;
+effectiveStdenv.mkDerivation (finalAttrs: {
+  pname = "llama-cpp${pnameSuffix}";
+  version = llamaVersion;

-    # Note: none of the files discarded here are visible in the sandbox or
-    # affect the output hash. This also means they can be modified without
-    # triggering a rebuild.
-    src = lib.cleanSourceWith {
-      filter =
-        name: type:
-        let
-          noneOf = builtins.all (x: !x);
-          baseName = baseNameOf name;
-        in
-        noneOf [
-          (lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths
-          (lib.hasSuffix ".md" name) # Ignore *.md changes whe computing outPaths
-          (lib.hasPrefix "." baseName) # Skip hidden files and directories
-          (baseName == "flake.lock")
-        ];
-      src = lib.cleanSource ../../.;
-    };
-
-    postPatch = ''
-      substituteInPlace ./ggml/src/ggml-metal.m \
-        --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
-      substituteInPlace ./ggml/src/ggml-metal.m \
-        --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
-    '';
-
-    # With PR#6015 https://github.com/ggerganov/llama.cpp/pull/6015,
-    # `default.metallib` may be compiled with Metal compiler from XCode
-    # and we need to escape sandbox on MacOS to access Metal compiler.
-    # `xcrun` is used find the path of the Metal compiler, which is varible
-    # and not on $PATH
-    # see https://github.com/ggerganov/llama.cpp/pull/6118 for discussion
-    __noChroot = effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders;
-
-    nativeBuildInputs =
-      [
-        cmake
-        ninja
-        pkg-config
-        git
-      ]
-      ++ optionals useCuda [
-        cudaPackages.cuda_nvcc
-
-        # TODO: Replace with autoAddDriverRunpath
-        # once https://github.com/NixOS/nixpkgs/pull/275241 has been merged
-        cudaPackages.autoAddOpenGLRunpathHook
-      ]
-      ++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [
-        glibc.static
-      ] ++ optionals (effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders) [
-        xcrunHost
+  # Note: none of the files discarded here are visible in the sandbox or
+  # affect the output hash. This also means they can be modified without
+  # triggering a rebuild.
+  src = lib.cleanSourceWith {
+    filter =
+      name: type:
+      let
+        noneOf = builtins.all (x: !x);
+        baseName = baseNameOf name;
+      in
+      noneOf [
+        (lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths
+        (lib.hasSuffix ".md" name) # Ignore *.md changes when computing outPaths
+        (lib.hasPrefix "." baseName) # Skip hidden files and directories
+        (baseName == "flake.lock")
      ];
+    src = lib.cleanSource ../../.;
+  };

-    buildInputs =
-      optionals effectiveStdenv.isDarwin darwinBuildInputs
-      ++ optionals useCuda cudaBuildInputs
-      ++ optionals useMpi [ mpi ]
-      ++ optionals useRocm rocmBuildInputs
-      ++ optionals useBlas [ blas ]
-      ++ optionals useVulkan vulkanBuildInputs
-      ++ optionals enableCurl [ curl ];
+  postPatch = ''
+    substituteInPlace ./ggml/src/ggml-metal.m \
+      --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
+    substituteInPlace ./ggml/src/ggml-metal.m \
+      --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
+  '';

-    cmakeFlags =
-      [
-        (cmakeBool "LLAMA_BUILD_SERVER" true)
-        (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
-        (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
-        (cmakeBool "LLAMA_CURL" enableCurl)
-        (cmakeBool "GGML_NATIVE" false)
-        (cmakeBool "GGML_BLAS" useBlas)
-        (cmakeBool "GGML_CUDA" useCuda)
-        (cmakeBool "GGML_HIPBLAS" useRocm)
-        (cmakeBool "GGML_METAL" useMetalKit)
-        (cmakeBool "GGML_VULKAN" useVulkan)
-        (cmakeBool "GGML_STATIC" enableStatic)
-      ]
-      ++ optionals useCuda [
-        (
-          with cudaPackages.flags;
-          cmakeFeature "CMAKE_CUDA_ARCHITECTURES" (
-            builtins.concatStringsSep ";" (map dropDot cudaCapabilities)
-          )
+  # With PR#6015 https://github.com/ggerganov/llama.cpp/pull/6015,
+  # `default.metallib` may be compiled with Metal compiler from XCode
+  # and we need to escape sandbox on MacOS to access Metal compiler.
+  # `xcrun` is used to find the path of the Metal compiler, which is variable
+  # and not on $PATH
+  # see https://github.com/ggerganov/llama.cpp/pull/6118 for discussion
+  __noChroot = effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders;
+
+  nativeBuildInputs =
+    [
+      cmake
+      ninja
+      pkg-config
+      git
+    ]
+    ++ optionals useCuda [
+      cudaPackages.cuda_nvcc
+
+      autoAddDriverRunpath
+    ]
+    ++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [ glibc.static ]
+    ++ optionals (effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders) [ xcrunHost ];
+
+  buildInputs =
+    optionals effectiveStdenv.isDarwin darwinBuildInputs
+    ++ optionals useCuda cudaBuildInputs
+    ++ optionals useMpi [ mpi ]
+    ++ optionals useRocm rocmBuildInputs
+    ++ optionals useBlas [ blas ]
+    ++ optionals useVulkan vulkanBuildInputs
+    ++ optionals enableCurl [ curl ];
+
+  cmakeFlags =
+    [
+      (cmakeBool "LLAMA_BUILD_SERVER" true)
+      (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
+      (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
+      (cmakeBool "LLAMA_CURL" enableCurl)
+      (cmakeBool "GGML_NATIVE" false)
+      (cmakeBool "GGML_BLAS" useBlas)
+      (cmakeBool "GGML_CUDA" useCuda)
+      (cmakeBool "GGML_HIPBLAS" useRocm)
+      (cmakeBool "GGML_METAL" useMetalKit)
+      (cmakeBool "GGML_VULKAN" useVulkan)
+      (cmakeBool "GGML_STATIC" enableStatic)
+    ]
+    ++ optionals useCuda [
+      (
+        with cudaPackages.flags;
+        cmakeFeature "CMAKE_CUDA_ARCHITECTURES" (
+          builtins.concatStringsSep ";" (map dropDot cudaCapabilities)
        )
-      ]
-      ++ optionals useRocm [
-        (cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
-        (cmakeFeature "CMAKE_HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets))
-      ]
-      ++ optionals useMetalKit [
-        (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
-        (cmakeBool "GGML_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
-      ];
+      )
+    ]
+    ++ optionals useRocm [
+      (cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
+      (cmakeFeature "CMAKE_HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets))
+    ]
+    ++ optionals useMetalKit [
+      (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
+      (cmakeBool "GGML_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
+    ];

-    # Environment variables needed for ROCm
-    env = optionals useRocm {
-      ROCM_PATH = "${rocmPackages.clr}";
-      HIP_DEVICE_LIB_PATH = "${rocmPackages.rocm-device-libs}/amdgcn/bitcode";
-    };
+  # Environment variables needed for ROCm; optionalAttrs gives {} (not the list [] that optionals gives) when ROCm is off
+  env = lib.optionalAttrs useRocm {
+    ROCM_PATH = "${rocmPackages.clr}";
+    HIP_DEVICE_LIB_PATH = "${rocmPackages.rocm-device-libs}/amdgcn/bitcode";
+  };

-    # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
-    # if they haven't been added yet.
-    postInstall = ''
-      mkdir -p $out/include
-      cp $src/include/llama.h $out/include/
-    '';
+  # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
+  # if they haven't been added yet.
+  postInstall = ''
+    mkdir -p $out/include
+    cp $src/include/llama.h $out/include/
+  '';

-    # Define the shells here, but don't add in the inputsFrom to avoid recursion.
-    passthru = {
-      inherit
-        useBlas
-        useCuda
-        useMetalKit
-        useMpi
-        useRocm
-        useVulkan
-        ;
+  meta = {
+    # Configurations we don't want even the CI to evaluate. Results in the
+    # "unsupported platform" messages. This is mostly a no-op, because
+    # cudaPackages would've refused to evaluate anyway.
+    badPlatforms = optionals useCuda lib.platforms.darwin;

-      shell = mkShell {
-        name = "shell-${finalAttrs.finalPackage.name}";
-        description = "contains numpy and sentencepiece";
-        buildInputs = [ llama-python ];
-        inputsFrom = [ finalAttrs.finalPackage ];
-        shellHook = ''
-          addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib effectiveStdenv.cc.cc}/lib"
-        '';
-      };
+    # Configurations that are known to result in build failures. Can be
+    # overridden by importing Nixpkgs with `allowBroken = true`.
+    broken = (useMetalKit && !effectiveStdenv.isDarwin);

-      shell-extra = mkShell {
-        name = "shell-extra-${finalAttrs.finalPackage.name}";
-        description = "contains numpy, sentencepiece, torchWithoutCuda, and transformers";
-        buildInputs = [ llama-python-extra ];
-        inputsFrom = [ finalAttrs.finalPackage ];
-      };
-    };
+    description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
+    homepage = "https://github.com/ggerganov/llama.cpp/";
+    license = lib.licenses.mit;

-    meta = {
-      # Configurations we don't want even the CI to evaluate. Results in the
-      # "unsupported platform" messages. This is mostly a no-op, because
-      # cudaPackages would've refused to evaluate anyway.
-      badPlatforms = optionals useCuda lib.platforms.darwin;
+    # Accommodates `nix run` and `lib.getExe`
+    mainProgram = "llama-cli";

-      # Configurations that are known to result in build failures. Can be
-      # overridden by importing Nixpkgs with `allowBroken = true`.
-      broken = (useMetalKit && !effectiveStdenv.isDarwin);
+    # These people might respond, on the best effort basis, if you ping them
+    # in case of Nix-specific regressions or for reviewing Nix-specific PRs.
+    # Consider adding yourself to this list if you want to ensure this flake
+    # stays maintained and you're willing to invest your time. Do not add
+    # other people without their consent. Consider removing people after
+    # they've been unreachable for long periods of time.

-      description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
-      homepage = "https://github.com/ggerganov/llama.cpp/";
-      license = lib.licenses.mit;
+    # Note that lib.maintainers is defined in Nixpkgs, but you may just add
+    # an attrset following the same format as in
+    # https://github.com/NixOS/nixpkgs/blob/f36a80e54da29775c78d7eff0e628c2b4e34d1d7/maintainers/maintainer-list.nix
+    maintainers = with lib.maintainers; [
+      philiptaron
+      SomeoneSerge
+    ];

-      # Accommodates `nix run` and `lib.getExe`
-      mainProgram = "llama-cli";
-
-      # These people might respond, on the best effort basis, if you ping them
-      # in case of Nix-specific regressions or for reviewing Nix-specific PRs.
-      # Consider adding yourself to this list if you want to ensure this flake
-      # stays maintained and you're willing to invest your time. Do not add
-      # other people without their consent. Consider removing people after
-      # they've been unreachable for long periods of time.
-
-      # Note that lib.maintainers is defined in Nixpkgs, but you may just add
-      # an attrset following the same format as in
-      # https://github.com/NixOS/nixpkgs/blob/f36a80e54da29775c78d7eff0e628c2b4e34d1d7/maintainers/maintainer-list.nix
-      maintainers = with lib.maintainers; [
-        philiptaron
-        SomeoneSerge
-      ];
-
-      # Extend `badPlatforms` instead
-      platforms = lib.platforms.all;
-    };
-  }
-)
+    # Extend `badPlatforms` instead
+    platforms = lib.platforms.all;
+  };
+})
--- a/.devops/nix/python-scripts.nix
+++ b/.devops/nix/python-scripts.nix
@@ -0,0 +1,66 @@
+{
+  lib,
+  stdenv,
+  buildPythonPackage,
+  poetry-core,
+  mkShell,
+  python3Packages,
+  gguf-py,
+}@inputs:
+
+let
+  llama-python-deps = with python3Packages; [
+    numpy
+    sentencepiece
+    transformers
+    protobuf
+    torchWithoutCuda
+    gguf-py
+    tqdm
+
+    # for scripts/compare-llama-bench.py
+    gitpython
+    tabulate
+
+    # for examples/pydantic-models-to-grammar-examples.py
+    docstring-parser
+    pydantic
+
+  ];
+
+  llama-python-test-deps = with python3Packages; [
+    # Server bench
+    matplotlib
+
+    # server tests
+    openai
+    behave
+    prometheus-client
+  ];
+in
+
+buildPythonPackage ({
+  pname = "llama-scripts";
+  version = "0.0.0";
+  pyproject = true;
+
+  # NOTE: The files filtered out here are not visible in the build sandbox, neither
+  # do they affect the output hash. They can be modified without triggering a rebuild.
+  src = lib.cleanSourceWith {
+    filter =
+      name: type:
+      let
+        any = builtins.any (x: x);
+        baseName = builtins.baseNameOf name;
+      in
+      any [
+        (lib.hasSuffix ".py" name)
+        (baseName == "README.md")
+        (baseName == "pyproject.toml")
+      ];
+    src = lib.cleanSource ../../.;
+  };
+  nativeBuildInputs = [ poetry-core ];
+  nativeCheckInputs = llama-python-test-deps;
+  dependencies = llama-python-deps;
+})
--- a/.devops/nix/scope.nix
+++ b/.devops/nix/scope.nix
@@ -1,19 +1,41 @@
 {
  lib,
  newScope,
+  python3,
  llamaVersion ? "0.0.0",
 }:

+let
+  pythonPackages = python3.pkgs;
+  buildPythonPackage = pythonPackages.buildPythonPackage;
+  numpy = pythonPackages.numpy;
+  tqdm = pythonPackages.tqdm;
+  sentencepiece = pythonPackages.sentencepiece;
+  pyyaml = pythonPackages.pyyaml;
+  poetry-core = pythonPackages.poetry-core;
+  pytestCheckHook = pythonPackages.pytestCheckHook;
+in
+
 # We're using `makeScope` instead of just writing out an attrset
 # because it allows users to apply overlays later using `overrideScope'`.
 # Cf. https://noogle.dev/f/lib/makeScope

-lib.makeScope newScope (
-  self: {
-    inherit llamaVersion;
-    llama-cpp = self.callPackage ./package.nix { };
-    docker = self.callPackage ./docker.nix { };
-    docker-min = self.callPackage ./docker.nix { interactive = false; };
-    sif = self.callPackage ./sif.nix { };
-  }
-)
+lib.makeScope newScope (self: {
+  inherit llamaVersion;
+  gguf-py = self.callPackage ./package-gguf-py.nix {
+    inherit
+      buildPythonPackage
+      numpy
+      tqdm
+      sentencepiece
+      poetry-core
+      pyyaml
+      pytestCheckHook
+      ;
+  };
+  python-scripts = self.callPackage ./python-scripts.nix { inherit buildPythonPackage poetry-core; };
+  llama-cpp = self.callPackage ./package.nix { };
+  docker = self.callPackage ./docker.nix { };
+  docker-min = self.callPackage ./docker.nix { interactive = false; };
+  sif = self.callPackage ./sif.nix { };
+})
--- a/.ecrc
+++ b/.ecrc
@@ -1,5 +1,5 @@
 {
-  "Exclude": ["^\\.gitmodules$"],
+  "Exclude": ["^\\.gitmodules$", "stb_image\\.h"],
  "Disable": {
    "IndentSize": true
  }
--- a/.github/workflows/bench.yml.disabled
+++ b/.github/workflows/bench.yml.disabled
@@ -1,3 +1,6 @@
+# TODO: there have been some issues with the workflow, so disabling for now
+#       https://github.com/ggerganov/llama.cpp/issues/7893
+#
 # Benchmark
 name: Benchmark

@@ -129,6 +132,8 @@ jobs:

      - name: Server bench
        id: server_bench
+        env:
+            HEAD_REF: ${{ github.head_ref || github.ref_name }}
        run: |
          set -eux

@@ -137,7 +142,7 @@ jobs:
          python bench.py \
              --runner-label ${{ env.RUNNER_LABEL }} \
              --name ${{ github.job }} \
-              --branch ${{ github.head_ref || github.ref_name }} \
+              --branch $HEAD_REF \
              --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \
              --scenario script.js \
              --duration ${{ github.event.inputs.duration || env.DURATION }} \
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -47,7 +47,7 @@ jobs:
          sysctl -a
          mkdir build
          cd build
-          cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF ..
+          cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF ..
          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

      - name: Test
@@ -105,7 +105,7 @@ jobs:
          sysctl -a
          # Metal is disabled due to intermittent failures with Github runners not having a GPU:
          # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
-          cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF
+          cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF
          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)

      - name: Test
@@ -222,7 +222,7 @@ jobs:
        run: |
          mkdir build
          cd build
-          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF
+          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF
          cmake --build . --config Release -j $(nproc)

      - name: Test
@@ -696,22 +696,20 @@ jobs:
    strategy:
      matrix:
        include:
-          - build: 'rpc-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON'
          - build: 'noavx-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON'
          - build: 'avx2-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON'
          - build: 'avx-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
          - build: 'avx512-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON'
          - build: 'openblas-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
          - build: 'kompute-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
          - build: 'vulkan-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
          - build: 'llvm-arm64'
            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
          - build: 'msvc-arm64'
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -96,21 +96,12 @@ jobs:
        env:
          GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'

-      - name: Build and push Docker image (versioned)
+      - name: Build and push Docker image (tagged + versioned)
        if: github.event_name == 'push'
-        uses: docker/build-push-action@v4
+        uses: docker/build-push-action@v6
        with:
          context: .
          push: true
          platforms: ${{ matrix.config.platforms }}
-          tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
-          file: ${{ matrix.config.dockerfile }}
-
-      - name: Build and push Docker image (tagged)
-        uses: docker/build-push-action@v4
-        with:
-          context: .
-          push: ${{ github.event_name == 'push' }}
-          platforms: ${{ matrix.config.platforms }}
-          tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
+          tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
          file: ${{ matrix.config.dockerfile }}
--- a/.github/workflows/python-check-requirements.yml
+++ b/.github/workflows/python-check-requirements.yml
@@ -6,15 +6,13 @@ on:
      - '.github/workflows/python-check-requirements.yml'
      - 'scripts/check-requirements.sh'
      - 'convert*.py'
-      - 'requirements.txt'
-      - 'requirements/*.txt'
+      - '**/requirements*.txt'
  pull_request:
    paths:
      - '.github/workflows/python-check-requirements.yml'
      - 'scripts/check-requirements.sh'
      - 'convert*.py'
-      - 'requirements.txt'
-      - 'requirements/*.txt'
+      - '**/requirements*.txt'

 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
--- a/.gitignore
+++ b/.gitignore
@@ -129,3 +129,6 @@ poetry.toml

 # Scripts
 !/scripts/install-oneapi.bat
+
+# Test models for lora adapters
+/lora-tests
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -28,6 +28,7 @@
    { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
    { "name": "reldbg",  "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
    { "name": "static",  "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } },
+    { "name": "sycl_f16",  "hidden": true, "cacheVariables": { "GGML_SYCL_F16": "ON" } },

    {
        "name": "arm64-windows-msvc", "hidden": true,
@@ -60,6 +61,8 @@
    { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },

    { "name": "x64-windows-sycl-debug"  , "inherits": [ "sycl-base", "debug"   ] },
-    { "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] }
+    { "name": "x64-windows-sycl-debug-f16", "inherits": [ "sycl-base", "debug", "sycl_f16" ] },
+    { "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] },
+    { "name": "x64-windows-sycl-release-f16", "inherits": [ "sycl-base", "release", "sycl_f16" ] }
  ]
 }
--- a/10
+++ b/10
@@ -763,6 +763,10 @@ ifdef GGML_VULKAN_MEMORY_DEBUG
 	MK_CPPFLAGS  += -DGGML_VULKAN_MEMORY_DEBUG
 endif

+ifdef GGML_VULKAN_PERF
+	MK_CPPFLAGS  += -DGGML_VULKAN_PERF
+endif
+
 ifdef GGML_VULKAN_VALIDATE
 	MK_CPPFLAGS  += -DGGML_VULKAN_VALIDATE
 endif
@@ -923,7 +927,6 @@ OBJ_COMMON = \
 	common/ngram-cache.o \
 	common/sampling.o \
 	common/train.o \
-	common/grammar-parser.o \
 	common/build-info.o \
 	common/json-schema-to-grammar.o

@@ -1163,11 +1166,6 @@ common/console.o: \
 	common/console.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

-common/grammar-parser.o: \
-	common/grammar-parser.cpp \
-	common/grammar-parser.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
 common/json-schema-to-grammar.o: \
 	common/json-schema-to-grammar.cpp \
 	common/json-schema-to-grammar.h
--- a/README.md
+++ b/README.md
@@ -10,32 +10,14 @@

 Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++

-> [!IMPORTANT]
-[2024 Jun 12] Binaries have been renamed w/ a `llama-` prefix. `main` is now `llama-cli`, `server` is `llama-server`, etc (https://github.com/ggerganov/llama.cpp/pull/7809)
-
 ## Recent API changes

- [2024 Jun 26] The source code and CMake build scripts have been restructured https://github.com/ggerganov/llama.cpp/pull/8006
- [2024 Apr 21] `llama_token_to_piece` can now optionally render special tokens https://github.com/ggerganov/llama.cpp/pull/6807
- [2024 Apr 4] State and session file functions reorganized under `llama_state_*` https://github.com/ggerganov/llama.cpp/pull/6341
- [2024 Mar 26] Logits and embeddings API updated for compactness https://github.com/ggerganov/llama.cpp/pull/6122
- [2024 Mar 13] Add `llama_synchronize()` + `llama_context_params.n_ubatch` https://github.com/ggerganov/llama.cpp/pull/6017
- [2024 Mar 8] `llama_kv_cache_seq_rm()` returns a `bool` instead of `void`, and new `llama_n_seq_max()` returns the upper limit of acceptable `seq_id` in batches (relevant when dealing with multiple sequences) https://github.com/ggerganov/llama.cpp/pull/5328
- [2024 Mar 4] Embeddings API updated https://github.com/ggerganov/llama.cpp/pull/5796
- [2024 Mar 3] `struct llama_context_params` https://github.com/ggerganov/llama.cpp/pull/5849
+- [Changelog for `libllama` API](https://github.com/ggerganov/llama.cpp/issues/9289)
+- [Changelog for `llama-server` REST API](https://github.com/ggerganov/llama.cpp/issues/9291)

 ## Hot topics

- **`convert.py` has been deprecated and moved to `examples/convert_legacy_llama.py`, please use `convert_hf_to_gguf.py`** https://github.com/ggerganov/llama.cpp/pull/7430
- Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021
- BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920
- MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387
- Model sharding instructions using `gguf-split` https://github.com/ggerganov/llama.cpp/discussions/6404
- Fix major bug in Metal batched inference https://github.com/ggerganov/llama.cpp/pull/6225
- Multi-GPU pipeline parallelism support https://github.com/ggerganov/llama.cpp/pull/6017
- Looking for contributions to add Deepseek support: https://github.com/ggerganov/llama.cpp/issues/5981
- Quantization blind testing: https://github.com/ggerganov/llama.cpp/discussions/5962
- Initial Mamba support has been added: https://github.com/ggerganov/llama.cpp/pull/5328
+- *add hot topics here*

 ----

@@ -105,6 +87,8 @@ Typically finetunes of the base models below are supported as well.
 - [x] [Open Elm models](https://huggingface.co/collections/apple/openelm-instruct-models-6619ad295d7ae9f868b759ca)
 - [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b)
 - [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)
+- [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct)
+- [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a)

 (instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))

@@ -186,10 +170,12 @@ Unless otherwise noted these projects are open-source with permissive licensing:

 - [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML
 - [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
+- [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage

 **Infrastructure:**

 - [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
+- [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs

 **Games:**
 - [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you.
@@ -422,6 +408,7 @@ Please refer to [Build llama.cpp locally](./docs/build.md)
 | [CUDA](./docs/build.md#cuda) | Nvidia GPU |
 | [hipBLAS](./docs/build.md#hipblas) | AMD GPU |
 | [Vulkan](./docs/build.md#vulkan) | GPU |
+| [CANN](./docs/build.md#cann) | Ascend NPU |

 ## Tools

--- a/ci/run.sh
+++ b/ci/run.sh
@@ -13,6 +13,9 @@
 # # with SYCL support
 # GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 #
+# # with VULKAN support
+# GG_BUILD_VULKAN=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+#

 if [ -z "$2" ]; then
    echo "usage: $0 <output-dir> <mnt-dir>"
@@ -40,7 +43,7 @@ if [ ! -z ${GG_BUILD_METAL} ]; then
 fi

 if [ ! -z ${GG_BUILD_CUDA} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=1"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=native"
 fi

 if [ ! -z ${GG_BUILD_SYCL} ]; then
@@ -52,6 +55,10 @@ if [ ! -z ${GG_BUILD_SYCL} ]; then

    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
 fi
+
+if [ ! -z ${GG_BUILD_VULKAN} ]; then
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
+fi
 ## helpers

 # download a file if it does not exist or if it is outdated
@@ -107,7 +114,7 @@ function gg_run_ctest_debug {
    gg_check_build_requirements

    (time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j                                          ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time make -j$(nproc)                                  ) 2>&1 | tee -a $OUT/${ci}-make.log

    (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log

@@ -138,7 +145,7 @@ function gg_run_ctest_release {
    gg_check_build_requirements

    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j                                            ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log

    if [ -z ${GG_BUILD_LOW_PERF} ]; then
        (time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log
@@ -266,7 +273,6 @@ function gg_sum_ctest_with_model_release {
 }

 # open_llama_7b_v2
-# requires: GG_BUILD_CUDA

 function gg_run_open_llama_7b_v2 {
    cd ${SRC}
@@ -290,8 +296,8 @@ function gg_run_open_llama_7b_v2 {

    set -e

-    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j                                                           ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log

    python3 ../examples/convert_legacy_llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

@@ -425,7 +431,7 @@ function gg_run_pythia_1_4b {
    set -e

    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j                                            ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log

    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

@@ -535,7 +541,6 @@ function gg_sum_pythia_1_4b {
 }

 # pythia_2_8b
-# requires: GG_BUILD_CUDA

 function gg_run_pythia_2_8b {
    cd ${SRC}
@@ -556,8 +561,8 @@ function gg_run_pythia_2_8b {

    set -e

-    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j                                                           ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log

    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

@@ -692,7 +697,7 @@ function gg_run_embd_bge_small {
    set -e

    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j                                            ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log

    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

@@ -761,7 +766,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
    fi

    if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
-        if [ -z ${GG_BUILD_CUDA} ]; then
+        if [ -z ${GG_BUILD_CUDA} ] && [ -z ${GG_BUILD_VULKAN} ]; then
            test $ret -eq 0 && gg_run pythia_1_4b
        else
            test $ret -eq 0 && gg_run pythia_2_8b
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -58,8 +58,6 @@ add_library(${TARGET} STATIC
    sampling.cpp
    console.h
    console.cpp
-    grammar-parser.h
-    grammar-parser.cpp
    json.hpp
    json-schema-to-grammar.cpp
    train.h
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -77,6 +77,41 @@

 using json = nlohmann::ordered_json;

+//
+// Environment variable utils
+//
+
+template<typename T> // overload enabled only when T is std::string
+static typename std::enable_if<std::is_same<T, std::string>::value, void>::type
+get_env(std::string name, T & target) { // copies environment variable `name` into `target` when it is set
+    char * value = std::getenv(name.c_str());
+    target = value ? std::string(value) : target; // unset variable: target keeps its current value
+}
+
+template<typename T> // overload enabled for integral types other than bool
+static typename std::enable_if<!std::is_same<T, bool>::value && std::is_integral<T>::value, void>::type
+get_env(std::string name, T & target) { // parses environment variable `name` as an integer into `target` when it is set
+    char * value = std::getenv(name.c_str());
+    target = value ? std::stoi(value) : target; // parsed as int via std::stoi whatever T is; std::stoi throws on non-numeric text; unset keeps target
+}
+
+template<typename T> // overload enabled for floating-point types
+static typename std::enable_if<std::is_floating_point<T>::value, void>::type
+get_env(std::string name, T & target) { // parses environment variable `name` as a float into `target` when it is set
+    char * value = std::getenv(name.c_str());
+    target = value ? std::stof(value) : target; // parsed via std::stof (throws on non-numeric text); unset keeps target
+}
+
+template<typename T> // overload enabled only when T is bool
+static typename std::enable_if<std::is_same<T, bool>::value, void>::type
+get_env(std::string name, T & target) { // reads environment variable `name` as a boolean flag into `target` when it is set
+    char * value = std::getenv(name.c_str());
+    if (value) { // unset variable: target is left untouched
+        std::string val(value);
+        target = val == "1" || val == "true"; // exactly "1" or "true" yields true; any other text yields false
+    }
+}
+
 //
 // CPU utils
 //
@@ -110,8 +145,34 @@ int32_t cpu_get_num_physical_cores() {
    if (result == 0) {
        return num_physical_cores;
    }
-#elif defined(_WIN32)
-    //TODO: Implement
+#elif defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
+    // TODO: windows + arm64 + mingw64
+    unsigned int n_threads_win = std::thread::hardware_concurrency();
+    unsigned int default_threads = n_threads_win > 0 ? (n_threads_win <= 4 ? n_threads_win : n_threads_win / 2) : 4;
+
+    DWORD buffer_size = 0;
+    if (!GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &buffer_size)) {
+        if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
+            return default_threads;
+        }
+    }
+
+    std::vector<char> buffer(buffer_size);
+    if (!GetLogicalProcessorInformationEx(RelationProcessorCore, reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data()), &buffer_size)) {
+        return default_threads;
+    }
+
+    int32_t num_physical_cores = 0;
+    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data());
+    while (buffer_size > 0) {
+        if (info->Relationship == RelationProcessorCore) {
+            num_physical_cores += info->Processor.GroupCount;
+        }
+        buffer_size -= info->Size;
+        info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(reinterpret_cast<char*>(info) + info->Size);
+    }
+
+    return num_physical_cores > 0 ? num_physical_cores : default_threads;
 #endif
    unsigned int n_threads = std::thread::hardware_concurrency();
    return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
@@ -190,16 +251,61 @@ int32_t cpu_get_num_math() {
    return cpu_get_num_physical_cores();
 }

+// Helper for setting process priority
+
+#if defined(_WIN32)
+
+bool set_process_priority(enum ggml_sched_priority prio) { // Windows variant: applies `prio` to the current process; returns false if the OS call fails
+    if (prio == GGML_SCHED_PRIO_NORMAL) {
+        return true; // normal priority: no OS call is made
+    }
+
+    DWORD p = NORMAL_PRIORITY_CLASS;
+    switch (prio) { // translate the ggml priority into a Windows priority class
+        case GGML_SCHED_PRIO_NORMAL:   p = NORMAL_PRIORITY_CLASS;       break; // not reached: handled by the early return above
+        case GGML_SCHED_PRIO_MEDIUM:   p = ABOVE_NORMAL_PRIORITY_CLASS; break;
+        case GGML_SCHED_PRIO_HIGH:     p = HIGH_PRIORITY_CLASS;         break;
+        case GGML_SCHED_PRIO_REALTIME: p = REALTIME_PRIORITY_CLASS;     break;
+    }
+
+    if (!SetPriorityClass(GetCurrentProcess(), p)) { // a zero return is treated as failure
+        fprintf(stderr, "warn: failed to set process priority class %d : (%d)\n", prio, (int) GetLastError());
+        return false;
+    }
+
+    return true;
+}
+
+#else // MacOS and POSIX
+#include <sys/types.h>
+#include <sys/resource.h>
+
+bool set_process_priority(enum ggml_sched_priority prio) { // POSIX variant: applies `prio` to the current process as a nice value; returns false if setpriority() fails
+    if (prio == GGML_SCHED_PRIO_NORMAL) {
+        return true; // normal priority: no system call is made
+    }
+
+    int p = 0;
+    switch (prio) { // translate the ggml priority into a nice value (lower = higher priority)
+        case GGML_SCHED_PRIO_NORMAL:   p =  0;  break; // not reached: handled by the early return above
+        case GGML_SCHED_PRIO_MEDIUM:   p = -5;  break;
+        case GGML_SCHED_PRIO_HIGH:     p = -10; break;
+        case GGML_SCHED_PRIO_REALTIME: p = -20; break;
+    }
+
+    if (setpriority(PRIO_PROCESS, 0, p) != 0) { // setpriority() returns 0 on success and -1 (with errno set) on failure
+        fprintf(stderr, "warn: failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
+        return false;
+    }
+    return true;
+}
+
+#endif
+
 //
 // CLI argument parsing
 //

-void gpt_params_handle_hf_token(gpt_params & params) {
-    if (params.hf_token.empty() && std::getenv("HF_TOKEN")) {
-        params.hf_token = std::getenv("HF_TOKEN");
-    }
-}
-
 void gpt_params_handle_model_default(gpt_params & params) {
    if (!params.hf_repo.empty()) {
        // short-hand to avoid specifying --hf-file -> default it to --model
@@ -222,17 +328,40 @@ void gpt_params_handle_model_default(gpt_params & params) {
    }
 }

-bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
-    bool invalid_param = false;
-    std::string arg;
-    const std::string arg_prefix = "--";
-    llama_sampling_params & sparams = params.sparams;
+void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) { // fills in an unset `cpuparams` and warns when its CPU mask has fewer set entries than threads
+    int32_t n_set = 0; // number of true entries found in cpuparams.cpumask

+    if (cpuparams.n_threads < 0) {
+        // Assuming everything about cpuparams is invalid
+        if (role_model != nullptr) {
+            cpuparams = *role_model; // inherit every field from the reference configuration
+        } else {
+            cpuparams.n_threads = cpu_get_num_math(); // no reference given: only the thread count is filled in
+        }
+    }
+
+    for (int32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
+        if (cpuparams.cpumask[i]) {
+            n_set++;
+        }
+    }
+
+    if (n_set && n_set < cpuparams.n_threads) { // an all-false mask (n_set == 0) never triggers the warning
+        // Not enough set bits, may experience performance issues.
+        fprintf(stderr, "warn: Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads);
+    }
+}
+
+bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
    for (int i = 1; i < argc; i++) {
-        arg = argv[i];
+        const std::string arg_prefix = "--";
+
+        std::string arg = argv[i];
        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
            std::replace(arg.begin(), arg.end(), '_', '-');
        }
+
+        bool invalid_param = false;
        if (!gpt_params_find_arg(argc, argv, arg, params, i, invalid_param)) {
            throw std::invalid_argument("error: unknown argument: " + arg);
        }
@@ -241,19 +370,27 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
        }
    }

+    postprocess_cpu_params(params.cpuparams, nullptr);
+    postprocess_cpu_params(params.cpuparams_batch, &params.cpuparams);
+    postprocess_cpu_params(params.draft_cpuparams, &params.cpuparams);
+    postprocess_cpu_params(params.draft_cpuparams_batch, &params.cpuparams_batch);
+
    if (params.prompt_cache_all && (params.interactive || params.interactive_first)) {
        throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
    }

    gpt_params_handle_model_default(params);

-    gpt_params_handle_hf_token(params);
+    if (params.hf_token.empty()) {
+        get_env("HF_TOKEN", params.hf_token);
+    }
+
+    auto & sparams = params.sparams;

    if (params.escape) {
        string_process_escapes(params.prompt);
        string_process_escapes(params.input_prefix);
        string_process_escapes(params.input_suffix);
-        string_process_escapes(sparams.cfg_negative_prompt);
        for (auto & antiprompt : params.antiprompt) {
            string_process_escapes(antiprompt);
        }
@@ -264,9 +401,39 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
        params.kv_overrides.back().key[0] = 0;
    }

+    if (sparams.seed == LLAMA_DEFAULT_SEED) {
+        sparams.seed = time(NULL);
+    }
+
    return true;
 }

+void gpt_params_parse_from_env(gpt_params & params) { // overrides fields of `params` from LLAMA_ARG_* environment variables; a field whose variable is unset is left unchanged by get_env
+    // we only care about server-related params for now
+    get_env("LLAMA_ARG_MODEL",            params.model);
+    get_env("LLAMA_ARG_MODEL_URL",        params.model_url);
+    get_env("LLAMA_ARG_MODEL_ALIAS",      params.model_alias);
+    get_env("LLAMA_ARG_HF_REPO",          params.hf_repo);
+    get_env("LLAMA_ARG_HF_FILE",          params.hf_file);
+    get_env("LLAMA_ARG_THREADS",          params.cpuparams.n_threads);
+    get_env("LLAMA_ARG_CTX_SIZE",         params.n_ctx);
+    get_env("LLAMA_ARG_N_PARALLEL",       params.n_parallel);
+    get_env("LLAMA_ARG_BATCH",            params.n_batch);
+    get_env("LLAMA_ARG_UBATCH",           params.n_ubatch);
+    get_env("LLAMA_ARG_N_GPU_LAYERS",     params.n_gpu_layers);
+    get_env("LLAMA_ARG_THREADS_HTTP",     params.n_threads_http);
+    get_env("LLAMA_ARG_CHAT_TEMPLATE",    params.chat_template);
+    get_env("LLAMA_ARG_N_PREDICT",        params.n_predict);
+    get_env("LLAMA_ARG_ENDPOINT_METRICS", params.endpoint_metrics);
+    get_env("LLAMA_ARG_ENDPOINT_SLOTS",   params.endpoint_slots);
+    get_env("LLAMA_ARG_EMBEDDINGS",       params.embedding);
+    get_env("LLAMA_ARG_FLASH_ATTN",       params.flash_attn);
+    get_env("LLAMA_ARG_DEFRAG_THOLD",     params.defrag_thold);
+    get_env("LLAMA_ARG_CONT_BATCHING",    params.cont_batching);
+    get_env("LLAMA_ARG_HOST",             params.hostname);
+    get_env("LLAMA_ARG_PORT",             params.port);
+}
+
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
    const auto params_org = params; // the example can modify the default params

@@ -285,52 +452,229 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
    return true;
 }

+bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]) { // sets boolmask[start..end] from "[<start>]-[<end>]"; returns false if the dash is missing or an index is out of bounds
+    size_t dash_loc = range.find('-');
+    if (dash_loc == std::string::npos) {
+        fprintf(stderr, "Format of CPU range is invalid! Expected [<start>]-[<end>].\n");
+        return false;
+    }
+
+    size_t start_i;
+    size_t end_i;
+
+    if (dash_loc == 0) {
+        start_i = 0; // start omitted: begin at index 0
+    } else {
+        start_i = std::stoull(range.substr(0, dash_loc)); // std::stoull throws if the text is not a number
+        if (start_i >= GGML_MAX_N_THREADS) {
+            fprintf(stderr, "Start index out of bounds!\n");
+            return false;
+        }
+    }
+
+    if (dash_loc == range.length() - 1) {
+        end_i = GGML_MAX_N_THREADS - 1; // end omitted: run to the last index
+    } else {
+        end_i = std::stoull(range.substr(dash_loc + 1)); // std::stoull throws if the text is not a number
+        if (end_i >= GGML_MAX_N_THREADS) {
+            fprintf(stderr, "End index out of bounds!\n");
+            return false;
+        }
+    }
+
+    for (size_t i = start_i; i <= end_i; i++) { // NOTE(review): when start > end nothing is set and the function still returns true
+        boolmask[i] = true; // entries are only ever set here, never cleared
+    }
+
+    return true;
+}
+
+bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREADS]) { // ORs a hexadecimal bitmask string into boolmask; returns false on a non-hex character
+    // Discard potential 0x prefix
+    size_t start_i = 0;
+    if (mask.length() >= 2 && mask.substr(0, 2) == "0x") { // only a lowercase "0x" prefix is recognised
+        start_i = 2;
+    }
+
+    size_t num_digits = mask.length() - start_i;
+    if (num_digits > 128) num_digits = 128; // 128 hex digits cover 512 entries; presumably equals GGML_MAX_N_THREADS - TODO confirm
+
+    size_t end_i = num_digits + start_i; // when clamped, only the first 128 digits after the prefix are read
+
+    for (size_t i = start_i, n = (num_digits*4 - 1); i < end_i; i++, n-=4) { // n = highest boolmask index covered by digit i; the first digit is the most significant
+        char c = mask.at(i);
+        int8_t id = c;
+
+        if ((c >= '0' && c <= '9')) {
+            id -= '0';
+        } else if (c >= 'a' && c <= 'f') {
+            id -= 'a' - 10;
+        } else if (c >= 'A' && c <= 'F') {
+            id -= 'A' - 10;
+        } else {
+            fprintf(stderr, "Invalid hex character '%c' at position %d\n", c, int32_t(i)); // position is counted from the start of `mask`, prefix included
+            return false; // entries already set by earlier digits stay set
+        }
+
+        boolmask[  n  ] = boolmask[  n  ] || ((id & 8) != 0); // logical OR: entries that were already true are preserved
+        boolmask[n - 1] = boolmask[n - 1] || ((id & 4) != 0);
+        boolmask[n - 2] = boolmask[n - 2] || ((id & 2) != 0);
+        boolmask[n - 3] = boolmask[n - 3] || ((id & 1) != 0); // lowest bit of the last digit lands in boolmask[0]
+    }
+
+    return true;
+}
+
 #define CHECK_ARG if (++i >= argc) { invalid_param = true; return true; }

 bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
    const char split_delim = ',';

-    llama_sampling_params & sparams = params.sparams;
+    auto & sparams = params.sparams;

    if (arg == "-s" || arg == "--seed") {
        CHECK_ARG
-        // TODO: this is temporary, in the future the sampling state will be moved fully to llama_sampling_context.
-        params.seed = std::stoul(argv[i]);
        sparams.seed = std::stoul(argv[i]);
        return true;
    }
    if (arg == "-t" || arg == "--threads") {
        CHECK_ARG
-        params.n_threads = std::stoi(argv[i]);
-        if (params.n_threads <= 0) {
-            params.n_threads = std::thread::hardware_concurrency();
+        params.cpuparams.n_threads = std::stoi(argv[i]);
+        if (params.cpuparams.n_threads <= 0) {
+            params.cpuparams.n_threads = std::thread::hardware_concurrency();
        }
        return true;
    }
+    if (arg == "-C" || arg == "--cpu-mask") {
+        CHECK_ARG
+        std::string mask = argv[i];
+        params.cpuparams.mask_valid = true;
+        invalid_param = !parse_cpu_mask(mask, params.cpuparams.cpumask);
+        return true;
+    }
+    if (arg == "-Cr" || arg == "--cpu-range") {
+        CHECK_ARG
+        std::string range = argv[i];
+        params.cpuparams.mask_valid = true;
+        invalid_param = !parse_cpu_range(range, params.cpuparams.cpumask);
+        return true;
+    }
+    if (arg == "--prio") {
+        CHECK_ARG
+        params.cpuparams.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
+        return true;
+    }
+    if (arg == "--cpu-strict") {
+        CHECK_ARG
+        params.cpuparams.strict_cpu = std::stoul(argv[i]);
+        return true;
+    }
+    if (arg == "--poll") {
+        CHECK_ARG
+        params.cpuparams.poll = std::stoul(argv[i]);
+        return true;
+    }
    if (arg == "-tb" || arg == "--threads-batch") {
        CHECK_ARG
-        params.n_threads_batch = std::stoi(argv[i]);
-        if (params.n_threads_batch <= 0) {
-            params.n_threads_batch = std::thread::hardware_concurrency();
+        params.cpuparams_batch.n_threads = std::stoi(argv[i]);
+        if (params.cpuparams_batch.n_threads <= 0) {
+            params.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
        }
        return true;
    }
+    if (arg == "-Cb" || arg == "--cpu-mask-batch") {
+        CHECK_ARG
+        std::string mask = argv[i];
+        params.cpuparams_batch.mask_valid = true;
+        invalid_param = !parse_cpu_mask(mask, params.cpuparams_batch.cpumask);
+        return true;
+    }
+    if (arg == "-Crb" || arg == "--cpu-range-batch") { // long form uses dashes, matching the help text and the sibling options
+        CHECK_ARG // advance to the option value; flags invalid_param when it is missing
+        std::string range = argv[i];
+        params.cpuparams_batch.mask_valid = true;
+        invalid_param = !parse_cpu_range(range, params.cpuparams_batch.cpumask);
+        return true;
+    }
+    if (arg == "--prio-batch") {
+        CHECK_ARG
+        params.cpuparams_batch.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
+        return true;
+    }
+    if (arg == "--cpu-strict-batch") {
+        params.cpuparams_batch.strict_cpu = true;
+        return true;
+    }
+    if (arg == "--poll-batch") {
+        CHECK_ARG
+        params.cpuparams_batch.poll = std::stoul(argv[i]);
+        return true;
+    }
    if (arg == "-td" || arg == "--threads-draft") {
        CHECK_ARG
-        params.n_threads_draft = std::stoi(argv[i]);
-        if (params.n_threads_draft <= 0) {
-            params.n_threads_draft = std::thread::hardware_concurrency();
+        params.draft_cpuparams.n_threads = std::stoi(argv[i]);
+        if (params.draft_cpuparams.n_threads <= 0) {
+            params.draft_cpuparams.n_threads = std::thread::hardware_concurrency();
        }
        return true;
+    }
+    if (arg == "-Cd" || arg == "--cpu-mask-draft") { // top-level branch: same indentation as the other option checks
+        CHECK_ARG // advance to the option value; flags invalid_param when it is missing
+        std::string mask = argv[i];
+        params.draft_cpuparams.mask_valid = true;
+        invalid_param = !parse_cpu_mask(mask, params.draft_cpuparams.cpumask);
+        return true;
+    }
+    if (arg == "-Crd" || arg == "--cpu-range-draft") {
+        CHECK_ARG
+        std::string range = argv[i];
+        params.draft_cpuparams.mask_valid = true;
+        invalid_param = !parse_cpu_range(range, params.draft_cpuparams.cpumask);
+        return true;
+    }
+    if (arg == "--prio-draft") {
+        CHECK_ARG
+        params.draft_cpuparams.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
+        return true;
+    }
+    if (arg == "--cpu-strict-draft") {
+        params.draft_cpuparams.strict_cpu = true;
+        return true;
+    }
+    if (arg == "--poll-draft") {
+        CHECK_ARG
+        params.draft_cpuparams.poll = std::stoul(argv[i]);
+        return true;
    }
    if (arg == "-tbd" || arg == "--threads-batch-draft") {
        CHECK_ARG
-        params.n_threads_batch_draft = std::stoi(argv[i]);
-        if (params.n_threads_batch_draft <= 0) {
-            params.n_threads_batch_draft = std::thread::hardware_concurrency();
+        params.draft_cpuparams_batch.n_threads = std::stoi(argv[i]);
+        if (params.draft_cpuparams_batch.n_threads <= 0) {
+            params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency();
        }
        return true;
    }
+    if (arg == "-Crbd" || arg == "--cpu-range-batch-draft") {
+        CHECK_ARG
+        std::string range = argv[i];
+        params.draft_cpuparams_batch.mask_valid = true;
+        invalid_param = !parse_cpu_range(range, params.draft_cpuparams_batch.cpumask);
+        return true;
+    }
+    if (arg == "--prio-batch-draft") {
+        CHECK_ARG
+        params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
+        return true;
+    }
+    if (arg == "--cpu-strict-batch-draft") {
+        params.draft_cpuparams_batch.strict_cpu = true;
+        return true;
+    }
+    if (arg == "--poll-batch-draft") {
+        CHECK_ARG
+        params.draft_cpuparams_batch.poll = std::stoul(argv[i]);
+        return true;
+    }
    if (arg == "-p" || arg == "--prompt") {
        CHECK_ARG
        params.prompt = argv[i];
@@ -500,12 +844,12 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
    if (arg == "--samplers") {
        CHECK_ARG
        const auto sampler_names = string_split(argv[i], ';');
-        sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, true);
+        sparams.samplers = llama_sampling_types_from_names(sampler_names, true);
        return true;
    }
    if (arg == "--sampling-seq") {
        CHECK_ARG
-        sparams.samplers_sequence = llama_sampling_types_from_chars(argv[i]);
+        sparams.samplers = llama_sampling_types_from_chars(argv[i]);
        return true;
    }
    if (arg == "--top-p") {
@@ -531,7 +875,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
    }
    if (arg == "--typical") {
        CHECK_ARG
-        sparams.typical_p = std::stof(argv[i]);
+        sparams.typ_p = std::stof(argv[i]);
        return true;
    }
    if (arg == "--repeat-last-n") {
@@ -580,30 +924,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        sparams.mirostat_tau = std::stof(argv[i]);
        return true;
    }
-    if (arg == "--cfg-negative-prompt") {
-        CHECK_ARG
-        sparams.cfg_negative_prompt = argv[i];
-        return true;
-    }
-    if (arg == "--cfg-negative-prompt-file") {
-        CHECK_ARG
-        std::ifstream file(argv[i]);
-        if (!file) {
-            fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
-            invalid_param = true;
-            return true;
-        }
-        std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(sparams.cfg_negative_prompt));
-        if (!sparams.cfg_negative_prompt.empty() && sparams.cfg_negative_prompt.back() == '\n') {
-            sparams.cfg_negative_prompt.pop_back();
-        }
-        return true;
-    }
-    if (arg == "--cfg-scale") {
-        CHECK_ARG
-        sparams.cfg_scale = std::stof(argv[i]);
-        return true;
-    }
    if (arg == "-b" || arg == "--batch-size") {
        CHECK_ARG
        params.n_batch = std::stoi(argv[i]);
@@ -825,7 +1145,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        }
        return true;
    }
-    if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--gpu-layers-draft") {
+    if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--n-gpu-layers-draft") {
        CHECK_ARG
        params.n_gpu_layers_draft = std::stoi(argv[i]);
        if (!llama_supports_gpu_offload()) {
@@ -1011,7 +1331,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        return true;
    }
    if (arg == "--ignore-eos") {
-        params.ignore_eos = true;
+        sparams.ignore_eos = true;
        return true;
    }
    if (arg == "--penalize-nl") {
@@ -1026,7 +1346,8 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        std::string value_str;
        try {
            if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
-                sparams.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
+                const float bias = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
+                sparams.logit_bias.push_back({key, bias});
            }
            else {
                throw std::exception();
@@ -1373,13 +1694,13 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
 #endif

 void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
-    const llama_sampling_params & sparams = params.sparams;
+    const auto & sparams = params.sparams;

    std::string sampler_type_chars;
    std::string sampler_type_names;
-    for (const auto sampler_type : sparams.samplers_sequence) {
-        sampler_type_chars += static_cast<char>(sampler_type);
-        sampler_type_names += llama_sampling_type_to_str(sampler_type) + ";";
+    for (const auto & sampler : sparams.samplers) {
+        sampler_type_chars += llama_sampling_type_to_chr(sampler);
+        sampler_type_names += llama_sampling_type_to_str(sampler) + ";";
    }
    sampler_type_names.pop_back();

@@ -1414,12 +1735,40 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
    options.push_back({ "*",           "       --verbose-prompt",       "print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false" });
    options.push_back({ "*",           "       --no-display-prompt",    "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" });
    options.push_back({ "*",           "-co,   --color",                "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" });
-    options.push_back({ "*",           "-s,    --seed SEED",            "RNG seed (default: %d, use random seed for < 0)", params.seed });
-    options.push_back({ "*",           "-t,    --threads N",            "number of threads to use during generation (default: %d)", params.n_threads });
+    options.push_back({ "*",           "-t,    --threads N",            "number of threads to use during generation (default: %d)", params.cpuparams.n_threads });
    options.push_back({ "*",           "-tb,   --threads-batch N",      "number of threads to use during batch and prompt processing (default: same as --threads)" });
    options.push_back({ "speculative", "-td,   --threads-draft N",      "number of threads to use during generation (default: same as --threads)" });
-    options.push_back({ "speculative", "-tbd,  --threads-batch-draft N",
-                                                                        "number of threads to use during batch and prompt processing (default: same as --threads-draft)" });
+    options.push_back({ "speculative", "-tbd,  --threads-batch-draft N","number of threads to use during batch and prompt processing (default: same as --threads-draft)" });
+
+#ifndef GGML_USE_OPENMP
+    // these options are available only with the internal threadpool; the names shown here must match the ones accepted by gpt_params_find_arg
+    options.push_back({ "*",           "-C,    --cpu-mask M",            "CPU affinity mask: arbitrarily long hex. Complements --cpu-range (default: \"\")"});
+    options.push_back({ "*",           "-Cr,   --cpu-range lo-hi",       "range of CPUs for affinity. Complements --cpu-mask"});
+    options.push_back({ "*",           "       --cpu-strict <0|1>",      "use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu});
+    options.push_back({ "*",           "       --prio N",                "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority});
+    options.push_back({ "*",           "       --poll <0...100>",        "use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll});
+
+    options.push_back({ "*",           "-Cb,   --cpu-mask-batch M",      "CPU affinity mask: arbitrarily long hex. Complements --cpu-range-batch (default: same as --cpu-mask)"});
+    options.push_back({ "*",           "-Crb,  --cpu-range-batch lo-hi", "ranges of CPUs for affinity. Complements --cpu-mask-batch"});
+    options.push_back({ "*",           "       --cpu-strict-batch <0|1>","use strict CPU placement (default: same as --cpu-strict)"});
+    options.push_back({ "*",           "       --prio-batch N",          "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: same as --prio)"});
+    options.push_back({ "*",           "       --poll-batch <0...100>",  "use polling level to wait for work (default: same as --poll)"});
+
+    options.push_back({ "speculative", "-Cd,   --cpu-mask-draft M",      "Draft model CPU affinity mask. Complements --cpu-range-draft (default: same as --cpu-mask)"});
+    options.push_back({ "speculative", "-Crd,  --cpu-range-draft lo-hi", "Ranges of CPUs for affinity. Complements --cpu-mask-draft"});
+    options.push_back({ "speculative", "       --cpu-strict-draft <0|1>","Use strict CPU placement for draft model (default: same as --cpu-strict)"});
+    options.push_back({ "speculative", "       --prio-draft N",          "Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: same as --prio)"});
+    options.push_back({ "speculative", "       --poll-draft <0...100>",  "Use polling level to wait for draft model work (default: same as --poll)"});
+
+    options.push_back({ "speculative", "-Cbd,  --cpu-mask-batch-draft M","Draft model CPU affinity mask. Complements --cpu-range-batch-draft (default: same as --cpu-mask-draft)"});
+    options.push_back({ "speculative", "-Crbd, --cpu-range-batch-draft lo-hi",
+                                                                         "Ranges of CPUs for affinity. Complements --cpu-mask-batch-draft"});
+    options.push_back({ "speculative", "       --cpu-strict-batch-draft <0|1>",
+                                                                         "Use strict CPU placement for draft model (default: same as --cpu-strict-draft)"});
+    options.push_back({ "speculative", "       --prio-batch-draft N",    "Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: same as --prio-draft)"});
+    options.push_back({ "speculative", "       --poll-batch-draft <0...100>", "Use polling level to wait for draft model work (default: same as --poll-draft)"});
+#endif // GGML_USE_OPENMP
+
    options.push_back({ "speculative", "       --draft N",              "number of tokens to draft for speculative decoding (default: %d)", params.n_draft });
    options.push_back({ "speculative", "-ps,   --p-split N",            "speculative decoding split probability (default: %.1f)", (double)params.p_split });
    options.push_back({ "*",           "-lcs,  --lookup-cache-static FNAME",
@@ -1465,18 +1814,19 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
                                       "       --spm-infill",           "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" });

    options.push_back({ "sampling" });
+    options.push_back({ "*",           "-s,    --seed SEED",            "RNG seed (default: %d, use random seed for < 0)", sparams.seed });
    options.push_back({ "*",           "       --samplers SAMPLERS",    "samplers that will be used for generation in the order, separated by \';\'\n"
                                                                        "(default: %s)", sampler_type_names.c_str() });
    options.push_back({ "*",           "       --sampling-seq SEQUENCE",
                                                                        "simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str() });
    options.push_back({ "*",           "       --ignore-eos",           "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)" });
    options.push_back({ "*",           "       --penalize-nl",          "penalize newline tokens (default: %s)", sparams.penalize_nl ? "true" : "false" });
-    options.push_back({ "*",           "       --temp N",               "temperature (default: %.1f)", (double)sparams.temp });
+    options.push_back({ "*",           "       --temp T",               "temperature (default: %.1f)", (double)sparams.temp });
    options.push_back({ "*",           "       --top-k N",              "top-k sampling (default: %d, 0 = disabled)", sparams.top_k });
-    options.push_back({ "*",           "       --top-p N",              "top-p sampling (default: %.1f, 1.0 = disabled)", (double)sparams.top_p });
-    options.push_back({ "*",           "       --min-p N",              "min-p sampling (default: %.1f, 0.0 = disabled)", (double)sparams.min_p });
-    options.push_back({ "*",           "       --tfs N",                "tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)sparams.tfs_z });
-    options.push_back({ "*",           "       --typical N",            "locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)sparams.typical_p });
+    options.push_back({ "*",           "       --top-p P",              "top-p sampling (default: %.1f, 1.0 = disabled)", (double)sparams.top_p });
+    options.push_back({ "*",           "       --min-p P",              "min-p sampling (default: %.1f, 0.0 = disabled)", (double)sparams.min_p });
+    options.push_back({ "*",           "       --tfs P",                "tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)sparams.tfs_z });
+    options.push_back({ "*",           "       --typical P",            "locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)sparams.typ_p });
    options.push_back({ "*",           "       --repeat-last-n N",      "last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", sparams.penalty_last_n });
    options.push_back({ "*",           "       --repeat-penalty N",     "penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)sparams.penalty_repeat });
    options.push_back({ "*",           "       --presence-penalty N",   "repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_present });
@@ -1491,11 +1841,6 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
    options.push_back({ "*",           "       -l TOKEN_ID(+/-)BIAS",   "modifies the likelihood of token appearing in the completion,\n"
                                                                        "i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"
                                                                        "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'" });
-    options.push_back({ "main",        "       --cfg-negative-prompt PROMPT",
-                                                                        "negative prompt to use for guidance (default: '%s')", sparams.cfg_negative_prompt.c_str() });
-    options.push_back({ "main",        "       --cfg-negative-prompt-file FNAME",
-                                                                        "negative prompt file to use for guidance" });
-    options.push_back({ "main",        "       --cfg-scale N",          "strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale });
    options.push_back({ "main",        "       --chat-template JINJA_TEMPLATE",
                                                                        "set custom jinja chat template (default: template taken from model's metadata)\n"
                                                                        "if suffix/prefix are specified, template will be disabled\n"
@@ -1691,7 +2036,6 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
    options.push_back({ "export-lora", "-m,    --model",                "model path from which to load base model (default '%s')", params.model.c_str() });
    options.push_back({ "export-lora", "       --lora FNAME",           "path to LoRA adapter  (can be repeated to use multiple adapters)" });
    options.push_back({ "export-lora", "       --lora-scaled FNAME S",  "path to LoRA adapter with user defined scaling S  (can be repeated to use multiple adapters)" });
-    options.push_back({ "*",           "-t,    --threads N",            "number of threads to use during computation (default: %d)", params.n_threads });
    options.push_back({ "export-lora", "-o,    --output FNAME",         "output file (default: '%s')", params.lora_outfile.c_str() });

    printf("usage: %s [options]\n", argv[0]);
@@ -1723,11 +2067,17 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
 std::string gpt_params_get_system_info(const gpt_params & params) {
    std::ostringstream os;

-    os << "system_info: n_threads = " << params.n_threads;
-    if (params.n_threads_batch != -1) {
-        os << " (n_threads_batch = " << params.n_threads_batch << ")";
+    os << "system_info: n_threads = " << params.cpuparams.n_threads;
+    if (params.cpuparams_batch.n_threads != -1) {
+        os << " (n_threads_batch = " << params.cpuparams_batch.n_threads << ")";
    }
+#if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
+    // TODO: windows + arm64 + mingw64
+    DWORD logicalProcessorCount = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);
+    os << " / " << logicalProcessorCount << " | " << llama_print_system_info();
+#else
    os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info();
+#endif

    return os.str();
 }
@@ -1779,13 +2129,19 @@ std::string string_get_sortable_timestamp() {

 void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
    if (search.empty()) {
-        return; // Avoid infinite loop if 'search' is an empty string
+        return;
    }
+    std::string builder;
+    builder.reserve(s.length());
    size_t pos = 0;
-    while ((pos = s.find(search, pos)) != std::string::npos) {
-        s.replace(pos, search.length(), replace);
-        pos += replace.length();
+    size_t last_pos = 0;
+    while ((pos = s.find(search, last_pos)) != std::string::npos) {
+        builder.append(s, last_pos, pos - last_pos);
+        builder.append(replace);
+        last_pos = pos + search.length();
    }
+    builder.append(s, last_pos, std::string::npos);
+    s = std::move(builder);
 }

 void string_process_escapes(std::string & input) {
@@ -2131,8 +2487,9 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
        llama_lora_adapters_apply(lctx, iparams.lora_adapters);
    }

-    if (params.ignore_eos) {
-        params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
+    if (params.sparams.ignore_eos && llama_token_eos(model) == -1) {
+        fprintf(stderr, "%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
+        params.sparams.ignore_eos = false;
    }

    if (params.warmup) {
@@ -2161,7 +2518,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
        }
        llama_kv_cache_clear(lctx);
        llama_synchronize(lctx);
-        llama_reset_timings(lctx);
+        llama_reset_timings(lctx, nullptr);
    }

    iparams.model   = model;
@@ -2237,9 +2594,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
    cparams.n_seq_max         = params.n_parallel;
    cparams.n_batch           = params.n_batch;
    cparams.n_ubatch          = params.n_ubatch;
-    cparams.n_threads         = params.n_threads;
-    cparams.n_threads_batch   = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
-    cparams.seed              = params.seed;
+    cparams.n_threads         = params.cpuparams.n_threads;
+    cparams.n_threads_batch   = params.cpuparams_batch.n_threads == -1 ?
+                                    params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
    cparams.logits_all        = params.logits_all;
    cparams.embeddings        = params.embedding;
    cparams.rope_scaling_type = params.rope_scaling_type;
@@ -2264,6 +2621,22 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
    return cparams;
 }

+struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params) {
+    struct ggml_threadpool_params tpp; // translated copy of the user-facing cpu_params
+
+    ggml_threadpool_params_init(&tpp, params.n_threads); // setup the defaults
+
+    if (params.mask_valid) { // set when a --cpu-mask / --cpu-range style option was parsed; otherwise keep the default mask
+        std::memcpy(&tpp.cpumask, &params.cpumask, sizeof(params.cpumask)); // byte size of the whole bool array, no sizeof(bool) == 1 assumption
+    }
+
+    tpp.prio       = params.priority;
+    tpp.poll       = params.poll;
+    tpp.strict_cpu = params.strict_cpu;
+
+    return tpp;
+}
+
 #ifdef LLAMA_USE_CURL

 static bool starts_with(const std::string & str, const std::string & prefix) {
@@ -2702,12 +3075,6 @@ std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token>
    return text;
 }

-bool llama_should_add_bos_token(const llama_model * model) {
-    const int add_bos = llama_add_bos_token(model);
-
-    return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
-}
-
 //
 // Chat template utils
 //
@@ -3115,7 +3482,7 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha

 void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const llama_context * lctx,
                               const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
-    const llama_sampling_params & sparams = params.sparams;
+    const auto & sparams = params.sparams;

    fprintf(stream, "build_commit: %s\n",        LLAMA_COMMIT);
    fprintf(stream, "build_number: %d\n",        LLAMA_BUILD_NUMBER);
@@ -3166,8 +3533,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l

    fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
    fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
-    yaml_dump_string_multiline(stream, "cfg_negative_prompt", sparams.cfg_negative_prompt.c_str());
-    fprintf(stream, "cfg_scale: %f # default: 1.0\n", sparams.cfg_scale);
    fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
    fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
    fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
@@ -3178,10 +3543,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
    fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
    fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
    fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
-
-    const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(llama_get_model(lctx)));
-    const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY;
-    fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false");
+    fprintf(stream, "ignore_eos: %s # default: false\n", sparams.ignore_eos ? "true" : "false");

    yaml_dump_string_multiline(stream, "in_prefix", params.input_prefix.c_str());
    fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false");
@@ -3192,11 +3554,8 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
    fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());

    fprintf(stream, "logit_bias:\n");
-    for (std::pair<llama_token, float> lb : sparams.logit_bias) {
-        if (ignore_eos && lb.first == logit_bias_eos->first) {
-            continue;
-        }
-        fprintf(stream, "  %d: %f", lb.first, lb.second);
+    for (const auto & logit_bias : sparams.logit_bias) { // emitted as children of the "logit_bias:" key printed just above
+        fprintf(stream, "  %d: %f\n", logit_bias.token, logit_bias.bias); // one "token: bias" entry per line so the following keys stay on their own lines
    }

    fprintf(stream, "lora:\n");
@@ -3249,7 +3608,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l

    fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base);
    fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
-    fprintf(stream, "seed: %u # default: -1 (random seed)\n", params.seed);
    fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
    fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
    fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false");
@@ -3259,11 +3617,11 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
    yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);

    fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
-    fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());
+    fprintf(stream, "threads: %d # default: %u\n", params.cpuparams.n_threads, std::thread::hardware_concurrency());
    fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
    fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
    fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
-    fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
+    fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p);
    fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
    fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");
 }
--- a/common/common.h
+++ b/common/common.h
@@ -67,13 +67,16 @@ enum dimre_method {
    DIMRE_METHOD_MEAN,
 };

-struct gpt_params {
-    uint32_t seed                 = LLAMA_DEFAULT_SEED; // RNG seed
+struct cpu_params {
+    int      n_threads                   = -1;
+    bool     cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
+    bool     mask_valid                  = false;   // Default: any CPU
+    enum ggml_sched_priority  priority   = GGML_SCHED_PRIO_NORMAL;  // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
+    bool     strict_cpu                  = false;   // Use strict CPU placement
+    uint32_t poll                        = 50;      // Polling (busywait) level (0 - no polling, 100 - mostly polling)
+};

-    int32_t n_threads             = cpu_get_num_math();
-    int32_t n_threads_draft       =    -1;
-    int32_t n_threads_batch       =    -1; // number of threads to use for batch processing (-1 = use n_threads)
-    int32_t n_threads_batch_draft =    -1;
+struct gpt_params {
    int32_t n_predict             =    -1; // new tokens to predict
    int32_t n_ctx                 =     0; // context size
    int32_t n_batch               =  2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
@@ -100,6 +103,11 @@ struct gpt_params {
    int32_t yarn_orig_ctx         =     0; // YaRN original context length
    float   defrag_thold          = -1.0f; // KV cache defragmentation threshold

+    struct cpu_params cpuparams;
+    struct cpu_params cpuparams_batch;
+    struct cpu_params draft_cpuparams;
+    struct cpu_params draft_cpuparams_batch;
+
    ggml_backend_sched_eval_callback cb_eval = nullptr;
    void * cb_eval_user_data                 = nullptr;

@@ -110,8 +118,7 @@ struct gpt_params {
    enum llama_pooling_type      pooling_type      = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
    enum llama_attention_type    attention_type    = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings

-    // // sampling parameters
-    struct llama_sampling_params sparams;
+    struct gpt_sampling_params sparams;

    std::string model                = ""; // model path
    std::string model_draft          = ""; // draft model for speculative decoding
@@ -175,7 +182,6 @@ struct gpt_params {
    bool flash_attn        = false; // flash attention

    bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
-    bool ignore_eos        = false; // ignore generated EOS tokens
    bool logits_all        = false; // return logits for all tokens in the batch
    bool use_mmap          = true;  // use mmap for faster loads
    bool use_mlock         = false; // use mlock to keep model in memory
@@ -204,7 +210,7 @@ struct gpt_params {
    int32_t port           = 8080;         // server listens on this network port
    int32_t timeout_read   = 600;          // http read timeout in seconds
    int32_t timeout_write  = timeout_read; // http write timeout in seconds
-    int32_t n_threads_http = -1;           // number of threads to process HTTP requests
+    int     n_threads_http = -1;           // number of threads to process HTTP requests (TODO: support threadpool)

    std::string hostname      = "127.0.0.1";
    std::string public_path   = "";
@@ -267,7 +273,7 @@ struct gpt_params {
    std::string lora_outfile = "ggml-lora-merged-f16.gguf";
 };

-void gpt_params_handle_hf_token(gpt_params & params);
+void gpt_params_parse_from_env(gpt_params & params);
 void gpt_params_handle_model_default(gpt_params & params);

 bool gpt_params_parse_ex   (int argc, char ** argv, gpt_params & params);
@@ -277,6 +283,11 @@ void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);

 std::string gpt_params_get_system_info(const gpt_params & params);

+bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]); // boolmask is a non-const array reference sized like cpu_params::cpumask; NOTE(review): range syntax and meaning of the bool result are not visible here - confirm against the definition
+bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_MAX_N_THREADS]); // same boolmask shape as parse_cpu_range; NOTE(review): mask string format not visible here - confirm against the definition
+void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr); // role_model is optional (defaults to nullptr); NOTE(review): presumably a reference set of params to fall back on - confirm
+bool set_process_priority(enum ggml_sched_priority prio); // NOTE(review): presumably returns true on success - confirm
+
 //
 // String utils
 //
@@ -327,8 +338,9 @@ struct llama_init_result {

 struct llama_init_result    llama_init_from_gpt_params(gpt_params & params);

-struct llama_model_params   llama_model_params_from_gpt_params  (const gpt_params & params);
-struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
+struct llama_model_params     llama_model_params_from_gpt_params    (const gpt_params & params);
+struct llama_context_params   llama_context_params_from_gpt_params  (const gpt_params & params);
+struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);

 struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
 struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
@@ -380,10 +392,6 @@ std::string llama_detokenize(
        const std::vector<llama_token> & tokens,
                                  bool   special = true);

-// Uses the value from the model metadata if possible, otherwise
-// defaults to true when model type is SPM, otherwise false.
-bool llama_should_add_bos_token(const llama_model * model);
-
 //
 // Chat template utils
 //
--- a/common/grammar-parser.cpp
+++ b/common/grammar-parser.cpp
@@ -1,536 +0,0 @@
-#include "grammar-parser.h"
-#include <cstdint>
-#include <cwchar>
-#include <string>
-#include <utility>
-#include <stdexcept>
-#include <exception>
-
-namespace grammar_parser {
-    // NOTE: assumes valid utf8 (but checks for overrun)
-    // copied from llama.cpp
-    static std::pair<uint32_t, const char *> decode_utf8(const char * src) {
-        static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
-        uint8_t  first_byte = static_cast<uint8_t>(*src);
-        uint8_t  highbits   = first_byte >> 4;
-        int      len        = lookup[highbits];
-        uint8_t  mask       = (1 << (8 - len)) - 1;
-        uint32_t value      = first_byte & mask;
-        const char * end    = src + len; // may overrun!
-        const char * pos    = src + 1;
-        for ( ; pos < end && *pos; pos++) {
-            value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
-        }
-        return std::make_pair(value, pos);
-    }
-
-    static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
-        uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
-        auto result = state.symbol_ids.emplace(std::string(src, len), next_id);
-        return result.first->second;
-    }
-
-    static uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) {
-        uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
-        state.symbol_ids[base_name + '_' + std::to_string(next_id)] = next_id;
-        return next_id;
-    }
-
-    static void add_rule(
-            parse_state & state,
-            uint32_t      rule_id,
-            const std::vector<llama_grammar_element> & rule) {
-        if (state.rules.size() <= rule_id) {
-            state.rules.resize(rule_id + 1);
-        }
-        state.rules[rule_id] = rule;
-    }
-
-    static bool is_digit_char(char c) {
-        return '0' <= c && c <= '9';
-    }
-
-    static bool is_word_char(char c) {
-        return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || is_digit_char(c);
-    }
-
-    static std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
-        const char * pos   = src;
-        const char * end   = src + size;
-        uint32_t     value = 0;
-        for ( ; pos < end && *pos; pos++) {
-            value <<= 4;
-            char c = *pos;
-            if ('a' <= c && c <= 'f') {
-                value += c - 'a' + 10;
-            } else if ('A' <= c && c <= 'F') {
-                value += c - 'A' + 10;
-            } else if ('0' <= c && c <= '9') {
-                value += c - '0';
-            } else {
-                break;
-            }
-        }
-        if (pos != end) {
-            throw std::runtime_error("expecting " + std::to_string(size) + " hex chars at " + src);
-        }
-        return std::make_pair(value, pos);
-    }
-
-    static const char * parse_space(const char * src, bool newline_ok) {
-        const char * pos = src;
-        while (*pos == ' ' || *pos == '\t' || *pos == '#' ||
-                (newline_ok && (*pos == '\r' || *pos == '\n'))) {
-            if (*pos == '#') {
-                while (*pos && *pos != '\r' && *pos != '\n') {
-                    pos++;
-                }
-            } else {
-                pos++;
-            }
-        }
-        return pos;
-    }
-
-    static const char * parse_name(const char * src) {
-        const char * pos = src;
-        while (is_word_char(*pos)) {
-            pos++;
-        }
-        if (pos == src) {
-            throw std::runtime_error(std::string("expecting name at ") + src);
-        }
-        return pos;
-    }
-
-    static const char * parse_int(const char * src) {
-        const char * pos = src;
-        while (is_digit_char(*pos)) {
-            pos++;
-        }
-        if (pos == src) {
-            throw std::runtime_error(std::string("expecting integer at ") + src);
-        }
-        return pos;
-    }
-
-    static std::pair<uint32_t, const char *> parse_char(const char * src) {
-        if (*src == '\\') {
-            switch (src[1]) {
-                case 'x': return parse_hex(src + 2, 2);
-                case 'u': return parse_hex(src + 2, 4);
-                case 'U': return parse_hex(src + 2, 8);
-                case 't': return std::make_pair('\t', src + 2);
-                case 'r': return std::make_pair('\r', src + 2);
-                case 'n': return std::make_pair('\n', src + 2);
-                case '\\':
-                case '"':
-                case '[':
-                case ']':
-                    return std::make_pair(src[1], src + 2);
-                default:
-                    throw std::runtime_error(std::string("unknown escape at ") + src);
-            }
-        } else if (*src) {
-            return decode_utf8(src);
-        }
-        throw std::runtime_error("unexpected end of input");
-    }
-
-    const char * parse_alternates(
-            parse_state       & state,
-            const char        * src,
-            const std::string & rule_name,
-            uint32_t            rule_id,
-            bool                is_nested);
-
-    static const char * parse_sequence(
-            parse_state                        & state,
-            const char                         * src,
-            const std::string                  & rule_name,
-            std::vector<llama_grammar_element> & out_elements,
-            bool                                 is_nested) {
-        size_t last_sym_start = out_elements.size();
-        const char * pos = src;
-
-        auto handle_repetitions = [&](int min_times, int max_times) {
-
-            if (last_sym_start == out_elements.size()) {
-                throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos);
-            }
-
-            // apply transformation to previous symbol (last_sym_start to end) according to
-            // the following rewrite rules:
-            // S{m,n} --> S S S (m times) S'(n-m)
-            //            S'(x)   ::= S S'(x-1) |
-            //            (... n-m definitions of these S' rules ...)
-            //            S'(1)   ::= S |
-            // S{m,} -->  S S S (m times) S'
-            //            S'     ::= S S' |
-            // S*     --> S{0,}
-            //        --> S'     ::= S S' |
-            // S+     --> S{1,}
-            //        --> S S'
-            //            S'     ::= S S' |
-            // S?     --> S{0,1}
-            //        --> S'
-            //            S'     ::= S |
-
-            std::vector<llama_grammar_element> previous_elements(out_elements.begin() + last_sym_start, out_elements.end());
-            if (min_times == 0) {
-                out_elements.resize(last_sym_start);
-            } else {
-                // Repeat the previous elements (min_times - 1) times
-                for (int i = 1; i < min_times; i++) {
-                    out_elements.insert(out_elements.end(), previous_elements.begin(), previous_elements.end());
-                }
-            }
-
-            uint32_t last_rec_rule_id = 0;
-            auto n_opt = max_times < 0 ? 1 : max_times - min_times;
-
-            std::vector<llama_grammar_element> rec_rule(previous_elements);
-            for (int i = 0; i < n_opt; i++) {
-                rec_rule.resize(previous_elements.size());
-                uint32_t rec_rule_id = generate_symbol_id(state, rule_name);
-                if (i > 0 || max_times < 0) {
-                    rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, max_times < 0 ? rec_rule_id : last_rec_rule_id});
-                }
-                rec_rule.push_back({LLAMA_GRETYPE_ALT, 0});
-                rec_rule.push_back({LLAMA_GRETYPE_END, 0});
-                add_rule(state, rec_rule_id, rec_rule);
-                last_rec_rule_id = rec_rule_id;
-            }
-            if (n_opt > 0) {
-                out_elements.push_back({LLAMA_GRETYPE_RULE_REF, last_rec_rule_id});
-            }
-        };
-
-        while (*pos) {
-            if (*pos == '"') { // literal string
-                pos++;
-                last_sym_start = out_elements.size();
-                while (*pos != '"') {
-                    if (!*pos) {
-                        throw std::runtime_error("unexpected end of input");
-                    }
-                    auto char_pair = parse_char(pos);
-                         pos       = char_pair.second;
-                    out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
-                }
-                pos = parse_space(pos + 1, is_nested);
-            } else if (*pos == '[') { // char range(s)
-                pos++;
-                enum llama_gretype start_type = LLAMA_GRETYPE_CHAR;
-                if (*pos == '^') {
-                    pos++;
-                    start_type = LLAMA_GRETYPE_CHAR_NOT;
-                }
-                last_sym_start = out_elements.size();
-                while (*pos != ']') {
-                    if (!*pos) {
-                        throw std::runtime_error("unexpected end of input");
-                    }
-                    auto char_pair = parse_char(pos);
-                         pos       = char_pair.second;
-                    enum llama_gretype type = last_sym_start < out_elements.size()
-                        ? LLAMA_GRETYPE_CHAR_ALT
-                        : start_type;
-
-                    out_elements.push_back({type, char_pair.first});
-                    if (pos[0] == '-' && pos[1] != ']') {
-                        if (!pos[1]) {
-                            throw std::runtime_error("unexpected end of input");
-                        }
-                        auto endchar_pair = parse_char(pos + 1);
-                             pos          = endchar_pair.second;
-                        out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
-                    }
-                }
-                pos = parse_space(pos + 1, is_nested);
-            } else if (is_word_char(*pos)) { // rule reference
-                const char * name_end    = parse_name(pos);
-                uint32_t     ref_rule_id = get_symbol_id(state, pos, name_end - pos);
-                pos = parse_space(name_end, is_nested);
-                last_sym_start = out_elements.size();
-                out_elements.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id});
-            } else if (*pos == '(') { // grouping
-                // parse nested alternates into synthesized rule
-                pos = parse_space(pos + 1, true);
-                uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
-                pos = parse_alternates(state, pos, rule_name, sub_rule_id, true);
-                last_sym_start = out_elements.size();
-                // output reference to synthesized rule
-                out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
-                if (*pos != ')') {
-                    throw std::runtime_error(std::string("expecting ')' at ") + pos);
-                }
-                pos = parse_space(pos + 1, is_nested);
-            } else if (*pos == '.') { // any char
-                last_sym_start = out_elements.size();
-                out_elements.push_back({LLAMA_GRETYPE_CHAR_ANY, 0});
-                pos = parse_space(pos + 1, is_nested);
-            } else if (*pos == '*') {
-                pos = parse_space(pos + 1, is_nested);
-                handle_repetitions(0, -1);
-            } else if (*pos == '+') {
-                pos = parse_space(pos + 1, is_nested);
-                handle_repetitions(1, -1);
-            } else if (*pos == '?') {
-                pos = parse_space(pos + 1, is_nested);
-                handle_repetitions(0, 1);
-            } else if (*pos == '{') {
-                pos = parse_space(pos + 1, is_nested);
-
-                if (!is_digit_char(*pos)) {
-                    throw std::runtime_error(std::string("expecting an int at ") + pos);
-                }
-                const char * int_end = parse_int(pos);
-                int min_times = std::stoul(std::string(pos, int_end - pos));
-                pos = parse_space(int_end, is_nested);
-
-                int max_times = -1;
-
-                if (*pos == '}') {
-                    max_times = min_times;
-                    pos = parse_space(pos + 1, is_nested);
-                } else if (*pos == ',') {
-                    pos = parse_space(pos + 1, is_nested);
-
-                    if (is_digit_char(*pos)) {
-                        const char * int_end = parse_int(pos);
-                        max_times = std::stoul(std::string(pos, int_end - pos));
-                        pos = parse_space(int_end, is_nested);
-                    }
-
-                    if (*pos != '}') {
-                        throw std::runtime_error(std::string("expecting '}' at ") + pos);
-                    }
-                    pos = parse_space(pos + 1, is_nested);
-                } else {
-                    throw std::runtime_error(std::string("expecting ',' at ") + pos);
-                }
-                handle_repetitions(min_times, max_times);
-            } else {
-                break;
-            }
-        }
-        return pos;
-    }
-
-    const char * parse_alternates(
-            parse_state       & state,
-            const char        * src,
-            const std::string & rule_name,
-            uint32_t            rule_id,
-            bool                is_nested) {
-        std::vector<llama_grammar_element> rule;
-        const char * pos = parse_sequence(state, src, rule_name, rule, is_nested);
-        while (*pos == '|') {
-            rule.push_back({LLAMA_GRETYPE_ALT, 0});
-            pos = parse_space(pos + 1, true);
-            pos = parse_sequence(state, pos, rule_name, rule, is_nested);
-        }
-        rule.push_back({LLAMA_GRETYPE_END, 0});
-        add_rule(state, rule_id, rule);
-        return pos;
-    }
-
-    static const char * parse_rule(parse_state & state, const char * src) {
-        const char * name_end = parse_name(src);
-        const char * pos      = parse_space(name_end, false);
-        size_t       name_len = name_end - src;
-        uint32_t     rule_id  = get_symbol_id(state, src, name_len);
-        const std::string name(src, name_len);
-
-        if (!(pos[0] == ':' && pos[1] == ':' && pos[2] == '=')) {
-            throw std::runtime_error(std::string("expecting ::= at ") + pos);
-        }
-        pos = parse_space(pos + 3, true);
-
-        pos = parse_alternates(state, pos, name, rule_id, false);
-
-        if (*pos == '\r') {
-            pos += pos[1] == '\n' ? 2 : 1;
-        } else if (*pos == '\n') {
-            pos++;
-        } else if (*pos) {
-            throw std::runtime_error(std::string("expecting newline or end at ") + pos);
-        }
-        return parse_space(pos, true);
-    }
-
-    parse_state parse(const char * src) {
-        try {
-            parse_state state;
-            const char * pos = parse_space(src, true);
-            while (*pos) {
-                pos = parse_rule(state, pos);
-            }
-            // Validate the state to ensure that all rules are defined
-            for (const auto & rule : state.rules) {
-                for (const auto & elem : rule) {
-                    if (elem.type == LLAMA_GRETYPE_RULE_REF) {
-                        // Ensure that the rule at that location exists
-                        if (elem.value >= state.rules.size() || state.rules[elem.value].empty()) {
-                            // Get the name of the rule that is missing
-                            for (const auto & kv : state.symbol_ids) {
-                                if (kv.second == elem.value) {
-                                    throw std::runtime_error("Undefined rule identifier '" + kv.first + "'");
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-            return state;
-        } catch (const std::exception & err) {
-            fprintf(stderr, "%s: error parsing grammar: %s\n", __func__, err.what());
-            return parse_state();
-        }
-    }
-
-    static void print_grammar_char(FILE * file, uint32_t c) {
-        if (0x20 <= c && c <= 0x7f) {
-            fprintf(file, "%c", static_cast<char>(c));
-        } else {
-            // cop out of encoding UTF-8
-            fprintf(file, "<U+%04X>", c);
-        }
-    }
-
-    static bool is_char_element(llama_grammar_element elem) {
-        switch (elem.type) {
-            case LLAMA_GRETYPE_CHAR:           return true;
-            case LLAMA_GRETYPE_CHAR_NOT:       return true;
-            case LLAMA_GRETYPE_CHAR_ALT:       return true;
-            case LLAMA_GRETYPE_CHAR_RNG_UPPER: return true;
-            case LLAMA_GRETYPE_CHAR_ANY:       return true;
-            default:                           return false;
-        }
-    }
-
-    static void print_rule_binary(FILE * file, const std::vector<llama_grammar_element> & rule) {
-        for (auto elem : rule) {
-            switch (elem.type) {
-                case LLAMA_GRETYPE_END:            fprintf(file, "END");            break;
-                case LLAMA_GRETYPE_ALT:            fprintf(file, "ALT");            break;
-                case LLAMA_GRETYPE_RULE_REF:       fprintf(file, "RULE_REF");       break;
-                case LLAMA_GRETYPE_CHAR:           fprintf(file, "CHAR");           break;
-                case LLAMA_GRETYPE_CHAR_NOT:       fprintf(file, "CHAR_NOT");       break;
-                case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
-                case LLAMA_GRETYPE_CHAR_ALT:       fprintf(file, "CHAR_ALT");       break;
-                case LLAMA_GRETYPE_CHAR_ANY:       fprintf(file, "CHAR_ANY");       break;
-            }
-            switch (elem.type) {
-                case LLAMA_GRETYPE_END:
-                case LLAMA_GRETYPE_ALT:
-                case LLAMA_GRETYPE_RULE_REF:
-                    fprintf(file, "(%u) ", elem.value);
-                    break;
-                case LLAMA_GRETYPE_CHAR:
-                case LLAMA_GRETYPE_CHAR_NOT:
-                case LLAMA_GRETYPE_CHAR_RNG_UPPER:
-                case LLAMA_GRETYPE_CHAR_ALT:
-                case LLAMA_GRETYPE_CHAR_ANY:
-                    fprintf(file, "(\"");
-                    print_grammar_char(file, elem.value);
-                    fprintf(file, "\") ");
-                    break;
-            }
-        }
-        fprintf(file, "\n");
-    }
-
-    static void print_rule(
-            FILE     * file,
-            uint32_t   rule_id,
-            const std::vector<llama_grammar_element> & rule,
-            const std::map<uint32_t, std::string>    & symbol_id_names) {
-        if (rule.empty() || rule.back().type != LLAMA_GRETYPE_END) {
-            throw std::runtime_error(
-                "malformed rule, does not end with LLAMA_GRETYPE_END: " + std::to_string(rule_id));
-        }
-        fprintf(file, "%s ::= ", symbol_id_names.at(rule_id).c_str());
-        for (size_t i = 0, end = rule.size() - 1; i < end; i++) {
-            llama_grammar_element elem = rule[i];
-            switch (elem.type) {
-                case LLAMA_GRETYPE_END:
-                    throw std::runtime_error(
-                        "unexpected end of rule: " + std::to_string(rule_id) + "," +
-                        std::to_string(i));
-                case LLAMA_GRETYPE_ALT:
-                    fprintf(file, "| ");
-                    break;
-                case LLAMA_GRETYPE_RULE_REF:
-                    fprintf(file, "%s ", symbol_id_names.at(elem.value).c_str());
-                    break;
-                case LLAMA_GRETYPE_CHAR:
-                    fprintf(file, "[");
-                    print_grammar_char(file, elem.value);
-                    break;
-                case LLAMA_GRETYPE_CHAR_NOT:
-                    fprintf(file, "[^");
-                    print_grammar_char(file, elem.value);
-                    break;
-                case LLAMA_GRETYPE_CHAR_RNG_UPPER:
-                    if (i == 0 || !is_char_element(rule[i - 1])) {
-                        throw std::runtime_error(
-                            "LLAMA_GRETYPE_CHAR_RNG_UPPER without preceding char: " +
-                            std::to_string(rule_id) + "," + std::to_string(i));
-                    }
-                    fprintf(file, "-");
-                    print_grammar_char(file, elem.value);
-                    break;
-                case LLAMA_GRETYPE_CHAR_ALT:
-                    if (i == 0 || !is_char_element(rule[i - 1])) {
-                        throw std::runtime_error(
-                            "LLAMA_GRETYPE_CHAR_ALT without preceding char: " +
-                            std::to_string(rule_id) + "," + std::to_string(i));
-                    }
-                    print_grammar_char(file, elem.value);
-                    break;
-                case LLAMA_GRETYPE_CHAR_ANY:
-                    fprintf(file, ".");
-                    break;
-            }
-            if (is_char_element(elem)) {
-                switch (rule[i + 1].type) {
-                    case LLAMA_GRETYPE_CHAR_ALT:
-                    case LLAMA_GRETYPE_CHAR_RNG_UPPER:
-                    case LLAMA_GRETYPE_CHAR_ANY:
-                        break;
-                    default:
-                        fprintf(file, "] ");
-                }
-            }
-        }
-        fprintf(file, "\n");
-    }
-
-    void print_grammar(FILE * file, const parse_state & state) {
-        try {
-            std::map<uint32_t, std::string> symbol_id_names;
-            for (const auto & kv : state.symbol_ids) {
-                symbol_id_names[kv.second] = kv.first;
-            }
-            for (size_t i = 0, end = state.rules.size(); i < end; i++) {
-                // fprintf(file, "%zu: ", i);
-                // print_rule_binary(file, state.rules[i]);
-                print_rule(file, uint32_t(i), state.rules[i], symbol_id_names);
-                // fprintf(file, "\n");
-            }
-        } catch (const std::exception & err) {
-            fprintf(stderr, "\n%s: error printing grammar: %s\n", __func__, err.what());
-        }
-    }
-
-    std::vector<const llama_grammar_element *> parse_state::c_rules() {
-        std::vector<const llama_grammar_element *> ret;
-        ret.reserve(rules.size());
-        for (const auto & rule : rules) {
-            ret.push_back(rule.data());
-        }
-        return ret;
-    }
-}
--- a/common/grammar-parser.h
+++ b/common/grammar-parser.h
@@ -1,29 +0,0 @@
-// Implements a parser for an extended Backus-Naur form (BNF), producing the
-// binary context-free grammar format specified by llama.h. Supports character
-// ranges, grouping, and repetition operators. As an example, a grammar for
-// arithmetic might look like:
-//
-// root  ::= expr
-// expr  ::= term ([-+*/] term)*
-// term  ::= num | "(" space expr ")" space
-// num   ::= [0-9]+ space
-// space ::= [ \t\n]*
-
-#pragma once
-#include "llama.h"
-#include <vector>
-#include <map>
-#include <cstdint>
-#include <string>
-
-namespace grammar_parser {
-    struct parse_state {
-        std::map<std::string, uint32_t>                 symbol_ids;
-        std::vector<std::vector<llama_grammar_element>> rules;
-
-        std::vector<const llama_grammar_element *> c_rules();
-    };
-
-    parse_state parse(const char * src);
-    void print_grammar(FILE * file, const parse_state & state);
-}
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -1,141 +1,28 @@
-#define LLAMA_API_INTERNAL
 #include "sampling.h"
-#include <random>

-struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) {
-    struct llama_sampling_context * result = new llama_sampling_context();
+#include "common.h"

-    result->params  = params;
-    result->grammar = nullptr;
-
-    // if there is a grammar, parse it
-    if (!params.grammar.empty()) {
-        result->parsed_grammar = grammar_parser::parse(params.grammar.c_str());
-
-        // will be empty (default) if there are parse errors
-        if (result->parsed_grammar.rules.empty()) {
-            fprintf(stderr, "%s: failed to parse grammar\n", __func__);
-            delete result;
-            return nullptr;
-        }
-
-        // Ensure that there is a "root" node.
-        if (result->parsed_grammar.symbol_ids.find("root") == result->parsed_grammar.symbol_ids.end()) {
-            fprintf(stderr, "%s: grammar does not contain a 'root' symbol\n", __func__);
-            delete result;
-            return nullptr;
-        }
-
-        std::vector<const llama_grammar_element *> grammar_rules(result->parsed_grammar.c_rules());
-
-        struct llama_grammar * grammar = llama_grammar_init(
-                grammar_rules.data(),
-                grammar_rules.size(), result->parsed_grammar.symbol_ids.at("root"));
-        if (grammar == nullptr) {
-            throw std::runtime_error("Failed to initialize llama_grammar");
-        }
-        result->grammar = grammar;
-    }
-
-    result->prev.resize(params.n_prev);
-
-    result->n_valid = 0;
-
-    llama_sampling_set_rng_seed(result, params.seed);
-
-    return result;
-}
-
-void llama_sampling_free(struct llama_sampling_context * ctx) {
-    if (ctx->grammar != NULL) {
-        llama_grammar_free(ctx->grammar);
-    }
-
-    delete ctx;
-}
-
-void llama_sampling_reset(llama_sampling_context * ctx) {
-    if (ctx->grammar != NULL) {
-        llama_grammar_free(ctx->grammar);
-        ctx->grammar = NULL;
-    }
-
-    if (!ctx->parsed_grammar.rules.empty()) {
-        std::vector<const llama_grammar_element *> grammar_rules(ctx->parsed_grammar.c_rules());
-
-        struct llama_grammar * grammar = llama_grammar_init(
-                grammar_rules.data(),
-                grammar_rules.size(), ctx->parsed_grammar.symbol_ids.at("root"));
-        if (grammar == nullptr) {
-            throw std::runtime_error("Failed to initialize llama_grammar");
-        }
-        ctx->grammar = grammar;
-    }
-
-    std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
-    ctx->cur.clear();
-    ctx->n_valid = 0;
-}
-
-void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) {
-    if (seed == LLAMA_DEFAULT_SEED) {
-        seed = std::random_device{}();
-    }
-    ctx->rng.seed(seed);
-}
-
-void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst) {
-    if (dst->grammar) {
-        llama_grammar_free(dst->grammar);
-        dst->grammar = nullptr;
-    }
-
-    if (src->grammar) {
-        dst->grammar = llama_grammar_copy(src->grammar);
-    }
-
-    dst->prev = src->prev;
-}
-
-llama_token llama_sampling_last(llama_sampling_context * ctx) {
-    return ctx->prev.back();
-}
-
-std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n) {
-    const int size = ctx_sampling->prev.size();
-
-    n = std::min(n, size);
-
-    std::string result;
-
-    for (int i = size - n; i < size; i++) {
-        result += llama_token_to_piece(ctx_main, ctx_sampling->prev[i]);
-    }
-
-    return result;
-}
-
-std::string llama_sampling_print(const llama_sampling_params & params) {
+std::string gpt_sampling_params::print_all() const {
    char result[1024];

    snprintf(result, sizeof(result),
            "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
            "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
            "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
-            params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present,
-            params.top_k, params.tfs_z, params.top_p, params.min_p, params.typical_p, params.temp,
-            params.mirostat, params.mirostat_eta, params.mirostat_tau);
+            penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
+            top_k, tfs_z, top_p, min_p, typ_p, temp,
+            mirostat, mirostat_eta, mirostat_tau);

    return std::string(result);
 }

-std::string llama_sampling_order_print(const llama_sampling_params & params) {
+std::string gpt_sampling_params::print_samplers() const {
    std::string result = "CFG -> Penalties ";
-    if (params.mirostat == 0) {
-        for (auto sampler_type : params.samplers_sequence) {
-            const auto sampler_type_name = llama_sampling_type_to_str(sampler_type);
-            if (!sampler_type_name.empty()) {
-                result += "-> " + sampler_type_name + " ";
+    if (mirostat == 0) {
+        for (const auto & sampler : samplers) {
+            const auto name = llama_sampling_type_to_str(sampler);
+            if (!name.empty()) {
+                result += "-> " + name + " ";
            }
        }
    } else {
@@ -145,316 +32,191 @@ std::string llama_sampling_order_print(const llama_sampling_params & params) {
    return result;
 }

-std::string llama_sampling_type_to_str(llama_sampler_type sampler_type) {
-    switch (sampler_type) {
-        case llama_sampler_type::TOP_K:       return "top_k";
-        case llama_sampler_type::TFS_Z:       return "tfs_z";
-        case llama_sampler_type::TYPICAL_P:   return "typical_p";
-        case llama_sampler_type::TOP_P:       return "top_p";
-        case llama_sampler_type::MIN_P:       return "min_p";
-        case llama_sampler_type::TEMPERATURE: return "temperature";
+struct llama_sampling * llama_sampling_init(const struct llama_model * model, const struct gpt_sampling_params & params) { // build a llama_sampling object from the common (gpt_) sampling params
+    llama_sampling_params lparams = llama_sampling_default_params(); // start from llama_sampling_default_params(), then override the fields below
+
+    lparams.seed              = params.seed;
+    lparams.n_prev            = params.n_prev;
+    lparams.n_probs           = params.n_probs;
+    lparams.min_keep          = params.min_keep;
+    lparams.top_k             = params.top_k;
+    lparams.top_p             = params.top_p;
+    lparams.min_p             = params.min_p;
+    lparams.tfs_z             = params.tfs_z;
+    lparams.typ_p             = params.typ_p;
+    lparams.temp              = params.temp;
+    lparams.dynatemp_range    = params.dynatemp_range;
+    lparams.dynatemp_exponent = params.dynatemp_exponent;
+    lparams.penalty_last_n    = params.penalty_last_n;
+    lparams.penalty_repeat    = params.penalty_repeat;
+    lparams.penalty_freq      = params.penalty_freq;
+    lparams.penalty_present   = params.penalty_present;
+    lparams.mirostat          = params.mirostat;
+    lparams.mirostat_tau      = params.mirostat_tau;
+    lparams.mirostat_eta      = params.mirostat_eta;
+    lparams.penalize_nl       = params.penalize_nl;
+    lparams.ignore_eos        = params.ignore_eos;
+
+    lparams.n_samplers = params.samplers.size(); // the sampler order is copied element by element below
+    for (int i = 0; i < lparams.n_samplers; i++) { // NOTE(review): no bounds check against the capacity of lparams.samplers is visible here - confirm against llama.h
+        lparams.samplers[i] = params.samplers[i];
+    }
+
+    struct llama_sampling * result = llama_sampling_init(model, lparams); // resolves to the overload that takes llama_sampling_params
+
+    llama_sampling_set_grammar   (result, params.grammar.c_str(), "root"); // called unconditionally, also when params.grammar is empty; "root" is presumably the start rule name - confirm in llama.h
+    llama_sampling_set_logit_bias(result, params.logit_bias.size(), params.logit_bias.data()); // passes the bias entries as (count, pointer)
+
+    return result;
+}
+
+void llama_sampling_cp(llama_sampling * src, llama_sampling *& dst) { // replace dst with a copy of src, freeing whatever dst held before
+    llama_sampling * copy = llama_sampling_cp(src); // copy BEFORE freeing: src is still alive even when dst aliases it
+    if (dst) {
+        llama_sampling_free(dst);
+    }
+    dst = copy; // dst now holds the fresh copy
+}
+
+llama_token llama_sampling_sample(
+        struct llama_sampling * smpl,
+        struct llama_context * ctx,
+        int idx) { // idx is forwarded to llama_get_logits_ith(ctx, idx) to select the logits to sample from
+    llama_sampling_set_logits(smpl, llama_get_logits_ith(ctx, idx)); // hand the selected logits to the sampler
+
+    // first, sample the token without any grammar constraints
+    const llama_token id = llama_sampling_sample(smpl, nullptr); // nullptr: presumably "use the sampler's own candidates" - confirm in llama.h
+
+    // create an array with a single token data element for the sampled id
+    llama_token_data       single_token_data       = { id, 1.0f, 0.0f }; // the logit (1.0f) is a dummy finite value; only whether it becomes -INFINITY matters below
+    llama_token_data_array single_token_data_array = { &single_token_data, 1, false };
+
+    llama_sampling_grammar(smpl, &single_token_data_array); // fast path: grammar-check only the sampled token
+
+    // check if the token is valid according to the grammar by seeing if its logit has been set to -INFINITY
+    const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
+    if (is_valid) {
+        return id;
+    }
+
+    // if the token is not valid, sample again, after applying the grammar constraints
+    llama_sampling_set_logits(smpl, llama_get_logits_ith(ctx, idx)); // set the logits again (presumably discards changes made by the first pass - confirm)
+
+    llama_sampling_grammar(smpl, nullptr); // slow path: apply the grammar (nullptr: presumably to the sampler's own candidates - confirm)
+
+    return llama_sampling_sample(smpl, nullptr);
+}
+
+std::string llama_sampling_prev_str(llama_sampling * smpl, llama_context * ctx_main, int n) { // concatenate the text pieces of up to n tokens from the sampler's history
+    n = std::min(n, llama_sampling_n_prev(smpl)); // clamp n to llama_sampling_n_prev(smpl)
+
+    if (n <= 0) { // also covers a negative n passed by the caller
+        return "";
+    }
+
+    std::string result;
+    result.reserve(8*n); // pre-size assuming ~8 bytes per token (unverified guess), TODO: compute this from the vocab
+
+    for (int i = n - 1; i >= 0; i--) { // walks i from n-1 down to 0; presumably index 0 is the most recent token, giving oldest-to-newest text - confirm in llama.h
+        const llama_token id = llama_sampling_prev(smpl, i);
+
+        GGML_ASSERT(id != LLAMA_TOKEN_NULL && "null token in the sampling history - should not happen");
+
+        result += llama_token_to_piece(ctx_main, id); // append the token's text piece
+    }
+
+    return result;
+}
+
+char llama_sampling_type_to_chr(llama_sampler_type sampler) { // one-character code for a sampler type; llama_sampling_types_from_chars builds its lookup table from these codes
+    switch (sampler) {
+        case LLAMA_SAMPLER_TYPE_TOP_K:       return 'k';
+        case LLAMA_SAMPLER_TYPE_TFS_Z:       return 'f';
+        case LLAMA_SAMPLER_TYPE_TYPICAL_P:   return 'y';
+        case LLAMA_SAMPLER_TYPE_TOP_P:       return 'p';
+        case LLAMA_SAMPLER_TYPE_MIN_P:       return 'm';
+        case LLAMA_SAMPLER_TYPE_TEMPERATURE: return 't';
+        default : return '?'; // any sampler type without a code above
+    }
+}
+
+std::string llama_sampling_type_to_str(llama_sampler_type sampler) { // canonical name of a sampler type; empty string for any type not listed (print_samplers skips empty names)
+    switch (sampler) {
+        case LLAMA_SAMPLER_TYPE_TOP_K:       return "top_k";
+        case LLAMA_SAMPLER_TYPE_TFS_Z:       return "tfs_z";
+        case LLAMA_SAMPLER_TYPE_TYPICAL_P:   return "typ_p"; // same spelling as the canonical key accepted by llama_sampling_types_from_names
+        case LLAMA_SAMPLER_TYPE_TOP_P:       return "top_p";
+        case LLAMA_SAMPLER_TYPE_MIN_P:       return "min_p";
+        case LLAMA_SAMPLER_TYPE_TEMPERATURE: return "temperature";
        default : return "";
    }
 }

 std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
    std::unordered_map<std::string, llama_sampler_type> sampler_canonical_name_map {
-        {"top_k",       llama_sampler_type::TOP_K},
-        {"top_p",       llama_sampler_type::TOP_P},
-        {"typical_p",   llama_sampler_type::TYPICAL_P},
-        {"min_p",       llama_sampler_type::MIN_P},
-        {"tfs_z",       llama_sampler_type::TFS_Z},
-        {"temperature", llama_sampler_type::TEMPERATURE}
+        { "top_k",       LLAMA_SAMPLER_TYPE_TOP_K },
+        { "top_p",       LLAMA_SAMPLER_TYPE_TOP_P },
+        { "typ_p",       LLAMA_SAMPLER_TYPE_TYPICAL_P },
+        { "min_p",       LLAMA_SAMPLER_TYPE_MIN_P },
+        { "tfs_z",       LLAMA_SAMPLER_TYPE_TFS_Z },
+        { "temperature", LLAMA_SAMPLER_TYPE_TEMPERATURE },
    };

    // since samplers names are written multiple ways
    // make it ready for both system names and input names
    std::unordered_map<std::string, llama_sampler_type> sampler_alt_name_map {
-        {"top-k",       llama_sampler_type::TOP_K},
-        {"top-p",       llama_sampler_type::TOP_P},
-        {"nucleus",     llama_sampler_type::TOP_P},
-        {"typical-p",   llama_sampler_type::TYPICAL_P},
-        {"typical",     llama_sampler_type::TYPICAL_P},
-        {"min-p",       llama_sampler_type::MIN_P},
-        {"tfs-z",       llama_sampler_type::TFS_Z},
-        {"tfs",         llama_sampler_type::TFS_Z},
-        {"temp",        llama_sampler_type::TEMPERATURE}
+        { "top-k",       LLAMA_SAMPLER_TYPE_TOP_K },
+        { "top-p",       LLAMA_SAMPLER_TYPE_TOP_P },
+        { "nucleus",     LLAMA_SAMPLER_TYPE_TOP_P },
+        { "typical-p",   LLAMA_SAMPLER_TYPE_TYPICAL_P },
+        { "typical",     LLAMA_SAMPLER_TYPE_TYPICAL_P },
+        { "typ-p",       LLAMA_SAMPLER_TYPE_TYPICAL_P },
+        { "typ",         LLAMA_SAMPLER_TYPE_TYPICAL_P },
+        { "min-p",       LLAMA_SAMPLER_TYPE_MIN_P },
+        { "tfs-z",       LLAMA_SAMPLER_TYPE_TFS_Z },
+        { "tfs",         LLAMA_SAMPLER_TYPE_TFS_Z },
+        { "temp",        LLAMA_SAMPLER_TYPE_TEMPERATURE },
    };

-    std::vector<llama_sampler_type> sampler_types;
-    sampler_types.reserve(names.size());
-    for (const auto & name : names)
-    {
-        auto sampler_item = sampler_canonical_name_map.find(name);
-        if (sampler_item != sampler_canonical_name_map.end())
-        {
-            sampler_types.push_back(sampler_item->second);
-        }
-        else
-        {
-            if (allow_alt_names)
-            {
-                sampler_item = sampler_alt_name_map.find(name);
-                if (sampler_item != sampler_alt_name_map.end())
-                {
-                    sampler_types.push_back(sampler_item->second);
-                }
-            }
-        }
-    }
-    return sampler_types;
-}
+    std::vector<llama_sampler_type> samplers;
+    samplers.reserve(names.size());

-std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string) {
-    std::unordered_map<char, llama_sampler_type> sampler_name_map {
-        {'k', llama_sampler_type::TOP_K},
-        {'p', llama_sampler_type::TOP_P},
-        {'y', llama_sampler_type::TYPICAL_P},
-        {'m', llama_sampler_type::MIN_P},
-        {'f', llama_sampler_type::TFS_Z},
-        {'t', llama_sampler_type::TEMPERATURE}
-    };
-
-    std::vector<llama_sampler_type> sampler_types;
-    sampler_types.reserve(names_string.size());
-    for (const auto & c : names_string) {
-        const auto sampler_item = sampler_name_map.find(c);
-        if (sampler_item != sampler_name_map.end()) {
-            sampler_types.push_back(sampler_item->second);
-        }
-    }
-    return sampler_types;
-}
-
-// no reasons to expose this function in header
-static void sampler_queue(
-                   struct llama_context * ctx_main,
-            const llama_sampling_params & params,
-                 llama_token_data_array & cur_p,
-                                 size_t   min_keep) {
-    const float         temp              = params.temp;
-    const float         dynatemp_range    = params.dynatemp_range;
-    const float         dynatemp_exponent = params.dynatemp_exponent;
-    const int32_t       top_k             = params.top_k;
-    const float         top_p             = params.top_p;
-    const float         min_p             = params.min_p;
-    const float         tfs_z             = params.tfs_z;
-    const float         typical_p         = params.typical_p;
-    const std::vector<llama_sampler_type> & samplers_sequence = params.samplers_sequence;
-
-    for (auto sampler_type : samplers_sequence) {
-        switch (sampler_type) {
-            case llama_sampler_type::TOP_K    : llama_sample_top_k    (ctx_main, &cur_p, top_k,     min_keep); break;
-            case llama_sampler_type::TFS_Z    : llama_sample_tail_free(ctx_main, &cur_p, tfs_z,     min_keep); break;
-            case llama_sampler_type::TYPICAL_P: llama_sample_typical  (ctx_main, &cur_p, typical_p, min_keep); break;
-            case llama_sampler_type::TOP_P    : llama_sample_top_p    (ctx_main, &cur_p, top_p,     min_keep); break;
-            case llama_sampler_type::MIN_P    : llama_sample_min_p    (ctx_main, &cur_p, min_p,     min_keep); break;
-            case llama_sampler_type::TEMPERATURE:
-                if (dynatemp_range > 0) {
-                    float dynatemp_min = std::max(0.0f, temp - dynatemp_range);
-                    float dynatemp_max = std::max(0.0f, temp + dynatemp_range);
-                    llama_sample_entropy(ctx_main, &cur_p, dynatemp_min, dynatemp_max, dynatemp_exponent);
-                } else {
-                    llama_sample_temp(ctx_main, &cur_p, temp);
-                }
-                break;
-            default : break;
-        }
-    }
-}
-
-static llama_token llama_sampling_sample_impl(
-                  struct llama_sampling_context * ctx_sampling,
-                  struct llama_context * ctx_main,
-                  struct llama_context * ctx_cfg,
-                  const int idx,
-                  bool is_resampling) {
-    const llama_sampling_params & params = ctx_sampling->params;
-
-    const float   temp            = params.temp;
-    const int     mirostat        = params.mirostat;
-    const float   mirostat_tau    = params.mirostat_tau;
-    const float   mirostat_eta    = params.mirostat_eta;
-
-    std::vector<float> original_logits;
-    auto cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx, /* apply_grammar= */ is_resampling, &original_logits);
-    if (ctx_sampling->grammar != NULL && !is_resampling) {
-        GGML_ASSERT(!original_logits.empty());
-    }
-    llama_token id = 0;
-
-    if (temp < 0.0) {
-        // greedy sampling, with probs
-        llama_sample_softmax(ctx_main, &cur_p);
-        id = cur_p.data[0].id;
-    } else if (temp == 0.0) {
-        // greedy sampling, no probs
-        id = llama_sample_token_greedy(ctx_main, &cur_p);
-    } else {
-        if (mirostat == 1) {
-            const int mirostat_m = 100;
-            llama_sample_temp(ctx_main, &cur_p, temp);
-            id = llama_sample_token_mirostat(ctx_main, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &ctx_sampling->mirostat_mu);
-        } else if (mirostat == 2) {
-            llama_sample_temp(ctx_main, &cur_p, temp);
-            id = llama_sample_token_mirostat_v2(ctx_main, &cur_p, mirostat_tau, mirostat_eta, &ctx_sampling->mirostat_mu);
+    for (const auto & name : names) {
+        auto sampler = sampler_canonical_name_map.find(name);
+        if (sampler != sampler_canonical_name_map.end()) {
+            samplers.push_back(sampler->second);
        } else {
-            // temperature sampling
-            size_t min_keep = std::max(1, params.min_keep);
-
-            sampler_queue(ctx_main, params, cur_p, min_keep);
-
-            id = llama_sample_token_with_rng(ctx_main, &cur_p, ctx_sampling->rng);
-
-            //{
-            //    const int n_top = 10;
-            //    LOG("top %d candidates:\n", n_top);
-
-            //    for (int i = 0; i < n_top; i++) {
-            //        const llama_token id = cur_p.data[i].id;
-            //        (void)id; // To avoid a warning that id is unused when logging is disabled.
-            //        LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx_main, id).c_str(), cur_p.data[i].p);
-            //    }
-            //}
-
-            //LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx_main, id).c_str());
-        }
-    }
-
-    if (ctx_sampling->grammar != NULL && !is_resampling) {
-        // Get a pointer to the logits
-        float * logits = llama_get_logits_ith(ctx_main, idx);
-
-        // Create an array with a single token data element for the sampled id
-        llama_token_data single_token_data = {id, logits[id], 0.0f};
-        llama_token_data_array single_token_data_array = { &single_token_data, 1, false };
-
-        // Apply grammar constraints to the single token
-        llama_grammar_sample(ctx_sampling->grammar, ctx_main, &single_token_data_array);
-
-        // Check if the token is valid according to the grammar by seeing if its logit has been set to -INFINITY
-        bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
-
-        // If the token is not valid according to the grammar, perform resampling
-        if (!is_valid) {
-            LOG("Resampling because token %d: '%s' does not meet grammar rules\n", id, llama_token_to_piece(ctx_main, id).c_str());
-
-            // Restore logits from the copy
-            std::copy(original_logits.begin(), original_logits.end(), logits);
-
-            return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ true);
-        }
-    }
-
-    ctx_sampling->n_valid = temp == 0.0f ? 0 : cur_p.size;
-
-    return id;
-}
-
-static llama_token_data_array llama_sampling_prepare_impl(
-                  struct llama_sampling_context * ctx_sampling,
-                  struct llama_context * ctx_main,
-                  struct llama_context * ctx_cfg,
-                  const int idx,
-                  bool apply_grammar,
-                  std::vector<float> * original_logits) {
-    const llama_sampling_params & params = ctx_sampling->params;
-
-    const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
-
-    const int32_t penalty_last_n  = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
-    const float   penalty_repeat  = params.penalty_repeat;
-    const float   penalty_freq    = params.penalty_freq;
-    const float   penalty_present = params.penalty_present;
-
-    const bool    penalize_nl     = params.penalize_nl;
-
-    auto & prev = ctx_sampling->prev;
-    auto & cur  = ctx_sampling->cur;
-
-    // Get a pointer to the logits
-    float * logits = llama_get_logits_ith(ctx_main, idx);
-
-    if (ctx_sampling->grammar != NULL && !apply_grammar) {
-        GGML_ASSERT(original_logits != NULL);
-        // Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this.
-        *original_logits = {logits, logits + n_vocab};
-    }
-
-    // apply params.logit_bias map
-    for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
-        logits[it->first] += it->second;
-    }
-
-    if (ctx_cfg) {
-        float * logits_guidance = llama_get_logits_ith(ctx_cfg, idx);
-        llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);
-    }
-
-    cur.resize(n_vocab);
-
-    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-        cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
-    }
-
-    llama_token_data_array cur_p = { cur.data(), cur.size(), false };
-
-    // apply penalties
-    const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev;
-    const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n);
-    if (penalty_tokens_used_size) {
-        const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];
-
-        llama_sample_repetition_penalties(ctx_main, &cur_p,
-                penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size,
-                penalty_tokens_used_size, penalty_repeat, penalty_freq, penalty_present);
-
-        if (!penalize_nl) {
-            for (size_t idx = 0; idx < cur_p.size; idx++) {
-                if (cur_p.data[idx].id == llama_token_nl(llama_get_model(ctx_main))) {
-                    cur_p.data[idx].logit = nl_logit;
-                    break;
+            if (allow_alt_names) {
+                sampler = sampler_alt_name_map.find(name);
+                if (sampler != sampler_alt_name_map.end()) {
+                    samplers.push_back(sampler->second);
                }
            }
        }
    }

-    // apply grammar checks before sampling logic
-    if (apply_grammar && ctx_sampling->grammar != NULL) {
-        llama_grammar_sample(ctx_sampling->grammar, ctx_main, &cur_p);
+    return samplers;
+}
+
+std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & chars) {
+    std::unordered_map<char, llama_sampler_type> sampler_name_map {
+        { llama_sampling_type_to_chr(LLAMA_SAMPLER_TYPE_TOP_K),       LLAMA_SAMPLER_TYPE_TOP_K },
+        { llama_sampling_type_to_chr(LLAMA_SAMPLER_TYPE_TFS_Z),       LLAMA_SAMPLER_TYPE_TFS_Z },
+        { llama_sampling_type_to_chr(LLAMA_SAMPLER_TYPE_TYPICAL_P),   LLAMA_SAMPLER_TYPE_TYPICAL_P },
+        { llama_sampling_type_to_chr(LLAMA_SAMPLER_TYPE_TOP_P),       LLAMA_SAMPLER_TYPE_TOP_P },
+        { llama_sampling_type_to_chr(LLAMA_SAMPLER_TYPE_MIN_P),       LLAMA_SAMPLER_TYPE_MIN_P },
+        { llama_sampling_type_to_chr(LLAMA_SAMPLER_TYPE_TEMPERATURE), LLAMA_SAMPLER_TYPE_TEMPERATURE }
+    };
+
+    std::vector<llama_sampler_type> samplers;
+    samplers.reserve(chars.size());
+
+    for (const auto & c : chars) {
+        const auto sampler = sampler_name_map.find(c);
+        if (sampler != sampler_name_map.end()) {
+            samplers.push_back(sampler->second);
+        }
    }

-    return cur_p;
-}
-
-llama_token llama_sampling_sample(
-                  struct llama_sampling_context * ctx_sampling,
-                  struct llama_context * ctx_main,
-                  struct llama_context * ctx_cfg,
-                  const int idx) {
-    // Call the implementation function with is_resampling set to false by default
-    return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ false);
-}
-
-llama_token_data_array llama_sampling_prepare(
-                  struct llama_sampling_context * ctx_sampling,
-                  struct llama_context * ctx_main,
-                  struct llama_context * ctx_cfg,
-                  const int idx,
-                  bool apply_grammar,
-                  std::vector<float> * original_logits) {
-    return llama_sampling_prepare_impl(ctx_sampling,ctx_main, ctx_cfg, idx, apply_grammar, original_logits);
-}
-
-void llama_sampling_accept(
-        struct llama_sampling_context * ctx_sampling,
-        struct llama_context * ctx_main,
-        llama_token id,
-        bool apply_grammar) {
-    ctx_sampling->prev.erase(ctx_sampling->prev.begin());
-    ctx_sampling->prev.push_back(id);
-
-    if (ctx_sampling->grammar != NULL && apply_grammar) {
-        llama_grammar_accept_token(ctx_sampling->grammar, ctx_main, id);
-    }
+    return samplers;
 }
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -2,159 +2,78 @@

 #include "llama.h"

-#include "grammar-parser.h"
-
-#include <random>
 #include <string>
-#include <unordered_map>
 #include <vector>

-// sampler types
-enum class llama_sampler_type : char {
-    TOP_K       = 'k',
-    TOP_P       = 'p',
-    MIN_P       = 'm',
-    TFS_Z       = 'f',
-    TYPICAL_P   = 'y',
-    TEMPERATURE = 't'
-};
-
 // sampling parameters
-typedef struct llama_sampling_params {
-    int32_t     n_prev                = 64;                 // number of previous tokens to remember
-    int32_t     n_probs               = 0;                  // if greater than 0, output the probabilities of top n_probs tokens.
-    int32_t     min_keep              = 0;                  // 0 = disabled, otherwise samplers should return at least min_keep tokens
-    int32_t     top_k                 = 40;                 // <= 0 to use vocab size
-    float       top_p                 = 0.95f;              // 1.0 = disabled
-    float       min_p                 = 0.05f;              // 0.0 = disabled
-    float       tfs_z                 = 1.00f;              // 1.0 = disabled
-    float       typical_p             = 1.00f;              // 1.0 = disabled
-    float       temp                  = 0.80f;              // <= 0.0 to sample greedily, 0.0 to not output probabilities
-    float       dynatemp_range        = 0.00f;              // 0.0 = disabled
-    float       dynatemp_exponent     = 1.00f;              // controls how entropy maps to temperature in dynamic temperature sampler
-    int32_t     penalty_last_n        = 64;                 // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    float       penalty_repeat        = 1.00f;              // 1.0 = disabled
-    float       penalty_freq          = 0.00f;              // 0.0 = disabled
-    float       penalty_present       = 0.00f;              // 0.0 = disabled
-    int32_t     mirostat              = 0;                  // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
-    float       mirostat_tau          = 5.00f;              // target entropy
-    float       mirostat_eta          = 0.10f;              // learning rate
-    bool        penalize_nl           = false;              // consider newlines as a repeatable token
-    uint32_t    seed                  = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampling_context
+typedef struct gpt_sampling_params {
+    uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampling

-    std::vector<llama_sampler_type> samplers_sequence = {
-        llama_sampler_type::TOP_K,
-        llama_sampler_type::TFS_Z,
-        llama_sampler_type::TYPICAL_P,
-        llama_sampler_type::TOP_P,
-        llama_sampler_type::MIN_P,
-        llama_sampler_type::TEMPERATURE
+    int32_t n_prev            = 64;    // number of previous tokens to remember
+    int32_t n_probs           = 0;     // if greater than 0, output the probabilities of top n_probs tokens.
+    int32_t min_keep          = 0;     // 0 = disabled, otherwise samplers should return at least min_keep tokens
+    int32_t top_k             = 40;    // <= 0 to use vocab size
+    float   top_p             = 0.95f; // 1.0 = disabled
+    float   min_p             = 0.05f; // 0.0 = disabled
+    float   tfs_z             = 1.00f; // 1.0 = disabled
+    float   typ_p             = 1.00f; // typical_p, 1.0 = disabled
+    float   temp              = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
+    float   dynatemp_range    = 0.00f; // 0.0 = disabled
+    float   dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
+    int32_t penalty_last_n    = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
+    float   penalty_repeat    = 1.00f; // 1.0 = disabled
+    float   penalty_freq      = 0.00f; // 0.0 = disabled
+    float   penalty_present   = 0.00f; // 0.0 = disabled
+    int32_t mirostat          = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    float   mirostat_tau      = 5.00f; // target entropy
+    float   mirostat_eta      = 0.10f; // learning rate
+    bool    penalize_nl       = false; // consider newlines as a repeatable token
+    bool    ignore_eos        = false;
+
+    std::vector<enum llama_sampler_type> samplers = {
+        LLAMA_SAMPLER_TYPE_TOP_K,
+        LLAMA_SAMPLER_TYPE_TFS_Z,
+        LLAMA_SAMPLER_TYPE_TYPICAL_P,
+        LLAMA_SAMPLER_TYPE_TOP_P,
+        LLAMA_SAMPLER_TYPE_MIN_P,
+        LLAMA_SAMPLER_TYPE_TEMPERATURE
    };

-    std::string grammar;  // optional BNF-like grammar to constrain sampling
+    std::string grammar; // optional BNF-like grammar to constrain sampling

-    // Classifier-Free Guidance
-    // https://arxiv.org/abs/2306.17806
-    std::string cfg_negative_prompt; // string to help guidance
-    float       cfg_scale     = 1.f; // how strong is guidance
+    std::vector<llama_logit_bias> logit_bias; // logit biases to apply

-    std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
+    // print the parameters into a string
+    std::string print_all() const;

-    std::vector<llama_token> penalty_prompt_tokens;
-    bool                     use_penalty_prompt_tokens = false;
-} llama_sampling_params;
+    // print the samplers into a string
+    std::string print_samplers() const;
+} gpt_sampling_params;

-// general sampler context
-// TODO: move to llama.h
-struct llama_sampling_context {
-    // parameters that will be used for sampling
-    llama_sampling_params params;
+// overload of llama_sampling_init using gpt_sampling_params
+struct llama_sampling * llama_sampling_init(const struct llama_model * model, const struct gpt_sampling_params & params);

-    // mirostat sampler state
-    float mirostat_mu;
+void llama_sampling_cp(llama_sampling * src, llama_sampling *& dst); // frees the sampler dst currently points to (if non-null) and makes dst point to a copy of src

-    llama_grammar * grammar;
-
-    // internal
-    grammar_parser::parse_state parsed_grammar;
-
-    // TODO: replace with ring-buffer
-    std::vector<llama_token>      prev;
-    std::vector<llama_token_data> cur;
-    size_t n_valid; // Number of correct top tokens with correct probabilities.
-
-    std::mt19937 rng;
-};
-
-#include "common.h"
-
-// Create a new sampling context instance.
-struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params);
-
-void llama_sampling_free(struct llama_sampling_context * ctx);
-
-// Reset the sampler context
-// - clear prev tokens
-// - reset grammar
-void llama_sampling_reset(llama_sampling_context * ctx);
-
-// Set the sampler seed
-void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed);
-
-// Copy the sampler context
-void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst);
-
-// Get the last sampled token
-llama_token llama_sampling_last(llama_sampling_context * ctx);
-
-// Get a string representation of the last sampled tokens
-std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n);
-
-// Print sampling parameters into a string
-std::string llama_sampling_print(const llama_sampling_params & params);
-
-// Print sampling order into a string
-std::string llama_sampling_order_print(const llama_sampling_params & params);
-
-std::string llama_sampling_type_to_str(llama_sampler_type sampler_type);
-
-std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
-std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string);
-
-// this is a common sampling function used across the examples for convenience
-// it can serve as a starting point for implementing your own sampling function
-// Note: When using multiple sequences, it is the caller's responsibility to call
-//       llama_sampling_reset when a sequence ends
+// common sampling implementation:
 //
-// required:
-//  - ctx_main:     context to use for sampling
-//  - ctx_sampling: sampling-specific context
-//
-// optional:
-//  - ctx_cfg:      context to use for classifier-free guidance
-//  - idx:          sample from llama_get_logits_ith(ctx, idx)
-//
-// returns:
-//  - token:      sampled token
-//  - candidates: vector of candidate tokens
+// - set logits
+// - apply the configured sampling constraints
+// - check if the token fits the grammar (if any)
+// - if not: resample by first applying the grammar constraints and then sampling again (slower path)
 //
 llama_token llama_sampling_sample(
-        struct llama_sampling_context * ctx_sampling,
-        struct llama_context * ctx_main,
-        struct llama_context * ctx_cfg,
-        int idx = -1);
+        struct llama_sampling * smpl,
+         struct llama_context * ctx,
+                          int   idx);

-// Prepares and adjusts the set of token candidates for sampling based on penalties, biases, and sampling parameters.
-llama_token_data_array llama_sampling_prepare(
-        struct llama_sampling_context * ctx_sampling,
-        struct llama_context * ctx_main,
-        struct llama_context * ctx_cfg,
-        int idx = 0,
-        bool apply_grammar = true,
-        std::vector<float> * original_logits = nullptr);
+// helpers

-void llama_sampling_accept(
-        struct llama_sampling_context * ctx_sampling,
-        struct llama_context * ctx_main,
-        llama_token id,
-        bool apply_grammar);
+// get a string representation of the last accepted tokens
+std::string llama_sampling_prev_str(llama_sampling * smpl, llama_context * ctx, int n);
+
+char        llama_sampling_type_to_chr(enum llama_sampler_type sampler_type); // one-character code, '?' if the type has no code
+std::string llama_sampling_type_to_str(enum llama_sampler_type sampler_type); // canonical name, empty string if the type has no name
+
+std::vector<enum llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names); // unrecognized names are skipped; alternative spellings are accepted only if allow_alt_names
+std::vector<enum llama_sampler_type> llama_sampling_types_from_chars(const std::string & chars); // one sampler per recognized character; unrecognized characters are skipped
--- a/common/stb_image.h
+++ b/common/stb_image.h
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -3,6 +3,7 @@

 from __future__ import annotations

+import ast
 import logging
 import argparse
 import contextlib
@@ -63,6 +64,7 @@ class Model:
    model_name: str | None
    metadata_override: Path | None
    dir_model_card: Path
+    is_lora: bool

    # subclasses should define this!
    model_arch: gguf.MODEL_ARCH
@@ -70,7 +72,7 @@ class Model:
    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
                 use_temp_file: bool = False, eager: bool = False,
                 metadata_override: Path | None = None, model_name: str | None = None,
-                 split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
+                 split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False, is_lora: bool = False):
        if type(self) is Model:
            raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")

@@ -92,6 +94,7 @@ class Model:
        self.metadata_override = metadata_override
        self.model_name = model_name
        self.dir_model_card = dir_model  # overridden in convert_lora_to_gguf.py
+        self.is_lora = is_lora  # true if model is used inside convert_lora_to_gguf.py

        # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
        if self.ftype == gguf.LlamaFileType.GUESSED:
@@ -295,9 +298,13 @@ class Model:
                            gguf.MODEL_TENSOR.FFN_GATE_INP,
                            gguf.MODEL_TENSOR.POS_EMBD,
                            gguf.MODEL_TENSOR.TOKEN_TYPES,
+                            gguf.MODEL_TENSOR.SSM_CONV1D,
+                            gguf.MODEL_TENSOR.TIME_MIX_FIRST,
+                            gguf.MODEL_TENSOR.TIME_MIX_W1,
+                            gguf.MODEL_TENSOR.TIME_MIX_W2,
                        )
                    )
-                    or not name.endswith(".weight")
+                    or not new_name.endswith(".weight")
                ):
                    data_qtype = gguf.GGMLQuantizationType.F32

@@ -590,6 +597,15 @@ class Model:
        if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249":
            # ref: https://huggingface.co/HuggingFaceTB/SmolLM-135M
            res = "smollm"
+        if chkhsh == "3c30d3ad1d6b64202cd222813e7736c2db6e1bd6d67197090fc1211fbc612ae7":
+            # ref: https://huggingface.co/bigscience/bloom
+            res = "bloom"
+        if chkhsh == "bc01ce58980e1db43859146dc51b1758b3b88729b217a74792e9f8d43e479d21":
+            # ref: https://huggingface.co/TurkuNLP/gpt3-finnish-small
+            res = "gpt3-finnish"
+        if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae":
+            # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct
+            res = "exaone"

        if res is None:
            logger.warning("\n")
@@ -893,7 +909,7 @@ class GPTNeoXModel(Model):
        return tensors


-@Model.register("BloomForCausalLM")
+@Model.register("BloomForCausalLM", "BloomModel")
 class BloomModel(Model):
    model_arch = gguf.MODEL_ARCH.BLOOM

@@ -1560,7 +1576,7 @@ class LlamaModel(Model):
        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
            if rope_scaling.get("rope_type", '').lower() == "llama3":
                base = self.hparams.get("rope_theta", 10000.0)
-                dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))

                factor = rope_scaling.get("factor", 8.0)
@@ -1583,7 +1599,8 @@ class LlamaModel(Model):
                        smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
                        rope_factors.append(1 / ((1 - smooth) / factor + smooth))

-                self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
+                if not self.is_lora:
+                    self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))

        super().prepare_tensors()

@@ -2130,8 +2147,9 @@ class Phi3MiniModel(Model):
        if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
            raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')

-        self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG]  + ".weight", np.array(long_factors, dtype=np.float32))
-        self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32))
+        if not self.is_lora:
+            self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG]  + ".weight", np.array(long_factors, dtype=np.float32))
+            self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32))


@Model.register("PlamoForCausalLM")
@@ -2702,7 +2720,85 @@ class StarCoder2Model(Model):
    model_arch = gguf.MODEL_ARCH.STARCODER2


-@Model.register("MambaForCausalLM", "MambaLMHeadModel")
+@Model.register("Rwkv6ForCausalLM")
+class Rwkv6Model(Model):
+    model_arch = gguf.MODEL_ARCH.RWKV6
+
+    def set_vocab(self):
+        assert (self.dir_model / "rwkv_vocab_v20230424.txt").is_file()
+        vocab_size = self.hparams.get("vocab_size", 65536)
+
+        tokens: list[bytes] = ['<s>'.encode("utf-8")]
+        toktypes: list[int] = [gguf.TokenType.CONTROL]
+
+        with open(self.dir_model / "rwkv_vocab_v20230424.txt", "r", encoding="utf-8") as f:
+            lines = f.readlines()
+            for line in lines:
+                parts = line.split(' ')
+                assert len(parts) >= 3
+                token, token_len = ast.literal_eval(' '.join(parts[1:-1])), int(parts[-1])
+                token = token.encode("utf-8") if isinstance(token, str) else token
+                assert isinstance(token, bytes)
+                assert len(token) == token_len
+                token_text: str = repr(token)[2:-1]  # "b'\xff'" -> "\xff"
+                tokens.append(token_text.encode("utf-8"))
+                toktypes.append(gguf.TokenType.NORMAL)
+        remainder = vocab_size - len(tokens)
+        assert remainder >= 0
+        for i in range(len(tokens), vocab_size):
+            tokens.append(f"[PAD{i}]".encode("utf-8"))
+            toktypes.append(gguf.TokenType.UNUSED)
+
+        self.gguf_writer.add_tokenizer_model("rwkv")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+    def set_gguf_parameters(self):
+        block_count = self.hparams["num_hidden_layers"]
+        head_size = self.hparams["head_size"]
+        hidden_size = self.hparams["hidden_size"]
+        layer_norm_eps = self.hparams["layer_norm_epsilon"]
+        rescale_every_n_layers = self.hparams["rescale_every"]
+        intermediate_size = self.hparams["intermediate_size"] if self.hparams["intermediate_size"] is not None else int((hidden_size * 3.5) // 32 * 32)
+        time_mix_extra_dim = 64 if hidden_size == 4096 else 32
+        time_decay_extra_dim = 128 if hidden_size == 4096 else 64
+
+        # RWKV isn't context limited
+        self.gguf_writer.add_context_length(1048576)
+        self.gguf_writer.add_embedding_length(hidden_size)
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_layer_norm_eps(layer_norm_eps)
+        self.gguf_writer.add_rescale_every_n_layers(rescale_every_n_layers)
+        self.gguf_writer.add_wkv_head_size(head_size)
+        self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim)
+        self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim)
+        self.gguf_writer.add_feed_forward_length(intermediate_size)
+        self.gguf_writer.add_file_type(self.ftype)
+
+        # required by llama.cpp, unused
+        self.gguf_writer.add_head_count(0)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        new_name = self.map_tensor_name(name)
+
+        if not (new_name.endswith(".weight") or new_name.endswith(".bias")):
+            new_name += ".weight"
+
+        if new_name.endswith("time_mix_w1.weight") or new_name.endswith("time_mix_decay_w1.weight") or new_name.endswith("time_mix_decay_w2.weight"):
+            data_torch = data_torch.transpose(0, 1)
+
+        if new_name.endswith("time_mix_w2.weight"):
+            data_torch = data_torch.permute(0, 2, 1)
+
+        rescale_every_n_layers = self.hparams["rescale_every"]
+        if rescale_every_n_layers > 0:
+            if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"):
+                data_torch = data_torch.div_(2 ** int(bid // rescale_every_n_layers))
+
+        yield (new_name, data_torch)
+
+
+@Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM")
 class MambaModel(Model):
    model_arch = gguf.MODEL_ARCH.MAMBA

@@ -2733,7 +2829,10 @@ class MambaModel(Model):
        # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58
        dt_rank      = self.find_hparam(["time_step_rank",     "dt_rank"],      optional=True) or -(d_model // -16)
        rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5
-
+        use_dt_b_c_norm = False
+        # For falconmamba we do apply RMS norm on B / DT and C layers
+        if self.find_hparam(["model_type"], optional=True) in ("falcon_mamba",):
+            use_dt_b_c_norm = True
        # Fail early for models which don't have a block expansion factor of 2
        assert d_inner == 2 * d_model

@@ -2741,12 +2840,13 @@ class MambaModel(Model):
        self.gguf_writer.add_embedding_length(d_model)
        self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
        self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading
-        self.gguf_writer.add_block_count(self.hparams["n_layer"])
+        self.gguf_writer.add_block_count(self.block_count)
        self.gguf_writer.add_ssm_conv_kernel(d_conv)
        self.gguf_writer.add_ssm_inner_size(d_inner)
        self.gguf_writer.add_ssm_state_size(d_state)
        self.gguf_writer.add_ssm_time_step_rank(dt_rank)
        self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
+        self.gguf_writer.add_ssm_dt_b_c_rms(use_dt_b_c_norm) # For classic Mamba we don't apply rms norm on B / DT layers
        self.gguf_writer.add_file_type(self.ftype)

    _tok_embd = None
@@ -2773,23 +2873,6 @@ class MambaModel(Model):

        return [(new_name, data_torch)]

-    def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
-        if bid is not None and new_name in (
-            self.format_tensor_name(
-                n, bid, ".weight" if name.endswith(".weight") else ""
-            )
-            for n in [
-                gguf.MODEL_TENSOR.SSM_CONV1D,
-                gguf.MODEL_TENSOR.SSM_X,
-                gguf.MODEL_TENSOR.SSM_DT,
-                gguf.MODEL_TENSOR.SSM_A,
-                gguf.MODEL_TENSOR.SSM_D,
-            ]
-        ):
-            return gguf.GGMLQuantizationType.F32
-
-        return super().tensor_force_quant(name, new_name, bid, n_dims)
-

@Model.register("CohereForCausalLM")
 class CommandR2Model(Model):
@@ -3734,8 +3817,121 @@ class ChatGLMModel(Model):
        name = name.removeprefix("transformer.")
        return [(self.map_tensor_name(name), data_torch)]

-###### CONVERSION LOGIC ######

+@Model.register("NemotronForCausalLM")
+class NemotronModel(Model):
+    model_arch = gguf.MODEL_ARCH.NEMOTRON
+
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+        self.gguf_writer.add_pad_token_id(0)
+        self.gguf_writer.add_unk_token_id(1)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+
+        f_norm_eps = self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon", "norm_eps"])
+        self.gguf_writer.add_layer_norm_eps(f_norm_eps)
+
+        # * Partial RoPE: rope dimension count is the rotary fraction of the per-head size
+        rot_pct = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"])
+        n_embd = self.find_hparam(["hidden_size", "n_embd"])
+        n_head = self.find_hparam(["num_attention_heads", "n_head"])
+        self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
+
+        # * RopeScaling for Nemotron: NONE without a rope_scaling entry, else LINEAR with its "factor"
+        if "rope_scaling" not in self.hparams or self.hparams["rope_scaling"] is None:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+        else:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+            self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # * Adding +1 to LayerNorm's weights here to implement layernorm1p w/o changing anything on the GGML engine side
+        #   model.layers.{l}.input_layernorm.weight
+        #   model.layers.{l}.post_attention_layernorm.weight
+        #   model.norm.weight
+        if name.endswith("norm.weight"):
+            data_torch = data_torch + 1
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
+@Model.register("ExaoneForCausalLM")
+class ExaoneModel(Model):
+    model_arch = gguf.MODEL_ARCH.EXAONE
+
+    def set_gguf_parameters(self):
+        hparams = self.hparams
+
+        assert (hparams["activation_function"] == "silu")
+
+        max_position_embeddings = hparams["max_position_embeddings"]
+        embed_dim = hparams["hidden_size"]
+        num_heads = hparams["num_attention_heads"]
+        num_kv_heads = hparams.get("num_key_value_heads", num_heads)
+        layer_norm_eps = hparams["layer_norm_epsilon"]
+        intermediate_size = hparams["intermediate_size"] if "intermediate_size" in hparams else 4 * embed_dim
+        num_layers = hparams["num_layers"]
+        # ignore for now as EXAONE-3.0-7.8B-Instruct attention_dropout is 0.0
+        # attention_dropout_rate = hparams["attention_dropout"]
+        # ignore for now as EXAONE-3.0-7.8B-Instruct embed_dropout is 0.0
+        # embed_dropout_rate = hparams["embed_dropout"]
+        self.gguf_writer.add_embedding_length(embed_dim)
+        self.gguf_writer.add_head_count(num_heads)
+        self.gguf_writer.add_head_count_kv(num_kv_heads)
+        self.gguf_writer.add_context_length(max_position_embeddings)
+        self.gguf_writer.add_layer_norm_rms_eps(layer_norm_eps)
+        self.gguf_writer.add_feed_forward_length(intermediate_size)
+        self.gguf_writer.add_block_count(num_layers)
+        self.gguf_writer.add_file_type(self.ftype)
+
+        if (rope_theta := self.hparams.get("rope_theta")) is not None:
+            self.gguf_writer.add_rope_freq_base(rope_theta)
+        rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True)
+        rotary_factor = rotary_factor if rotary_factor is not None else 1.0
+        self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
+        if hparams.get("rope_scaling") is not None and "factor" in hparams["rope_scaling"]:
+            if hparams["rope_scaling"].get("type") == "linear":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+                self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])
+
+    def prepare_tensors(self):
+        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
+            if rope_scaling.get("rope_type", '').lower() == "llama3":
+                base = self.hparams.get("rope_theta", 10000.0)
+                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
+
+                factor = rope_scaling.get("factor", 8.0)
+                low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
+                high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+
+                low_freq_wavelen = old_context_len / low_freq_factor
+                high_freq_wavelen = old_context_len / high_freq_factor
+                assert low_freq_wavelen != high_freq_wavelen
+
+                rope_factors = []
+                for freq in freqs:
+                    wavelen = 2 * math.pi / freq
+                    if wavelen < high_freq_wavelen:
+                        rope_factors.append(1)
+                    elif wavelen > low_freq_wavelen:
+                        rope_factors.append(factor)
+                    else:
+                        smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+                        rope_factors.append(1 / ((1 - smooth) / factor + smooth))
+
+                if not self.is_lora:
+                    self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
+
+        super().prepare_tensors()
+
+
+###### CONVERSION LOGIC ######

 # tree of lazy tensors
 class LazyTorchTensor(gguf.LazyBase):
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -94,6 +94,9 @@ models = [
    {"name": "codeshell",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", },
    {"name": "tekken",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },
    {"name": "smollm",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", },
+    {'name': "bloom",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom", },
+    {'name': "gpt3-finnish",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", },
+    {"name": "exaone",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
 ]


--- a/convert_llama_ggml_to_gguf.py
+++ b/convert_llama_ggml_to_gguf.py
@@ -116,7 +116,7 @@ class Tensor:
        assert quant is not None, 'Unknown tensor type'
        (blksize, tysize) = quant
        offset += 12
-        self.dtype= dtype
+        self.dtype= gguf.GGMLQuantizationType(dtype)
        self.dims = struct.unpack(f'<{n_dims}I', data[offset:offset + (4 * n_dims)])
        offset += 4 * n_dims
        self.name = bytes(data[offset:offset + name_len])
--- a/convert_lora_to_gguf.py
+++ b/convert_lora_to_gguf.py
@@ -386,6 +386,7 @@ if __name__ == '__main__':
            dry_run=args.dry_run,
            dir_lora_model=dir_lora,
            lora_alpha=alpha,
+            is_lora=True,
        )

        logger.info("Exporting model...")
--- a/docs/backend/CANN.md
+++ b/docs/backend/CANN.md
@@ -0,0 +1,259 @@
+# llama.cpp for CANN
+
+ - [Background](#background)
+ - [News](#news)
+ - [OS](#os)
+ - [Hardware](#hardware)
+ - [Model Supports](#model-supports)
+ - [DataType Supports](#datatype-supports)
+ - [Docker](#docker)
+ - [Linux](#linux)
+ - [TODO](#todo)
+
+
+## Background
+
+**Ascend NPU** is a range of AI processors built around a Neural Processing Unit. It efficiently handles matrix-matrix multiplication, dot products and scalar operations.
+
+**CANN** (Compute Architecture for Neural Networks) is a heterogeneous computing architecture for AI scenarios, providing support for multiple AI frameworks on the top and serving AI processors and programming at the bottom. It plays a crucial role in bridging the gap between upper and lower layers, and is a key platform for improving the computing efficiency of Ascend AI processors. Meanwhile, it offers a highly efficient and easy-to-use programming interface for diverse application scenarios, allowing users to rapidly build AI applications and services based on the Ascend platform.
+
+**Llama.cpp + CANN**
+
+The llama.cpp CANN backend is designed to support Ascend NPU. It utilizes AscendC and ACLNN, which are integrated into the CANN Toolkit and kernels, to use the Ascend NPU directly.
+
+## News
+
+- 2024.8
+  - Support `Q4_0` and `Q8_0` data type for Ascend NPU.
+- 2024.7
+  - Create CANN backend for Ascend NPU.
+
+## OS
+
+| OS      | Status  | Verified                                       |
+|:-------:|:-------:|:----------------------------------------------:|
+| Linux   | Support | Ubuntu 22.04, OpenEuler22.03                   |
+
+
+## Hardware
+
+### Ascend NPU
+
+**Verified devices**
+| Ascend NPU                    | Status  |
+|:-----------------------------:|:-------:|
+| Atlas 300T A2                 | Support |
+
+*Notes:*
+
+- If you have trouble with an Ascend NPU device, please create an issue with the **[CANN]** prefix/tag.
+- If you run successfully with your Ascend NPU device, please help update the table above.
+
+
+## Model Supports
+
+| Model Name                  | FP16  | Q8_0 | Q4_0 |
+|:----------------------------|:-----:|:----:|:----:|
+| AquilaChat2-7B              |   √   |   √  |   √  |
+| Baichuan-7b                 |   √   |   √  |   √  |
+| Baichuan2-7B-Chat           |   √   |   √  |   √  |
+| bitnet_b1_58-large          |   √   |   √  |   √  |
+| bloom-560m                  |   √   |   x  |   √  |
+| bloomz-alpaca-560m          |   √   |   x  |   √  |
+| c4ai-command-r-35B-v01      |   x   |   x  |   x  |
+| chatglm3-6B                 |   x   |   x  |   x  |
+| chinese-alpaca-2-1.3b       |   √   |   √  |   √  |
+| CodeShell-7B                |   √   |   √  |   √  |
+| deepseek-ai_deepseek-coder-1.3B-base | x |   x  |   x  |
+| deepseek-ai_DeepSeek-V2-Lite | x   |   x  |   x   |
+| deepseek-coder-6.7B-instruct | x   |   x  |   x   |
+| DeepSeek-V2-Lite-64x1.5B    |   x   |   x  |   x  |
+| falcon-7b-instruct          |   √   |   √  |   √  |
+| flan-t5-large               |   √   |   √  |   √  |
+| gemma-2-9b-it               |   √   |   √  |   √  |
+| glm-4-9B                    |   x   |   x  |   x  |
+| gpt2                        |   √   |   √  |   √  |
+| Gpt2-163M                   |   √   |   √  |   √  |
+| granite-3B-code-instruct    |   √   |   √  |   √  |
+| GritLM-7B                   |   √   |   √  |   √  |
+| internlm2_5-7b-chat         |   √   |   √  |   √  |
+| koala-7B-HF                 |   √   |   √  |   √  |
+| Llama-2-7b-chat-hf          |   √   |   √  |   √  |
+| Llama-3-Smaug-8B            |   √   |   √  |   √  |
+| Llama2-Chinese-7b-Chat      |   √   |   √  |   √  |
+| Llama3-8B                   |   √   |   √  |   √  |
+| Llama3-8b-chinese           |   √   |   √  |   √  |
+| mamba-130m-hf               |   √   |   √  |   √  |
+| Mistral-7B-Instruct-v0.2    |   √   |   √  |   √  |
+| Mixtral-8x7B-Instruct-v0.1  |   x   |   √  |   √  |
+| mpt-7B                      |   √   |   √  |   √  |
+| OLMo-1B-hf                  |   √   |   √  |   √  |
+| OpenELM-3B-Instruct         |   √   |   √  |   √  |
+| Orion-14b-base              |   √   |   √  |   √  |
+| phi1                        |   x   |   x  |   x  |
+| phi2                        |   x   |   x  |   x  |
+| Phi-3-mini-4k-instruct      |   √   |   √  |   √  |
+| plamo-13b                   |   √   |   √  |   √  |
+| pythia-70M                  |   x   |   x  |   x  |
+| Qwen-7B                     |   √   |   √  |   √  |
+| Qwen2-1.5B-Instruct         |   √   |   x  |   √  |
+| Refact-1_6B-fim             |   √   |   √  |   √  |
+| SmolLM-135M                 |   √   |   √  |   √  |
+| stablelm-zephyr             |   x   |   x  |   x  |
+| stablelm-2-zephyr-1_6b      |   x   |   x  |   x  |
+| starcoderbase-1b            |   √   |   √  |   √  |
+| starcoder2-3b               |   √   |   √  |   √  |
+| vigogne-7b-chat             |   √   |   √  |   √  |
+| xverse-7b-chat              |   √   |   √  |   √  |
+| Yi-6b-Chat                  |   √   |   √  |   √  |
+
+
+
+## DataType Supports
+
+| DataType               | Status  |
+|:----------------------:|:-------:|
+| FP16                   | Support |
+| Q8_0                   | Support |
+| Q4_0                   | Support |
+
+## Docker
+
+### Build Images
+You can build an image with llama.cpp in one command.
+```sh
+docker build -t llama-cpp-cann -f .devops/llama-cli-cann.Dockerfile .
+```
+
+### Run container
+
+```sh
+# Find all cards.
+npu-smi info
+
+# Select the cards that you want to use and make sure they are not in use by someone else.
+# The following example uses device 0.
+docker run --name llamacpp --device /dev/davinci0  --device /dev/davinci_manager --device /dev/devmm_svm --device /dev/hisi_hdc -v /usr/local/dcmi:/usr/local/dcmi -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info -v /PATH_TO_YOUR_MODELS/:/app/models -it llama-cpp-cann -m /app/models/MODEL_PATH -ngl 32 -p "Building a website can be done in 10 simple steps:"
+```
+
+*Notes:*
+
+- You may need to install Ascend Driver and firmware on the **host** machine *(Please refer to the [Linux configuration](#linux) for details)*.
+
+## Linux
+
+### I. Setup Environment
+
+1. **Install Ascend Driver**
+
+    ```sh
+    # create driver running user.
+    sudo groupadd HwHiAiUser
+    sudo useradd -g HwHiAiUser -d /home/HwHiAiUser -m HwHiAiUser -s /bin/bash
+    sudo usermod -aG HwHiAiUser $USER
+
+    # download driver from https://www.hiascend.com/hardware/firmware-drivers/community according to your system
+    # and install driver.
+    sudo sh Ascend-hdk-910b-npu-driver_x.x.x_linux-{arch}.run --full --install-for-all
+    ```
+
+    Once installed, run `npu-smi info` to check whether the driver was installed successfully.
+    ```sh
+    +-------------------------------------------------------------------------------------------+
+    | npu-smi 24.1.rc2               Version: 24.1.rc2                                          |
+    +----------------------+---------------+----------------------------------------------------+
+    | NPU   Name           | Health        | Power(W)    Temp(C)           Hugepages-Usage(page)|
+    | Chip                 | Bus-Id        | AICore(%)   Memory-Usage(MB)  HBM-Usage(MB)        |
+    +======================+===============+====================================================+
+    | 2     xxx            | OK            | 64.4        51                15   / 15            |
+    | 0                    | 0000:01:00.0  | 0           1873 / 15077      0    / 32768         |
+    +======================+===============+====================================================+
+    | 5     xxx            | OK            | 64.0        52                15   / 15            |
+    | 0                    | 0000:81:00.0  | 0           1874 / 15077      0    / 32768         |
+    +======================+===============+====================================================+
+    | No running processes found in NPU 2                                                       |
+    +======================+===============+====================================================+
+    | No running processes found in NPU 5                                                       |
+    +======================+===============+====================================================+
+    ```
+
+2. **Install Ascend Firmware**
+    ```sh
+    # download firmware from https://www.hiascend.com/hardware/firmware-drivers/community according to your system
+    # and install firmware.
+    sudo sh Ascend-hdk-910b-npu-firmware_x.x.x.x.X.run --full
+    ```
+    If the following message appears, the firmware was installed successfully.
+    ```sh
+    Firmware package installed successfully!
+    ```
+
+
+3. **Install CANN toolkit and kernels**
+
+    CANN toolkit and kernels can be obtained from the official [CANN Toolkit](https://www.hiascend.com/zh/developer/download/community/result?module=cann) page.
+
+    Please download the version that matches your system. The minimum required version is 8.0.RC2.alpha002; the install commands are shown below.
+    ```sh
+    pip3 install attrs numpy decorator sympy cffi pyyaml pathlib2 psutil protobuf scipy requests absl-py wheel typing_extensions
+    sh Ascend-cann-toolkit_8.0.RC2.alpha002_linux-aarch64.run --install
+    sh Ascend-cann-kernels-910b_8.0.RC2.alpha002_linux.run --install
+    ```
+
+    Set Ascend Variables:
+    ```sh
+    echo "source ~/Ascend/ascend-toolkit/set_env.sh" >> ~/.bashrc
+    source ~/.bashrc
+    ```
+
+Upon a successful installation, CANN is enabled for the available Ascend devices.
+
+### II. Build llama.cpp
+
+```sh
+cmake -B build -DGGML_CANN=on -DCMAKE_BUILD_TYPE=release
+cmake --build build --config release
+```
+
+### III. Run the inference
+
+1. **Retrieve and prepare model**
+
+    You can refer to the general [*Prepare and Quantize*](../../README.md#prepare-and-quantize) guide for model preparation.
+
+    **Notes**:
+
+      - CANN backend only supports FP16/Q4_0/Q8_0 models currently.
+
+2. **Launch inference**
+
+    There are two device selection modes:
+
+    - Single device: Use one device target specified by the user.
+    - Multiple devices: Automatically choose the devices with the same backend.
+
+    | Device selection | Parameter                              |
+    |:----------------:|:--------------------------------------:|
+    | Single device    | --split-mode none --main-gpu DEVICE_ID |
+    | Multiple devices | --split-mode layer (default)           |
+
+    Examples:
+
+    - Use device 0:
+
+    ```sh
+    ./build/bin/llama-cli -m path_to_model -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
+    ```
+
+    - Use multiple devices:
+
+    ```sh
+    ./build/bin/llama-cli -m path_to_model -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
+    ```
+
+### **GitHub contribution**:
+Please add the **[CANN]** prefix/tag in issues/PRs titles to help the CANN-team check/address them without delay.
+
+
+## TODO
+- Support more models and data types.
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -20,7 +20,7 @@
 **oneAPI** is an open ecosystem and a standard-based specification, supporting multiple architectures including but not limited to intel CPUs, GPUs and FPGAs. The key components of the oneAPI ecosystem include:

 - **DPCPP** *(Data Parallel C++)*: The primary oneAPI SYCL implementation, which includes the icpx/icx Compilers.
- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. oneMKL - Math Kernel Library)*.
+- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. oneMKL and oneDNN)*.
 - **oneAPI LevelZero**: A high performance low level interface for fine-grained control over intel iGPUs and dGPUs.
 - **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.

@@ -28,10 +28,6 @@

 The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based on the cross-platform feature of SYCL, it could support other vendor GPUs: Nvidia GPU (*AMD GPU coming*).

-When targeting **Intel CPU**, it is recommended to use llama.cpp for [Intel oneMKL](README.md#intel-onemkl) backend.
-
-It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, etc..*. In beginning work, the oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.
-
 ## Recommended Release

 The SYCL backend would be broken by some PRs due to no online CI.
@@ -45,6 +41,10 @@ The following release is verified with good quality:

 ## News

+
+- 2024.8
+  - Use oneDNN as the default GEMM library, improve the compatibility for new Intel GPUs.
+
 - 2024.5
  - Performance is increased: 34 -> 37 tokens/s of llama-2-7b.Q4_0 on Arc770.
  - Arch Linux is verified successfully.
@@ -196,7 +196,7 @@ Please follow the instructions for downloading and installing the Toolkit for Li

 Following guidelines/code snippets assume the default installation values. Otherwise, please make sure the necessary changes are reflected where applicable.

-Upon a successful installation, SYCL is enabled for the available intel devices, along with relevant libraries such as oneAPI MKL for intel GPUs.
+Upon a successful installation, SYCL is enabled for the available Intel devices, along with relevant libraries such as oneAPI oneDNN for Intel GPUs.

 - **Adding support to Nvidia GPUs**

@@ -255,8 +255,6 @@ or
 # Export relevant ENV variables
 source /opt/intel/oneapi/setvars.sh

-# Build LLAMA with MKL BLAS acceleration for intel GPU
-
 # Option 1: Use FP32 (recommended for better performance in most cases)
 cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx

@@ -338,12 +336,12 @@ Choose one of following methods to run.
 - Use device 0:

 ```sh
-./examples/sycl/run_llama2.sh 0
+./examples/sycl/run-llama2.sh 0
 ```
 - Use multiple devices:

 ```sh
-./examples/sycl/run_llama2.sh
+./examples/sycl/run-llama2.sh
 ```

 2. Command line
--- a/docs/build.md
+++ b/docs/build.md
@@ -352,6 +352,31 @@ cmake --build build --config Release
 # ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
 ```

+### CANN
+This provides NPU acceleration using the AI cores of your Ascend NPU. [CANN](https://www.hiascend.com/en/software/cann) is a hierarchical set of APIs that helps you quickly build AI applications and services based on Ascend NPU.
+
+For more information about Ascend NPU, see the [Ascend Community](https://www.hiascend.com/en/).
+
+Make sure to have the CANN toolkit installed. You can download it from here: [CANN Toolkit](https://www.hiascend.com/developer/download/community/result?module=cann)
+
+Go to `llama.cpp` directory and build using CMake.
+```bash
+cmake -B build -DGGML_CANN=on -DCMAKE_BUILD_TYPE=release
+cmake --build build --config release
+```
+
+You can test with:
+
+`./build/bin/llama-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -ngl 32`
+
+If the following info is printed on screen, you are using `llama.cpp` with the CANN backend:
+```bash
+llm_load_tensors:       CANN buffer size = 13313.00 MiB
+llama_new_context_with_model:       CANN compute buffer size =  1260.81 MiB
+```
+
+For detailed info, such as model/device support and CANN installation, please refer to [llama.cpp for CANN](./backend/CANN.md).
+
 ### Android

 To read documentation for how to build on Android, [click here](./android.md)
--- a/docs/docker.md
+++ b/docs/docker.md
@@ -66,8 +66,8 @@ You may want to pass in some different `ARGS`, depending on the CUDA environment

 The defaults are:

- `CUDA_VERSION` set to `11.7.1`
- `CUDA_DOCKER_ARCH` set to `all`
+- `CUDA_VERSION` set to `12.6.0`
+- `CUDA_DOCKER_ARCH` set to the cmake build default, which includes all the supported architectures

 The resulting images, are essentially the same as the non-CUDA images:

--- a/examples/baby-llama/baby-llama.cpp
+++ b/examples/baby-llama/baby-llama.cpp
@@ -18,7 +18,7 @@ constexpr float rms_norm_eps = 5e-6f;
 #endif

 static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);

    if (plan.work_size > 0) {
        buf.resize(plan.work_size);
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -200,7 +200,7 @@ int main(int argc, char ** argv) {
        }
    }

-    llama_print_timings(ctx);
+    llama_print_timings(ctx, nullptr);

    llama_batch_free(batch);

--- a/examples/batched.swift/Sources/main.swift
+++ b/examples/batched.swift/Sources/main.swift
@@ -27,7 +27,6 @@ guard let model = llama_load_model_from_file(modelPath.cString(using: .utf8), mo
    print("Failed to load model")
    exit(1)
 }
-
 defer {
    llama_free_model(model)
 }
@@ -37,7 +36,6 @@ var tokens = tokenize(text: prompt, add_bos: true)
 let n_kv_req = UInt32(tokens.count) + UInt32((n_len - Int(tokens.count)) * n_parallel)

 var context_params = llama_context_default_params()
-context_params.seed = 1234
 context_params.n_ctx = n_kv_req
 context_params.n_batch = UInt32(max(n_len, n_parallel))
 context_params.n_threads = 8
@@ -48,11 +46,24 @@ guard context != nil else {
    print("Failed to initialize context")
    exit(1)
 }
-
 defer {
    llama_free(context)
 }

+var sparams = llama_sampling_params()
+sparams.top_k = 40
+sparams.top_p = 0.9
+sparams.temp  = 0.4
+
+let smpl = llama_sampling_init(model, sparams)
+guard smpl != nil else {
+    print("Failed to initialize sampling")
+    exit(1)
+}
+defer {
+    llama_sampling_free(smpl)
+}
+
 let n_ctx = llama_n_ctx(context)

 print("\nn_len = \(n_len), n_ctx = \(n_ctx), n_batch = \(context_params.n_batch), n_parallel = \(n_parallel), n_kv_req = \(n_kv_req)\n")
@@ -125,32 +136,17 @@ while n_cur <= n_len {
            continue
        }

-        var n_vocab = llama_n_vocab(model)
        var logits = llama_get_logits_ith(context, i_batch[i])

-        var candidates: [llama_token_data] = .init(repeating: llama_token_data(), count: Int(n_vocab))
+        llama_sampling_set_logits(smpl, logits)

-        for token_id in 0 ..< n_vocab {
-            candidates.append(llama_token_data(id: token_id, logit: logits![Int(token_id)], p: 0.0))
-        }
+        llama_sampling_top_k(smpl, nil)
+        llama_sampling_top_p(smpl, nil)
+        llama_sampling_temp (smpl, nil)

-        var candidates_p: llama_token_data_array = .init(
-            data: &candidates,
-            size: candidates.count,
-            sorted: false
-        )
+        let new_token_id = llama_sampling_sample_dist(smpl, nil)

-        let top_k: Int32 = 40
-        let top_p: Float = 0.9
-        let temp: Float = 0.4
-
-        llama_sample_top_k(context, &candidates_p, top_k, 1)
-        llama_sample_top_p(context, &candidates_p, top_p, 1)
-        llama_sample_temp(context, &candidates_p, temp)
-
-        let new_token_id = llama_sample_token(context, &candidates_p)
-
-        // const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
+        // const llama_token new_token_id = llama_sampling_sample_greedy(smpl, nil);

        // is it an end of stream? -> mark the stream as finished
        if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
@@ -212,7 +208,7 @@ let t_main_end = ggml_time_us()

 print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n")

-llama_print_timings(context)
+llama_print_timings(context, smpl)

 private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
    let utf8Count = text.utf8.count
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -2,7 +2,6 @@
 #include "llama.h"

 #include <algorithm>
-#include <cmath>
 #include <cstdio>
 #include <string>
 #include <vector>
@@ -65,6 +64,15 @@ int main(int argc, char ** argv) {

    llama_context * ctx = llama_new_context_with_model(model, ctx_params);

+    auto sparams = llama_sampling_default_params();
+
+    sparams.seed  = params.sparams.seed;
+    sparams.top_k = 40;
+    sparams.top_p = 0.9f;
+    sparams.temp  = 0.4f;
+
+    llama_sampling * smpl = llama_sampling_init(model, sparams);
+
    if (ctx == NULL) {
        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
        return 1;
@@ -164,29 +172,17 @@ int main(int argc, char ** argv) {
                continue;
            }

-            auto   n_vocab = llama_n_vocab(model);
-            auto * logits  = llama_get_logits_ith(ctx, i_batch[i]);
+            const auto * logits = llama_get_logits_ith(ctx, i_batch[i]);

-            std::vector<llama_token_data> candidates;
-            candidates.reserve(n_vocab);
+            llama_sampling_set_logits(smpl, logits);

-            for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-                candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
-            }
+            llama_sampling_top_k(smpl, nullptr);
+            llama_sampling_top_p(smpl, nullptr);
+            llama_sampling_temp (smpl, nullptr);

-            llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+            const llama_token new_token_id = llama_sampling_sample_dist(smpl, nullptr);

-            const int   top_k = 40;
-            const float top_p = 0.9f;
-            const float temp  = 0.4f;
-
-            llama_sample_top_k(ctx, &candidates_p, top_k, 1);
-            llama_sample_top_p(ctx, &candidates_p, top_p, 1);
-            llama_sample_temp (ctx, &candidates_p, temp);
-
-            const llama_token new_token_id = llama_sample_token(ctx, &candidates_p);
-
-            //const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
+            //const llama_token new_token_id = llama_sampling_sample_greedy(smpl, nullptr);

            // is it an end of generation? -> mark the stream as finished
            if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
@@ -244,12 +240,13 @@ int main(int argc, char ** argv) {
    LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
            __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));

-    llama_print_timings(ctx);
+    llama_print_timings(ctx, smpl);

    fprintf(stderr, "\n");

    llama_batch_free(batch);

+    llama_sampling_free(smpl);
    llama_free(ctx);
    llama_free_model(model);

--- a/examples/benchmark/benchmark-matmult.cpp
+++ b/examples/benchmark/benchmark-matmult.cpp
@@ -21,7 +21,7 @@
 #endif

 static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);

    if (plan.work_size > 0) {
        buf.resize(plan.work_size);
@@ -54,7 +54,7 @@ static void tensor_dump(const ggml_tensor * tensor, const char * name) {
 #define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor)

 struct benchmark_params_struct {
-    int32_t n_threads     = 1;
+    int     n_threads     = 1;
    int32_t n_iterations  = 10;
 };

--- a/examples/cvector-generator/cvector-generator.cpp
+++ b/examples/cvector-generator/cvector-generator.cpp
@@ -271,7 +271,7 @@ struct tokenized_prompt {
    size_t max_seq_len;

    tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
-        const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+        const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
        tokens_pos = ::llama_tokenize(ctx, pos, add_bos, true);
        tokens_neg = ::llama_tokenize(ctx, neg, add_bos, true);
        max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
@@ -486,8 +486,8 @@ int main(int argc, char ** argv) {
    if (use_pca) {
        // run PCA
        PCA::pca_params pca_params;
-        pca_params.n_threads = params.n_threads;
-        pca_params.n_batch = params.n_pca_batch;
+        pca_params.n_threads    = params.cpuparams.n_threads;
+        pca_params.n_batch      = params.n_pca_batch;
        pca_params.n_iterations = params.n_pca_iterations;
        PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
    } else {
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -90,13 +90,7 @@ int main(int argc, char ** argv) {

    print_build_info();

-    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = time(NULL);
-    }
-
-    fprintf(stderr, "%s: seed  = %u\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
+    LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);

    llama_backend_init();
    llama_numa_init(params.numa);
@@ -314,7 +308,7 @@ int main(int argc, char ** argv) {
    }

    // clean up
-    llama_print_timings(ctx);
+    llama_print_timings(ctx, nullptr);
    llama_batch_free(batch);
    llama_free(ctx);
    llama_free_model(model);
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@@ -127,7 +127,7 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
 }

 static bool run(llama_context * ctx, const gpt_params & params) {
-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+    const bool add_bos = llama_add_bos_token(llama_get_model(ctx));

    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);

@@ -151,8 +151,6 @@ int main(int argc, char ** argv) {

    print_build_info();

-    std::mt19937 rng(params.seed);
-
    llama_backend_init();
    llama_numa_init(params.numa);

@@ -183,7 +181,7 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    llama_print_timings(ctx);
+    llama_print_timings(ctx, nullptr);

    llama_free(ctx);
    llama_free_model(model);
--- a/examples/export-lora/README.md
+++ b/examples/export-lora/README.md
@@ -17,9 +17,9 @@ For example:

 ```bash
 ./bin/llama-export-lora \
-    -m open-llama-3b-v2-q8_0.gguf \
-    -o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \
-    --lora lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.gguf
+    -m open-llama-3b-v2.gguf \
+    -o open-llama-3b-v2-english2tokipona-chat.gguf \
+    --lora lora-open-llama-3b-v2-english2tokipona-chat-LATEST.gguf
 ```

 Multiple LORA adapters can be applied by passing multiple `--lora FNAME` or `--lora-scaled FNAME S` command line parameters:
--- a/examples/export-lora/export-lora.cpp
+++ b/examples/export-lora/export-lora.cpp
@@ -10,6 +10,12 @@

 static bool g_verbose = false;

+struct tensor_transformation { // one entry per output tensor, consumed by the merge loop in lora_merge_ctx
+    struct ggml_tensor * in;   // tensor handed to copy_tensor(), or first argument of merge_tensor()
+    struct ggml_tensor * out;  // second argument of merge_tensor(); set to the same tensor as `in` for copy-only entries
+    bool is_copy;              // true: entry is only copied; false: entry is merged with the LoRA adapters
+};
+
 static std::string get_kv_str(struct gguf_context * ctx_gguf, const std::string & key){
    int id = gguf_find_key(ctx_gguf, key.c_str());
    return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id));
@@ -198,8 +204,7 @@ struct lora_merge_ctx {
        }

        // mapping base tensor to out tensor (same shape with base, but different type)
-        // if out_tensor == nullptr, we only copy it
-        std::vector<std::pair<struct ggml_tensor *, struct ggml_tensor *>> base_to_out_tensors;
+        std::vector<tensor_transformation> trans;
        for (auto & it : base_model.tensors) {
            bool t_a = true;
            bool t_b = true;
@@ -212,14 +217,22 @@ struct lora_merge_ctx {
                // only copy
                struct ggml_tensor * cpy_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor);
                ggml_set_name(cpy_tensor, base_tensor->name);
-                base_to_out_tensors.push_back(std::make_pair(cpy_tensor, nullptr));
+                trans.push_back({
+                    cpy_tensor,
+                    cpy_tensor,
+                    true,
+                });
                gguf_add_tensor(ctx_out, cpy_tensor);
            } else if (t_a && t_b) {
                // need merging
                struct ggml_tensor * out_tensor = ggml_new_tensor(
                    ctx_out_ggml, get_out_tensor_type(base_tensor), GGML_MAX_DIMS, base_tensor->ne);
                ggml_set_name(out_tensor, base_tensor->name);
-                base_to_out_tensors.push_back(std::make_pair(base_tensor, out_tensor));
+                trans.push_back({
+                    base_tensor,
+                    out_tensor,
+                    false,
+                });
                gguf_add_tensor(ctx_out, out_tensor);
            } else {
                throw std::runtime_error("tensor " + it.first + " missing either lora_a or lora_b");
@@ -234,12 +247,12 @@ struct lora_merge_ctx {

        // process base model tensors
        size_t n_merged = 0;
-        for (auto & it : base_to_out_tensors) {
-            if (it.second != nullptr) {
-                merge_tensor(it.first, it.second);
+        for (auto & it : trans) {
+            if (!it.is_copy) {
+                merge_tensor(it.in, it.out);
                n_merged++;
            } else {
-                copy_tensor(it.first);
+                copy_tensor(it.in);
            }
        }

@@ -252,7 +265,7 @@ struct lora_merge_ctx {
        }

        printf("%s : merged %ld tensors with lora adapters\n", __func__, n_merged);
-        printf("%s : wrote %ld tensors to output file\n", __func__, base_to_out_tensors.size());
+        printf("%s : wrote %ld tensors to output file\n", __func__, trans.size());
    }

    void copy_tensor(struct ggml_tensor * base) {
@@ -285,6 +298,10 @@ struct lora_merge_ctx {
        for (size_t i = 0; i < adapters.size(); ++i) {
            auto t_a = adapters[i]->get_tensor(name_lora_a);
            auto t_b = adapters[i]->get_tensor(name_lora_b);
+            // TODO: add support for quantized lora
+            if (ggml_is_quantized(t_a->type) || ggml_is_quantized(t_b->type)) {
+                throw std::runtime_error("quantized LoRA adapters is not supported, please retry with f16 or f32");
+            }
            inp_a[i] = ggml_dup_tensor(ctx, t_a);
            inp_b[i] = ggml_dup_tensor(ctx, t_b);
        }
@@ -393,7 +410,7 @@ int main(int argc, char ** argv) {

    g_verbose = (params.verbosity == 1);
    try {
-        lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.n_threads);
+        lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.cpuparams.n_threads);
        ctx.run_merge();
    } catch (const std::exception & err) {
        fprintf(stderr, "%s\n", err.what());
--- a/examples/gbnf-validator/gbnf-validator.cpp
+++ b/examples/gbnf-validator/gbnf-validator.cpp
@@ -1,9 +1,5 @@
-#define LLAMA_API_INTERNAL
-
-#include "grammar-parser.h"
-#include "ggml.h"
-#include "llama.h"
 #include "unicode.h"
+#include "llama-grammar.h"

 #include <cstdio>
 #include <cstdlib>
@@ -12,22 +8,21 @@
 #include <string>
 #include <vector>

-static bool llama_sample_grammar_string(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) {
-    auto decoded = decode_utf8(input_str, {});
-    const auto & code_points = decoded.first;
+static bool llama_grammar_validate(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) {
+    const auto cpts = unicode_cpts_from_utf8(input_str);

    const llama_grammar_rules  & rules      = llama_grammar_get_rules (grammar);
          llama_grammar_stacks & cur_stacks = llama_grammar_get_stacks(grammar);

    size_t pos = 0;
-    for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
+    for (const auto & cpt : cpts) {
        const llama_grammar_stacks prev_stacks = llama_grammar_get_stacks(grammar); // copy

-        llama_grammar_accept(rules, prev_stacks, *it, cur_stacks);
+        cur_stacks = llama_grammar_accept(rules, prev_stacks, cpt);

        if (cur_stacks.empty()) {
            error_pos = pos;
-            error_msg = "Unexpected character '" + unicode_cpt_to_utf8(*it) + "'";
+            error_msg = "Unexpected character '" + unicode_cpt_to_utf8(cpt) + "'";
            cur_stacks = prev_stacks;
            return false;
        }
@@ -85,27 +80,7 @@ int main(int argc, char** argv) {
        grammar_str = buffer.str();
    }

-    // Parse the GBNF grammar
-    auto parsed_grammar = grammar_parser::parse(grammar_str.c_str());
-
-    // will be empty (default) if there are parse errors
-    if (parsed_grammar.rules.empty()) {
-        fprintf(stdout, "%s: failed to parse grammar\n", __func__);
-        return 1;
-    }
-
-    // Ensure that there is a "root" node.
-    if (parsed_grammar.symbol_ids.find("root") == parsed_grammar.symbol_ids.end()) {
-        fprintf(stdout, "%s: grammar does not contain a 'root' symbol\n", __func__);
-        return 1;
-    }
-
-    std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
-
-    // Create the LLAMA grammar
-    auto grammar = llama_grammar_init(
-            grammar_rules.data(),
-            grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
+    llama_grammar * grammar = llama_grammar_init_impl(nullptr, grammar_str.c_str(), "root");
    if (grammar == nullptr) {
        throw std::runtime_error("Failed to initialize llama_grammar");
    }
@@ -122,7 +97,7 @@ int main(int argc, char** argv) {
    // Validate the input string against the grammar
    size_t error_pos;
    std::string error_msg;
-    bool is_valid = llama_sample_grammar_string(grammar, input_str, error_pos, error_msg);
+    bool is_valid = llama_grammar_validate(grammar, input_str, error_pos, error_msg);

    if (is_valid) {
        fprintf(stdout, "Input string is valid according to the grammar.\n");
@@ -131,7 +106,7 @@ int main(int argc, char** argv) {
    }

    // Clean up
-    llama_grammar_free(grammar);
+    llama_grammar_free_impl(grammar);

    return 0;
 }
--- a/examples/gritlm/gritlm.cpp
+++ b/examples/gritlm/gritlm.cpp
@@ -9,7 +9,7 @@
 static std::vector<std::vector<float>> encode(llama_context * ctx, const std::vector<std::string> & sentences, const std::string & instruction) {
    std::vector<std::vector<float>> result;

-    const llama_model * mdl = llama_get_model(ctx);
+    const llama_model * model = llama_get_model(ctx);

    llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1);

@@ -18,16 +18,16 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve

        const std::string input_string = instruction + sentences[i];

-        std::vector<llama_token> inputs = llama_tokenize(mdl, input_string, true, false);
+        std::vector<llama_token> inputs = llama_tokenize(model, input_string, true, false);

        const int32_t n_toks = inputs.size();

        // GritLM seems to have EOS = ""
        // https://github.com/ContextualAI/gritlm/blob/92025b16534712b31b3c4aaaf069350e222bd5f8/gritlm/gritlm.py#L18
-        // inputs.push_back(llama_token_eos(mdl));
+        // inputs.push_back(llama_token_eos(model));

        // we want to ignore instruction tokens for mean pooling
-        const int32_t n_inst = llama_tokenize(mdl, instruction, true, false).size();
+        const int32_t n_inst = llama_tokenize(model, instruction, true, false).size();

 #ifdef GRIT_DEBUG
        // debug tokens - should be matching as referenced in the GritLM sample
@@ -51,7 +51,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
        llama_decode(ctx, batch);

        // get embedding dimensions
-        uint64_t n_embd = llama_n_embd(mdl);
+        uint64_t n_embd = llama_n_embd(model);

        // allocate embedding output
        std::vector<float> emb_unorm(n_embd, 0.0f);
@@ -92,11 +92,11 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
    return result;
 }

-static std::string generate(llama_context * ctx, const std::string & prompt, bool stream) {
+static std::string generate(llama_context * ctx, llama_sampling * smpl, const std::string & prompt, bool stream) {
    std::string result;

-    const llama_model * mdl = llama_get_model(ctx);
-    llama_token eos_token = llama_token_eos(mdl);
+    const llama_model * model = llama_get_model(ctx);
+    llama_token eos_token = llama_token_eos(model);

    llama_kv_cache_clear(ctx);
    llama_set_embeddings(ctx, false);
@@ -104,28 +104,27 @@ static std::string generate(llama_context * ctx, const std::string & prompt, boo

    llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);

-    std::vector<llama_token> inputs = llama_tokenize(mdl, prompt, false, true);
+    std::vector<llama_token> inputs = llama_tokenize(model, prompt, false, true);
    int32_t i_current_token = 0;

    while (true) {
        llama_batch_clear(bat);
-        auto n_inputs = (int32_t)inputs.size();
-        for (int32_t i = 0; i < n_inputs; i++) {
-            llama_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1);
+        {
+            const int32_t n_inputs = inputs.size();
+
+            for (int32_t i = 0; i < n_inputs; i++) {
+                llama_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1);
+            }
        }
        inputs.clear();

        llama_decode(ctx, bat);
-        auto logits = llama_get_logits_ith(ctx, bat.n_tokens - 1);

-        auto candidates = std::vector<llama_token_data>(llama_n_vocab(mdl));
-        auto n_candidates = (int32_t)candidates.size();
-        for (int32_t token = 0; token < n_candidates; token++) {
-            candidates[token] = llama_token_data{ token, logits[token], 0.0f };
-        }
-        auto candidates_p = llama_token_data_array{ candidates.data(), candidates.size(), false };
+        const auto * logits = llama_get_logits_ith(ctx, bat.n_tokens - 1);

-        llama_token token = llama_sample_token_greedy(ctx, &candidates_p);
+        llama_sampling_set_logits(smpl, logits);
+
+        llama_token token = llama_sampling_sample_greedy(smpl, nullptr);
        if (token == eos_token) {
            break;
        }
@@ -167,10 +166,12 @@ int main(int argc, char * argv[]) {

    llama_backend_init();

-    llama_model * mdl = llama_load_model_from_file(params.model.c_str(), mparams);
+    llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);

    // create generation context
-    llama_context * ctx = llama_new_context_with_model(mdl, cparams);
+    llama_context * ctx = llama_new_context_with_model(model, cparams);
+
+    llama_sampling * smpl = llama_sampling_init(model, llama_sampling_default_params());

    // ### Embedding/Representation ###
    // samples taken from: https://github.com/ContextualAI/gritlm#basic
@@ -191,7 +192,7 @@ int main(int argc, char * argv[]) {
        const std::vector<std::vector<float>> d_rep = encode(ctx, documents, gritlm_instruction(""));
        const std::vector<std::vector<float>> q_rep = encode(ctx, queries,   gritlm_instruction(instruction));

-        const int n_embd = llama_n_embd(mdl);
+        const int n_embd = llama_n_embd(model);

        const float cosine_sim_q0_d0 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd);
        const float cosine_sim_q0_d1 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd);
@@ -208,11 +209,12 @@ int main(int argc, char * argv[]) {
    // GritLM models are not finetuned with system prompts, as you can just include system-like instructions together with your user instruction
    {
        const std::string prompt = "<|user|>\nPlease write me a poem about my recent hike of Mt. Fuji at midnight in the style of Shakespeare.\n<|assistant|>\n";
-        std::string response = generate(ctx, prompt, true);
+        std::string response = generate(ctx, smpl, prompt, true);
    }

+    llama_sampling_free(smpl);
    llama_free(ctx);
-    llama_free_model(mdl);
+    llama_free_model(model);
    llama_backend_free();

    return 0;
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -433,8 +433,8 @@ static void process_logits(
 }

 static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
-    GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
+    const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
+    GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
    const int n_ctx = llama_n_ctx(ctx);

    auto tim1 = std::chrono::high_resolution_clock::now();
@@ -638,7 +638,7 @@ int main(int argc, char ** argv) {

    g_collector.save_imatrix();

-    llama_print_timings(ctx);
+    llama_print_timings(ctx, nullptr);

    llama_free(ctx);
    llama_free_model(model);
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -2,7 +2,6 @@

 #include "console.h"
 #include "llama.h"
-#include "grammar-parser.h"

 #include <cassert>
 #include <cinttypes>
@@ -34,6 +33,7 @@

 static llama_context           ** g_ctx;
 static llama_model             ** g_model;
+static llama_sampling          ** g_smpl;
 static gpt_params               * g_params;
 static std::vector<llama_token> * g_input_tokens;
 static std::ostringstream       * g_output_ss;
@@ -93,7 +93,7 @@ static void sigint_handler(int signo) {
        } else {
            console::cleanup();
            printf("\n");
-            llama_print_timings(*g_ctx);
+            llama_print_timings(*g_ctx, *g_smpl);
            write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
            _exit(130);
        }
@@ -103,7 +103,6 @@ static void sigint_handler(int signo) {

 int main(int argc, char ** argv) {
    gpt_params params;
-    llama_sampling_params & sparams = params.sparams;
    g_params = &params;

    if (!gpt_params_parse(argc, argv, params)) {
@@ -111,6 +110,8 @@ int main(int argc, char ** argv) {
        return 1;
    }

+    auto & sparams = params.sparams;
+
 #ifndef LOG_DISABLE_LOGS
    log_set_target(log_filename_generator("infill", "log"));
    LOG_TEE("Log start\n");
@@ -156,26 +157,21 @@ int main(int argc, char ** argv) {
        LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
    }

-    LOG_TEE("%s: build = %d (%s)\n",      __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
-    LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
+    print_build_info();

-    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = time(NULL);
-    }
-
-    LOG_TEE("%s: seed  = %u\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
+    LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);

    LOG("%s: llama backend init\n", __func__);
    llama_backend_init();
    llama_numa_init(params.numa);

-    llama_model * model;
-    llama_context * ctx;
+    llama_model * model = nullptr;
+    llama_context * ctx = nullptr;
+    llama_sampling * smpl = nullptr;

    g_model = &model;
    g_ctx = &ctx;
+    g_smpl = &smpl;

    // load the model and apply lora adapter, if any
    LOG("%s: load the model and apply lora adapter, if any\n", __func__);
@@ -203,8 +199,8 @@ int main(int argc, char ** argv) {
        LOG_TEE("\n");
        LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
    }
-    const bool add_bos = llama_should_add_bos_token(model);
-    GGML_ASSERT(llama_add_eos_token(model) != 1);
+    const bool add_bos = llama_add_bos_token(model);
+    GGML_ASSERT(!llama_add_eos_token(model));
    LOG("add_bos: %d\n", add_bos);

    std::vector<llama_token> embd_inp;
@@ -305,7 +301,7 @@ int main(int argc, char ** argv) {
            LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
        }
    }
-    LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
+    LOG_TEE("sampling: \n%s\n", sparams.print_all().c_str());
    LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
    LOG_TEE("\n\n");

@@ -349,7 +345,7 @@ int main(int argc, char ** argv) {

    std::vector<llama_token> embd;

-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
+    smpl = llama_sampling_init(model, sparams);

    while (n_remain != 0 || params.interactive) {
        // predict
@@ -421,11 +417,11 @@ int main(int argc, char ** argv) {
        embd.clear();

        if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
-            const llama_token id = llama_sampling_sample(ctx_sampling, ctx, nullptr);
+            const llama_token id = llama_sampling_sample(smpl, ctx, -1);

-            llama_sampling_accept(ctx_sampling, ctx, id, true);
+            llama_sampling_accept(smpl, id, true);

-            LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
+            // LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, smpl->prev.to_vector()).c_str());

            embd.push_back(id);

@@ -444,7 +440,7 @@ int main(int argc, char ** argv) {

                // push the prompt in the sampling context in order to apply repetition penalties later
                // for the prompt, we don't apply grammar rules
-                llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false);
+                llama_sampling_accept(smpl, embd_inp[n_consumed], false);

                ++n_consumed;
                if ((int) embd.size() >= params.n_batch) {
@@ -476,7 +472,7 @@ int main(int argc, char ** argv) {
        // if not currently processing queued inputs;
        if ((int) embd_inp.size() <= n_consumed) {
            // deal with eot token in infill mode
-            if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){
+            if ((llama_sampling_last(smpl) == llama_token_eot(model) || is_interacting) && params.interactive){
                if (is_interacting && !params.interactive_first) {
                    // print an eot token
                    printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
@@ -542,7 +538,7 @@ int main(int argc, char ** argv) {
                is_interacting = false;
            }
            // deal with end of generation tokens in interactive mode
-            else if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) {
+            else if (llama_token_is_eog(model, llama_sampling_last(smpl))) {
                LOG("found EOS token\n");

                if (params.interactive) {
@@ -615,7 +611,7 @@ int main(int argc, char ** argv) {

            if (n_past > 0) {
                if (is_interacting) {
-                    llama_sampling_reset(ctx_sampling);
+                    llama_sampling_reset(smpl);
                }
                is_interacting = false;
            }
@@ -638,13 +634,13 @@ int main(int argc, char ** argv) {
        fflush(stdout);
    }

-    llama_print_timings(ctx);
+    llama_print_timings(ctx, smpl);
    write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);

    llama_free(ctx);
    llama_free_model(model);

-    llama_sampling_free(ctx_sampling);
+    llama_sampling_free(smpl);
    llama_backend_free();

 #ifndef LOG_DISABLE_LOGS
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -16,6 +16,7 @@
 #include <sstream>
 #include <string>
 #include <vector>
+#include <thread>

 #include "ggml.h"
 #include "llama.h"
@@ -225,6 +226,9 @@ struct cmd_params {
    std::vector<ggml_type> type_k;
    std::vector<ggml_type> type_v;
    std::vector<int> n_threads;
+    std::vector<std::string> cpu_mask;
+    std::vector<bool> cpu_strict;
+    std::vector<int> poll;
    std::vector<int> n_gpu_layers;
    std::vector<std::string> rpc_servers;
    std::vector<llama_split_mode> split_mode;
@@ -236,6 +240,8 @@ struct cmd_params {
    std::vector<bool> embeddings;
    ggml_numa_strategy numa;
    int reps;
+    ggml_sched_priority prio;
+    int delay;
    bool verbose;
    output_formats output_format;
    output_formats output_format_stderr;
@@ -251,6 +257,9 @@ static const cmd_params cmd_params_defaults = {
    /* type_k               */ {GGML_TYPE_F16},
    /* type_v               */ {GGML_TYPE_F16},
    /* n_threads            */ {cpu_get_num_math()},
+    /* cpu_mask             */ {"0x0"},
+    /* cpu_strict           */ {false},
+    /* poll                 */ {50},
    /* n_gpu_layers         */ {99},
    /* rpc_servers          */ {""},
    /* split_mode           */ {LLAMA_SPLIT_MODE_LAYER},
@@ -262,6 +271,8 @@ static const cmd_params cmd_params_defaults = {
    /* embeddings           */ {false},
    /* numa                 */ GGML_NUMA_STRATEGY_DISABLED,
    /* reps                 */ 5,
+    /* prio                 */ GGML_SCHED_PRIO_NORMAL,
+    /* delay                */ 0,
    /* verbose              */ false,
    /* output_format        */ MARKDOWN,
    /* output_format_stderr */ NONE,
@@ -281,6 +292,9 @@ static void print_usage(int /* argc */, char ** argv) {
    printf("  -ctk, --cache-type-k <t>            (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
    printf("  -ctv, --cache-type-v <t>            (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
    printf("  -t, --threads <n>                   (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
+    printf("  -C, --cpu-mask <hex,hex>            (default: %s)\n", join(cmd_params_defaults.cpu_mask, ",").c_str());
+    printf("  --cpu-strict <0|1>                  (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str());
+    printf("  --poll <0...100>                    (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
    printf("  -ngl, --n-gpu-layers <n>            (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
    printf("  -rpc, --rpc <rpc_servers>           (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
    printf("  -sm, --split-mode <none|layer|row>  (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
@@ -292,6 +306,8 @@ static void print_usage(int /* argc */, char ** argv) {
    printf("  -embd, --embeddings <0|1>           (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
    printf("  -ts, --tensor-split <ts0/ts1/..>    (default: 0)\n");
    printf("  -r, --repetitions <n>               (default: %d)\n", cmd_params_defaults.reps);
+    printf("  --prio <0|1|2|3>                    (default: %d)\n", cmd_params_defaults.prio);
+    printf("  --delay <0...N> (seconds)           (default: %d)\n", cmd_params_defaults.delay);
    printf("  -o, --output <csv|json|md|sql>      (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
    printf("  -oe, --output-err <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
    printf("  -v, --verbose                       (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
@@ -338,6 +354,8 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
    params.output_format_stderr = cmd_params_defaults.output_format_stderr;
    params.reps = cmd_params_defaults.reps;
    params.numa = cmd_params_defaults.numa;
+    params.prio = cmd_params_defaults.prio;
+    params.delay = cmd_params_defaults.delay;

    for (int i = 1; i < argc; i++) {
        arg = argv[i];
@@ -433,6 +451,27 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
            }
            auto p = string_split<int>(argv[i], split_delim);
            params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
+        } else if (arg == "-C" || arg == "--cpu-mask") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<std::string>(argv[i], split_delim);
+            params.cpu_mask.insert(params.cpu_mask.end(), p.begin(), p.end());
+        } else if (arg == "--cpu-strict") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<bool>(argv[i], split_delim);
+            params.cpu_strict.insert(params.cpu_strict.end(), p.begin(), p.end());
+        } else if (arg == "--poll") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<int>(argv[i], split_delim);
+            params.poll.insert(params.poll.end(), p.begin(), p.end());
        } else if (arg == "-ngl" || arg == "--n-gpu-layers") {
            if (++i >= argc) {
                invalid_param = true;
@@ -541,6 +580,18 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                break;
            }
            params.reps = std::stoi(argv[i]);
+        } else if (arg == "--prio") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.prio = (enum ggml_sched_priority) std::stoi(argv[i]);
+        } else if (arg == "--delay") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.delay = std::stoi(argv[i]);
        } else if (arg == "-o" || arg == "--output") {
            if (++i >= argc) {
                invalid_param = true;
@@ -585,6 +636,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
    if (params.use_mmap.empty())     { params.use_mmap = cmd_params_defaults.use_mmap; }
    if (params.embeddings.empty())   { params.embeddings = cmd_params_defaults.embeddings; }
    if (params.n_threads.empty())    { params.n_threads = cmd_params_defaults.n_threads; }
+    if (params.cpu_mask.empty())     { params.cpu_mask  = cmd_params_defaults.cpu_mask;  }
+    if (params.cpu_strict.empty())   { params.cpu_strict = cmd_params_defaults.cpu_strict; }
+    if (params.poll.empty())         { params.poll = cmd_params_defaults.poll; }

    return params;
 }
@@ -598,6 +652,9 @@ struct cmd_params_instance {
    ggml_type type_k;
    ggml_type type_v;
    int n_threads;
+    std::string cpu_mask;
+    bool cpu_strict;
+    int poll;
    int n_gpu_layers;
    std::string rpc_servers;
    llama_split_mode split_mode;
@@ -667,7 +724,10 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
    for (const auto & tv : params.type_v)
    for (const auto & nkvo : params.no_kv_offload)
    for (const auto & fa : params.flash_attn)
-    for (const auto & nt : params.n_threads) {
+    for (const auto & nt : params.n_threads)
+    for (const auto & cm : params.cpu_mask)
+    for (const auto & cs : params.cpu_strict)
+    for (const auto & pl : params.poll) {
        for (const auto & n_prompt : params.n_prompt) {
            if (n_prompt == 0) {
                continue;
@@ -681,6 +741,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .type_k       = */ tk,
                /* .type_v       = */ tv,
                /* .n_threads    = */ nt,
+                /* .cpu_mask     = */ cm,
+                /* .cpu_strict   = */ cs,
+                /* .poll         = */ pl,
                /* .n_gpu_layers = */ nl,
                /* .rpc_servers  = */ rpc,
                /* .split_mode   = */ sm,
@@ -707,6 +770,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .type_k       = */ tk,
                /* .type_v       = */ tv,
                /* .n_threads    = */ nt,
+                /* .cpu_mask     = */ cm,
+                /* .cpu_strict   = */ cs,
+                /* .poll         = */ pl,
                /* .n_gpu_layers = */ nl,
                /* .rpc_servers  = */ rpc,
                /* .split_mode   = */ sm,
@@ -733,6 +799,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .type_k       = */ tk,
                /* .type_v       = */ tv,
                /* .n_threads    = */ nt,
+                /* .cpu_mask     = */ cm,
+                /* .cpu_strict   = */ cs,
+                /* .poll         = */ pl,
                /* .n_gpu_layers = */ nl,
                /* .rpc_servers  = */ rpc,
                /* .split_mode   = */ sm,
@@ -769,6 +838,9 @@ struct test {
    int n_batch;
    int n_ubatch;
    int n_threads;
+    std::string cpu_mask;
+    bool cpu_strict;
+    int poll;
    bool has_rpc;
    ggml_type type_k;
    ggml_type type_v;
@@ -795,6 +867,9 @@ struct test {
        n_batch = inst.n_batch;
        n_ubatch = inst.n_ubatch;
        n_threads = inst.n_threads;
+        cpu_mask = inst.cpu_mask;
+        cpu_strict = inst.cpu_strict;
+        poll = inst.poll;
        has_rpc = !inst.rpc_servers.empty();
        type_k = inst.type_k;
        type_v = inst.type_v;
@@ -872,13 +947,14 @@ struct test {
            "cpu_info", "gpu_info",
            "model_filename", "model_type", "model_size", "model_n_params",
            "n_batch", "n_ubatch",
-            "n_threads", "type_k", "type_v",
+            "n_threads", "cpu_mask", "cpu_strict", "poll",
+            "type_k", "type_v",
            "n_gpu_layers", "split_mode",
            "main_gpu", "no_kv_offload", "flash_attn",
            "tensor_split", "use_mmap", "embeddings",
            "n_prompt", "n_gen", "test_time",
            "avg_ns", "stddev_ns",
-            "avg_ts", "stddev_ts"
+            "avg_ts", "stddev_ts",
        };
        return fields;
    }
@@ -887,7 +963,7 @@ struct test {

    static field_type get_field_type(const std::string & field) {
        if (field == "build_number" || field == "n_batch" || field == "n_ubatch" ||
-            field == "n_threads" ||
+            field == "n_threads" || field == "poll" ||
            field == "model_size" || field == "model_n_params" ||
            field == "n_gpu_layers" || field == "main_gpu" ||
            field == "n_prompt" || field == "n_gen" ||
@@ -896,6 +972,7 @@ struct test {
        }
        if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
            field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
+            field == "cpu_strict" ||
            field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
            return BOOL;
        }
@@ -928,7 +1005,8 @@ struct test {
            cpu_info, gpu_info,
            model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
            std::to_string(n_batch), std::to_string(n_ubatch),
-            std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
+            std::to_string(n_threads), cpu_mask, std::to_string(cpu_strict), std::to_string(poll),
+            ggml_type_name(type_k), ggml_type_name(type_v),
            std::to_string(n_gpu_layers), split_mode_str(split_mode),
            std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn),
            tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
@@ -1067,7 +1145,7 @@ struct markdown_printer : public printer {
            return -30;
        }
        if (field == "t/s") {
-            return 16;
+            return 20;
        }
        if (field == "size" || field == "params") {
            return 10;
@@ -1149,6 +1227,15 @@ struct markdown_printer : public printer {
        if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {
            fields.emplace_back("n_threads");
        }
+        if (params.cpu_mask.size() > 1 || params.cpu_mask != cmd_params_defaults.cpu_mask) {
+            fields.emplace_back("cpu_mask");
+        }
+        if (params.cpu_strict.size() > 1 || params.cpu_strict != cmd_params_defaults.cpu_strict) {
+            fields.emplace_back("cpu_strict");
+        }
+        if (params.poll.size() > 1 || params.poll != cmd_params_defaults.poll) {
+            fields.emplace_back("poll");
+        }
        if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
            fields.emplace_back("n_batch");
        }
@@ -1383,6 +1470,8 @@ int main(int argc, char ** argv) {
    llama_backend_init();
    llama_numa_init(params.numa);

+    set_process_priority(params.prio);
+
    // initialize printer
    std::unique_ptr<printer> p = create_printer(params.output_format);
    std::unique_ptr<printer> p_err = create_printer(params.output_format_stderr);
@@ -1428,6 +1517,28 @@ int main(int argc, char ** argv) {

        llama_kv_cache_clear(ctx);

+        // cool off before the test
+        if (params.delay) {
+            std::this_thread::sleep_for(std::chrono::seconds(params.delay));
+        }
+
+        struct ggml_threadpool_params tpp = ggml_threadpool_params_default(t.n_threads);
+        if (!parse_cpu_mask(t.cpu_mask, tpp.cpumask)) {
+            LOG_TEE("%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str());
+            exit(1);
+        }
+        tpp.strict_cpu = t.cpu_strict;
+        tpp.poll       = t.poll;
+        tpp.prio       = params.prio;
+
+        struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
+        if (!threadpool) {
+            LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
+            exit(1);
+        }
+
+        llama_attach_threadpool(ctx, threadpool, NULL);
+
        // warmup run
        if (t.n_prompt > 0) {
            //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
@@ -1463,9 +1574,11 @@ int main(int argc, char ** argv) {
            fflush(p_err->fout);
        }

-        llama_print_timings(ctx);
+        llama_print_timings(ctx, nullptr);

        llama_free(ctx);
+
+        ggml_threadpool_free(threadpool);
    }

    llama_free_model(lmodel);
--- a/examples/llama.android/llama/src/main/cpp/llama-android.cpp
+++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp
@@ -120,8 +120,8 @@ Java_android_llama_cpp_LLamaAndroid_new_1context(JNIEnv *env, jobject, jlong jmo
    LOGi("Using %d threads", n_threads);

    llama_context_params ctx_params = llama_context_default_params();
-    ctx_params.seed  = 1234;
-    ctx_params.n_ctx = 2048;
+
+    ctx_params.n_ctx           = 2048;
    ctx_params.n_threads       = n_threads;
    ctx_params.n_threads_batch = n_threads;

@@ -380,11 +380,13 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
        JNIEnv * env,
        jobject,
        jlong context_pointer,
+        jlong sampling_pointer,
        jlong batch_pointer,
        jint n_len,
        jobject intvar_ncur
 ) {
    const auto context = reinterpret_cast<llama_context *>(context_pointer);
+    const auto sampling = reinterpret_cast<llama_sampling *>(sampling_pointer);
    const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
    const auto model = llama_get_model(context);

@@ -392,20 +394,12 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
    if (!la_int_var_value) la_int_var_value = env->GetMethodID(la_int_var, "getValue", "()I");
    if (!la_int_var_inc) la_int_var_inc = env->GetMethodID(la_int_var, "inc", "()V");

-    auto n_vocab = llama_n_vocab(model);
-    auto logits = llama_get_logits_ith(context, batch->n_tokens - 1);
+    const auto * logits = llama_get_logits_ith(context, batch->n_tokens - 1);

-    std::vector<llama_token_data> candidates;
-    candidates.reserve(n_vocab);
-
-    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-        candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
-    }
-
-    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+    llama_sampling_set_logits(sampling, logits);

    // sample the most likely token
-    const auto new_token_id = llama_sample_token_greedy(context, &candidates_p);
+    const auto new_token_id = llama_sampling_sample_greedy(sampling, nullptr);

    const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
    if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
--- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@@ -24,6 +24,7 @@ func llama_batch_add(_ batch: inout llama_batch, _ id: llama_token, _ pos: llama
 actor LlamaContext {
    private var model: OpaquePointer
    private var context: OpaquePointer
+    private var sampling: OpaquePointer
    private var batch: llama_batch
    private var tokens_list: [llama_token]
    var is_done: Bool = false
@@ -42,9 +43,11 @@ actor LlamaContext {
        self.tokens_list = []
        self.batch = llama_batch_init(512, 0, 1)
        self.temporary_invalid_cchars = []
+        self.sampling = llama_sampling_init(context, llama_sampling_default_params())
    }

    deinit {
+        llama_sampling_free(sampling)
        llama_batch_free(batch)
        llama_free(context)
        llama_free_model(model)
@@ -69,10 +72,9 @@ actor LlamaContext {
        print("Using \(n_threads) threads")

        var ctx_params = llama_context_default_params()
-        ctx_params.seed  = 1234
        ctx_params.n_ctx = 2048
-        ctx_params.n_threads       = UInt32(n_threads)
-        ctx_params.n_threads_batch = UInt32(n_threads)
+        ctx_params.n_threads       = Int32(n_threads)
+        ctx_params.n_threads_batch = Int32(n_threads)

        let context = llama_new_context_with_model(model, ctx_params)
        guard let context else {
@@ -147,17 +149,9 @@ actor LlamaContext {
        let n_vocab = llama_n_vocab(model)
        let logits = llama_get_logits_ith(context, batch.n_tokens - 1)

-        var candidates = Array<llama_token_data>()
-        candidates.reserveCapacity(Int(n_vocab))
+        llama_sampling_set_logits(sampling, logits);

-        for token_id in 0..<n_vocab {
-            candidates.append(llama_token_data(id: token_id, logit: logits![Int(token_id)], p: 0.0))
-        }
-        candidates.withUnsafeMutableBufferPointer() { buffer in
-            var candidates_p = llama_token_data_array(data: buffer.baseAddress, size: buffer.count, sorted: false)
-
-            new_token_id = llama_sample_token_greedy(context, &candidates_p)
-        }
+        new_token_id = llama_sampling_sample_greedy(sampling, nil)

        if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
            print("\n")
--- a/examples/llava/README-minicpmv2.5.md
+++ b/examples/llava/README-minicpmv2.5.md
@@ -15,9 +15,9 @@ cd llama.cpp
 Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) by us)

 ```bash
-python ./examples/minicpmv/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
-python ./examples/minicpmv/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5
-python ./convert-hf-to-gguf.py ../MiniCPM-Llama3-V-2_5/model
+python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
+python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2
+python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model

 # quantize int4 version
 ./llama-quantize ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf Q4_K_M
--- a/examples/llava/README-minicpmv2.6.md
+++ b/examples/llava/README-minicpmv2.6.md
@@ -0,0 +1,107 @@
+## MiniCPM-V 2.6
+
+### Prepare models and code
+
+Download the [MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6) PyTorch model from Hugging Face into a folder named "MiniCPM-V-2_6".
+
+Clone llama.cpp:
+```bash
+git clone git@github.com:OpenBMB/llama.cpp.git
+cd llama.cpp
+git checkout minicpmv-main
+```
+
+### Usage of MiniCPM-V 2.6
+
+Convert the PyTorch model to gguf files (you can also download the [gguf](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) files that we have already converted):
+
+```bash
+python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-V-2_6
+python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-2_6 --minicpmv-projector ../MiniCPM-V-2_6/minicpmv.projector --output-dir ../MiniCPM-V-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 3
+python ./convert_hf_to_gguf.py ../MiniCPM-V-2_6/model
+
+# quantize int4 version
+./llama-quantize ../MiniCPM-V-2_6/model/ggml-model-f16.gguf ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M
+```
+
+Build for Linux or Mac
+
+```bash
+make
+make llama-minicpmv-cli
+```
+
+Inference on Linux or Mac
+```
+# run f16 version
+./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
+
+# run quantized int4 version
+./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg  -p "What is in the image?"
+
+# or run in interactive mode
+./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i
+```
+
+### Video
+Install FFmpeg
+```
+brew install ffmpeg
+brew install pkg-config
+```
+
+### Android
+
+#### Build on Android device using Termux
+We found that building on the Android device itself gives better runtime performance, so we recommend building on the device.
+
+[Termux](https://github.com/termux/termux-app#installation) is a terminal app on Android device (no root required).
+
+Install tools in Termux:
+```
+apt update && apt upgrade -y
+apt install git make cmake
+```
+
+It's recommended to move your model inside the `~/` directory for best performance:
+```
+cd storage/downloads
+mv model.gguf ~/
+```
+
+#### Building the Project using Android NDK
+Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.
+
+Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux:
+
+```bash
+mkdir build-android
+cd build-android
+export NDK=/your_ndk_path
+cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
+make
+```
+
+Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (on Android 11+, run the command twice).
+
+Finally, copy the built `llama` binaries and the model file to your device storage. Because file permissions on the Android sdcard cannot be changed, copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then run the following commands in Termux to make them executable:
+
+(This assumes that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`.)
+```
+$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/
+$cd /data/data/com.termux/files/home/bin
+$chmod +x ./*
+```
+
+Download the models and push them to `/sdcard/llama.cpp/`, then move them to `/data/data/com.termux/files/home/model/`:
+
+```
+$mv /sdcard/llama.cpp/ggml-model-Q4_K_M.gguf /data/data/com.termux/files/home/model/
+$mv /sdcard/llama.cpp/mmproj-model-f16.gguf /data/data/com.termux/files/home/model/
+```
+
+Now, you can start chatting:
+```
+$cd /data/data/com.termux/files/home/bin
+$./llama-minicpmv-cli -m ../model/ggml-model-Q4_K_M.gguf --mmproj ../model/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg  -p "What is in the image?"
+```
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -20,6 +20,10 @@
 #include "ggml-cann.h"
 #endif

+#ifdef GGML_USE_VULKAN
+#include "ggml-vulkan.h"
+#endif
+
 #define STB_IMAGE_IMPLEMENTATION
 #include "stb_image.h"

@@ -81,6 +85,7 @@ static std::string format(const char * fmt, ...) {
 #define KEY_HAS_VIS_ENC         "clip.has_vision_encoder"
 #define KEY_HAS_LLAVA_PROJ      "clip.has_llava_projector"
 #define KEY_HAS_MINICPMV_PROJ   "clip.has_minicpmv_projector"
+#define KEY_MINICPMV_VERSION    "clip.minicpmv_version"
 #define KEY_USE_GELU            "clip.use_gelu"
 #define KEY_N_EMBD              "clip.%s.embedding_length"
 #define KEY_N_FF                "clip.%s.feed_forward_length"
@@ -211,13 +216,19 @@ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int

 static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
    if (search.empty()) {
-        return; // Avoid infinite loop if 'search' is an empty string
+        return;
    }
+    std::string builder;
+    builder.reserve(s.length());
    size_t pos = 0;
-    while ((pos = s.find(search, pos)) != std::string::npos) {
-        s.replace(pos, search.length(), replace);
-        pos += replace.length();
+    size_t last_pos = 0;
+    while ((pos = s.find(search, last_pos)) != std::string::npos) {
+        builder.append(s, last_pos, pos - last_pos);
+        builder.append(replace);
+        last_pos = pos + search.length();
    }
+    builder.append(s, last_pos, std::string::npos);
+    s = std::move(builder);
 }

 static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
@@ -526,6 +537,7 @@ struct clip_ctx {
    bool has_vision_encoder  = false;
    bool has_llava_projector = false;
    bool has_minicpmv_projector = false;
+    int minicpmv_version = 2;

    struct clip_vision_model vision_model;
    projector_type proj_type = PROJECTOR_TYPE_MLP;
@@ -641,7 +653,12 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
    if (ctx->has_minicpmv_projector) {
        int pos_w = image_size_width/patch_size;
        int pos_h = image_size_height/patch_size;
-        pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1);
+        if (ctx->minicpmv_version == 2) {
+            pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1);
+        }
+        else if (ctx->minicpmv_version == 3) {
+            pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
+        }
        ggml_set_name(pos_embed, "pos_embed");
        ggml_set_input(pos_embed);
    }
@@ -768,8 +785,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
            embeddings = ggml_gelu(ctx0, embeddings);
            embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
            embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
-
-        } else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
+        }
+        else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
            embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
            embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
            // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
@@ -949,10 +966,20 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
            }

            { // attention
-                const int hidden_size = 4096;
+                int hidden_size = 4096;
                const int d_head = 128;
-                const int n_head = hidden_size/d_head;
-                const int num_query = 96;
+                int n_head = hidden_size/d_head;
+                int num_query = 96;
+                if (ctx->minicpmv_version == 2) {
+                    hidden_size = 4096;
+                    n_head = hidden_size/d_head;
+                    num_query = 96;
+                }
+                else if (ctx->minicpmv_version == 3) {
+                    hidden_size = 3584;
+                    n_head = hidden_size/d_head;
+                    num_query = 64;
+                }

                struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b);
                Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
@@ -1091,7 +1118,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        }
    }

-    clip_ctx * new_clip = new clip_ctx;
+    clip_ctx * new_clip = new clip_ctx{};

    // update projector type
    {
@@ -1125,6 +1152,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
    LOG_TEE("%s: CLIP using CANN backend\n", __func__);
 #endif

+#ifdef GGML_USE_VULKAN
+    new_clip->backend = ggml_backend_vk_init(0);
+    LOG_TEE("%s: CLIP using Vulkan backend\n", __func__);
+#endif

    if (!new_clip->backend) {
        new_clip->backend = ggml_backend_cpu_init();
@@ -1149,6 +1180,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            new_clip->has_minicpmv_projector = gguf_get_val_bool(ctx, idx);
        }

+        idx = gguf_find_key(ctx, KEY_MINICPMV_VERSION);
+        if (idx != -1) {
+            new_clip->minicpmv_version = gguf_get_val_i32(ctx, idx);
+        }
+
        // GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search

        GGML_ASSERT(new_clip->has_vision_encoder);
@@ -1587,7 +1623,7 @@ static void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32*
    }
 }

-inline float clip(float x, float lower, float upper) {
+inline int clip(int x, int lower, int upper) {
    return std::max(lower, std::min(x, upper));
 }

@@ -1791,10 +1827,6 @@ static std::pair<int, int> uhd_get_refine_size(std::pair<int, int> original_size
    return refine_size;
 }

-inline int clip(int x, int lower, int upper) {
-    return std::max(lower, std::min(x, upper));
-}
-
 static std::pair<int, int> uhd_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) {
    std::vector<int> candidate_split_grids_nums;
    for (int i : {multiple - 1, multiple, multiple + 1}) {
@@ -1910,10 +1942,12 @@ int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) {
 // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
 // res_imgs memory is being allocated here, previous allocations will be freed if found
 bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {
-    if (clip_is_minicpmv(ctx)) {
-        std::vector<std::vector<clip_image_u8 *>> imgs = uhd_slice_image(img);
+
+    if(clip_is_minicpmv(ctx)){
+        int max_slice_nums = 9;
+        std::vector<std::vector<clip_image_u8 *>> imgs = uhd_slice_image(img, max_slice_nums);
        res_imgs->size = 0;
-        for (size_t i = 0; i < imgs.size(); ++i) {
+        for (size_t i = 0; i < imgs.size(); ++i){
            res_imgs->size += imgs[i].size();
        }
        res_imgs->data = new clip_image_f32[res_imgs->size];
@@ -2146,7 +2180,12 @@ int clip_n_patches(const struct clip_ctx * ctx) {
    if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
        n_patches /= 4;
    } else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
-        n_patches = 96;
+        if (ctx->minicpmv_version == 2) {
+            n_patches = 96;
+        }
+        else if (ctx->minicpmv_version == 3) {
+            n_patches = 64;
+        }
    }

    return n_patches;
@@ -2282,6 +2321,11 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
    const int patch_size    = hparams.patch_size;
    const int num_patches   = ((image_size_width / patch_size) * (image_size_height / patch_size));
    const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
+    if(ctx->load_image_size==nullptr){
+        ctx->load_image_size= clip_image_size_init();
+    }
+    const int pos_w = ctx->load_image_size->width/patch_size;
+    const int pos_h = ctx->load_image_size->height/patch_size;

    {
        struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
@@ -2316,8 +2360,18 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
            //    -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
            struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
            int* positions_data = (int*)malloc(ggml_nbytes(positions));
-            for (int i = 0; i < num_positions; i++) {
-                positions_data[i] = std::floor(70.0*i/num_positions);
+            int bucket_coords_h[70];
+            int bucket_coords_w[70];
+            for (int i = 0; i < pos_h; i++){
+                bucket_coords_h[i] = std::floor(70.0*i/pos_h);
+            }
+            for (int i = 0; i < pos_w; i++){
+                bucket_coords_w[i] = std::floor(70.0*i/pos_w);
+            }
+            for (int i = 0, id = 0; i < pos_h; i++){
+                for (int j = 0; j < pos_w; j++){
+                    positions_data[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j];
+                }
            }
            ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
            free(positions_data);
@@ -2328,12 +2382,13 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
            //    -> https://huggingface.co/Qwen/Qwen-VL/tree/main
            //    -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
            struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed");
-            if(ctx->load_image_size==nullptr){
-                ctx->load_image_size= clip_image_size_init();
-            }
-            int pos_w = ctx->load_image_size->width/patch_size;
-            int pos_h = ctx->load_image_size->height/patch_size;
            int embed_dim = 4096;
+            if (ctx->minicpmv_version == 2) {
+                embed_dim = 4096;
+            }
+            else if (ctx->minicpmv_version == 3) {
+                embed_dim = 3584;
+            }
            auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));

            float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed));
@@ -2346,7 +2401,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
            ggml_backend_tensor_set(pos_embed, pos_embed_data, 0, ggml_nbytes(pos_embed));
            free(pos_embed_data);
        }
-    } else {
+    }
+    else{
        {
            if (ctx->has_class_embedding) {
                struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
@@ -2548,13 +2604,21 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
        return ctx->vision_model.mm_3_b->ne[0];
    }
    if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
-        return 4096;
+        if (ctx->minicpmv_version == 2) {
+            return 4096;
+        }
+        else if (ctx->minicpmv_version == 3) {
+            return 3584;
+        }
    }

    std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
    throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
 }

-bool clip_is_minicpmv(const struct clip_ctx * ctx) {
-    return ctx->has_minicpmv_projector;
+int clip_is_minicpmv(const struct clip_ctx * ctx) {
+    if (ctx->has_minicpmv_projector) {
+        return ctx->minicpmv_version;
+    }
+    return 0;
 }
--- a/examples/llava/clip.h
+++ b/examples/llava/clip.h
@@ -85,7 +85,7 @@ CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, cons

 CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);

-CLIP_API bool clip_is_minicpmv(const struct clip_ctx * ctx);
+CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);

 #ifdef __cplusplus
 }
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -40,11 +40,11 @@ static bool eval_string(struct llama_context * ctx_llama, const char* str, int n
    return true;
 }

-static const char * sample(struct llama_sampling_context * ctx_sampling,
+static const char * sample(struct llama_sampling * smpl,
                           struct llama_context * ctx_llama,
                           int * n_past) {
-    const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL);
-    llama_sampling_accept(ctx_sampling, ctx_llama, id, true);
+    const llama_token id = llama_sampling_sample(smpl, ctx_llama, -1);
+    llama_sampling_accept(smpl, id, true);
    static std::string ret;
    if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
        ret = "</s>";
@@ -129,14 +129,14 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para
        if (!params->image.empty()) {
            LOG_TEE("using base64 encoded image instead of command line image path\n");
        }
-        embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads, prompt);
+        embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt);
        if (!embed) {
            LOG_TEE("%s: can't load image from prompt\n", __func__);
            return NULL;
        }
        params->prompt = remove_image_from_prompt(prompt);
    } else {
-        embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, fname.c_str());
+        embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->cpuparams.n_threads, fname.c_str());
        if (!embed) {
            fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
            return NULL;
@@ -191,15 +191,15 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_

    LOG_TEE("\n");

-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
-    if (!ctx_sampling) {
+    struct llama_sampling * smpl = llama_sampling_init(ctx_llava->model, params->sparams);
+    if (!smpl) {
        fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
        exit(1);
    }

    std::string response = "";
    for (int i = 0; i < max_tgt_len; i++) {
-        const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
+        const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past);
        response += tmp;
        if (strcmp(tmp, "</s>") == 0) break;
        if (strstr(tmp, "###")) break; // Yi-VL behavior
@@ -211,7 +211,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
        fflush(stdout);
    }

-    llama_sampling_free(ctx_sampling);
+    llama_sampling_free(smpl);
    printf("\n");
 }

@@ -310,7 +310,7 @@ int main(int argc, char ** argv) {
        // process the prompt
        process_prompt(ctx_llava, image_embed, &params, params.prompt);

-        llama_print_timings(ctx_llava->ctx_llama);
+        llama_print_timings(ctx_llava->ctx_llama, nullptr);
        llava_image_embed_free(image_embed);
        ctx_llava->model = NULL;
        llava_free(ctx_llava);
@@ -327,7 +327,7 @@ int main(int argc, char ** argv) {
            // process the prompt
            process_prompt(ctx_llava, image_embed, &params, params.prompt);

-            llama_print_timings(ctx_llava->ctx_llama);
+            llama_print_timings(ctx_llava->ctx_llama, nullptr);
            llava_image_embed_free(image_embed);
            ctx_llava->model = NULL;
            llava_free(ctx_llava);
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -256,7 +256,14 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
            load_image_size->width = img_res_v.data[i].nx;
            load_image_size->height = img_res_v.data[i].ny;
            clip_add_load_image_size(ctx_clip, load_image_size);
-            const bool encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
+            bool encoded = false;
+            int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
+            if (has_minicpmv_projector == 2) {
+                encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
+            }
+            else if (has_minicpmv_projector == 3) {
+                encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
+            }
            if (!encoded) {
                LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
                return false;
--- a/examples/llava/minicpmv-cli.cpp
+++ b/examples/llava/minicpmv-cli.cpp
@@ -134,7 +134,13 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e
    std::string system_prompt;
    int idx = 0;
    int num_image_embeds = embeds->n_image_pos / clip_n_patches(ctx_llava->ctx_clip);
-    system_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n";
+    int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
+    if (has_minicpmv_projector == 2) {
+        system_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n";
+    }
+    else if (has_minicpmv_projector == 3) {
+        system_prompt = "<|im_start|>user\n";
+    }
    LOG_TEE("%s: image token past: %d\n", __func__, n_past);
    eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, false);
    process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
@@ -157,11 +163,11 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e
    LOG_TEE("%s: image token past: %d\n", __func__, n_past);
 }

-static const char * sample(struct llama_sampling_context * ctx_sampling,
+static const char * sample(struct llama_sampling * smpl,
                           struct llama_context * ctx_llama,
                           int * n_past) {
-    const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL);
-    llama_sampling_accept(ctx_sampling, ctx_llama, id, true);
+    const llama_token id = llama_sampling_sample(smpl, ctx_llama, -1);
+    llama_sampling_accept(smpl, id, true);
    static std::string ret;
    if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
        ret = "</s>";
@@ -174,7 +180,7 @@ static const char * sample(struct llama_sampling_context * ctx_sampling,

 static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){
    auto ctx_clip = clip_init_context(params);
-    auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->n_threads, fname.c_str());
+    auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str());
    if (!embeds) {
        std::cerr << "error: failed to load image " << fname << ". Terminating\n\n";
        return NULL;
@@ -208,23 +214,37 @@ static struct llava_context * minicpmv_init(gpt_params * params, const std::stri
    return ctx_llava;
 }

-static struct llama_sampling_context * llama_init(struct llava_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){
+static struct llama_sampling * llama_init(struct llava_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){
    std::string user_prompt = prompt;
-    if (!is_first) user_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + prompt;
+    int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
+    if (!is_first) {
+        if (has_minicpmv_projector == 2) {
+            user_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + prompt;
+        }
+        else if (has_minicpmv_projector == 3) {
+            user_prompt = "<|im_start|>user\n" + prompt;
+        }
+    }

    eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
-    eval_string(ctx_llava->ctx_llama, "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", params->n_batch, &n_past, false);
+    if (has_minicpmv_projector == 2) {
+        eval_string(ctx_llava->ctx_llama, "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", params->n_batch, &n_past, false);
+    }
+    else if (has_minicpmv_projector == 3) {
+        eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false);
+    }
+
    // generate the response

    LOG_TEE("\n");

-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
-    return ctx_sampling;
+    struct llama_sampling * smpl = llama_sampling_init(ctx_llava->model, params->sparams);
+    return smpl;
 }

-static const char * llama_loop(struct llava_context * ctx_llava,struct llama_sampling_context * ctx_sampling, int &n_past){
+static const char * llama_loop(struct llava_context * ctx_llava,struct llama_sampling * smpl, int &n_past){

-    const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
+    const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past);
    return tmp;
 }

@@ -258,12 +278,12 @@ int main(int argc, char ** argv) {
        if (!params.prompt.empty()) {
            LOG_TEE("<user>%s\n", params.prompt.c_str());
            LOG_TEE("<assistant>");
-            auto ctx_sampling = llama_init(ctx_llava, &params, params.prompt.c_str(), n_past, true);
+            auto smpl = llama_init(ctx_llava, &params, params.prompt.c_str(), n_past, true);
            const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
            std::string response = "";
            bool have_tmp = false;
            for (int i = 0; i < max_tgt_len; i++) {
-                auto tmp = llama_loop(ctx_llava, ctx_sampling, n_past);
+                auto tmp = llama_loop(ctx_llava, smpl, n_past);
                response += tmp;
                if (strcmp(tmp, "</s>") == 0){
                    if(!have_tmp)continue;
@@ -276,18 +296,18 @@ int main(int argc, char ** argv) {

                fflush(stdout);
            }
-            llama_sampling_free(ctx_sampling);
+            llama_sampling_free(smpl);
        }else {
            while (true) {
                LOG_TEE("<user>");
                std::string prompt;
                std::getline(std::cin, prompt);
                LOG_TEE("<assistant>");
-                auto ctx_sampling = llama_init(ctx_llava, &params, prompt, n_past, true);
+                auto smpl = llama_init(ctx_llava, &params, prompt, n_past, true);
                const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
                std::string response = "";
                for (int i = 0; i < max_tgt_len; i++) {
-                    auto tmp = llama_loop(ctx_llava, ctx_sampling, n_past);
+                    auto tmp = llama_loop(ctx_llava, smpl, n_past);
                    response += tmp;
                    if (strcmp(tmp, "</s>") == 0) break;
                    if (strstr(tmp, "###")) break; // Yi-VL behavior
@@ -295,11 +315,11 @@ int main(int argc, char ** argv) {
                    if (strstr(response.c_str(), "<user>")) break; // minicpm-v
                    fflush(stdout);
                }
-                llama_sampling_free(ctx_sampling);
+                llama_sampling_free(smpl);
            }
        }
        printf("\n");
-        llama_print_timings(ctx_llava->ctx_llama);
+        llama_print_timings(ctx_llava->ctx_llama, nullptr);

        ctx_llava->model = NULL;
        llava_free(ctx_llava);
--- a/examples/llava/minicpmv-convert-image-encoder-to-gguf.py
+++ b/examples/llava/minicpmv-convert-image-encoder-to-gguf.py
@@ -1,9 +1,416 @@
-import argparse
+# coding=utf-8
+# Copyright 2024 Google AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch Siglip model. """
+# Copied from HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit, with tgt_sizes added
+
+
 import os
+import math
+import warnings
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn.init import _calculate_fan_in_and_fan_out
+
+from transformers.activations import ACT2FN
+from transformers.modeling_utils import PreTrainedModel
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import (
+    logging,
+)
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+class SiglipVisionConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`SiglipVisionModel`]. It is used to instantiate a
+    Siglip vision encoder according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the vision encoder of the Siglip
+    [google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_channels (`int`, *optional*, defaults to 3):
+            Number of channels in the input images.
+        image_size (`int`, *optional*, defaults to 224):
+            The size (resolution) of each image.
+        patch_size (`int`, *optional*, defaults to 16):
+            The size (resolution) of each patch.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the layer normalization layers.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+    Example:
+    ```python
+    >>> from transformers import SiglipVisionConfig, SiglipVisionModel
+    >>> # Initializing a SiglipVisionConfig with google/siglip-base-patch16-224 style configuration
+    >>> configuration = SiglipVisionConfig()
+    >>> # Initializing a SiglipVisionModel (with random weights) from the google/siglip-base-patch16-224 style configuration
+    >>> model = SiglipVisionModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "siglip_vision_model"
+
+    def __init__(
+        self,
+        hidden_size=768,
+        intermediate_size=3072,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        num_channels=3,
+        image_size=224,
+        patch_size=16,
+        hidden_act="gelu_pytorch_tanh",
+        layer_norm_eps=1e-6,
+        attention_dropout=0.0,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_channels = num_channels
+        self.patch_size = patch_size
+        self.image_size = image_size
+        self.attention_dropout = attention_dropout
+        self.layer_norm_eps = layer_norm_eps
+        self.hidden_act = hidden_act
+
+_CHECKPOINT_FOR_DOC = "google/siglip-base-patch16-224"
+
+SIGLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [
+    "google/siglip-base-patch16-224",
+    # See all SigLIP models at https://huggingface.co/models?filter=siglip
+]
+
+# Copied from transformers.models.llama.modeling_llama._get_unpad_data
+def _get_unpad_data(attention_mask):  # derive unpadding metadata from a mask; presumably (batch, seq_len) with non-zero = keep — confirm against caller
+    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)  # per-row sum of the mask over its last dim, as int32
+    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()  # flat positions of the non-zero mask entries
+    max_seqlen_in_batch = seqlens_in_batch.max().item()  # largest per-row sum, as a Python number
+    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))  # running total of the per-row sums with one leading 0 prepended
+    return (  # tuple order: (indices, cu_seqlens, max_seqlen_in_batch)
+        indices,
+        cu_seqlens,
+        max_seqlen_in_batch,
+    )
+
+
+def _trunc_normal_(tensor, mean, std, a, b):  # fills `tensor` with truncated-normal samples limited to [a, b]; has no return statement
+    # Cut & paste from PyTorch official master until it's in a few official releases - RW
+    # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
+    def norm_cdf(x):
+        # Computes standard normal cumulative distribution function
+        return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0
+
+    if (mean < a - 2 * std) or (mean > b + 2 * std):  # warn only; sampling still proceeds
+        warnings.warn(
+            "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
+            "The distribution of values may be incorrect.",
+            stacklevel=2,
+        )
+
+    # Values are generated by using a truncated uniform distribution and
+    # then using the inverse CDF for the normal distribution.
+    # Get upper and lower cdf values
+    l = norm_cdf((a - mean) / std)  # CDF value at the lower cutoff
+    u = norm_cdf((b - mean) / std)  # CDF value at the upper cutoff
+
+    # Uniformly fill tensor with values from [l, u], then translate to
+    # [2l-1, 2u-1].
+    tensor.uniform_(2 * l - 1, 2 * u - 1)
+
+    # Use inverse cdf transform for normal distribution to get truncated
+    # standard normal
+    if tensor.dtype in [torch.float16, torch.bfloat16]:
+        # The `erfinv_` op is not (yet?) defined in float16+cpu, bfloat16+gpu
+        og_dtype = tensor.dtype
+        tensor = tensor.to(torch.float32)  # NOTE(review): rebinds the local name; .to() with a different dtype yields a new tensor, so the in-place ops from here on do not appear to reach the caller's tensor — confirm intent
+        tensor.erfinv_()
+        tensor = tensor.to(og_dtype)  # cast back to the dtype the argument arrived with
+    else:
+        tensor.erfinv_()
+
+    # Transform to proper mean, std
+    tensor.mul_(std * math.sqrt(2.0))
+    tensor.add_(mean)
+
+    # Clamp to ensure it's in the proper range
+    if tensor.dtype == torch.float16:  # only float16 takes the float32 round-trip here; bfloat16 is clamped directly
+        # The `clamp_` op is not (yet?) defined in float16+cpu
+        tensor = tensor.to(torch.float32)  # NOTE(review): same rebinding caveat as in the erfinv branch above
+        tensor.clamp_(min=a, max=b)
+        tensor = tensor.to(torch.float16)
+    else:
+        tensor.clamp_(min=a, max=b)
+
+
+def trunc_normal_tf_(
+    tensor: torch.Tensor, mean: float = 0.0, std: float = 1.0, a: float = -2.0, b: float = 2.0
+):
+    """Fills the input Tensor with values drawn from a truncated
+    normal distribution. The values are effectively drawn from the
+    normal distribution :math:`\\mathcal{N}(\\text{mean}, \\text{std}^2)`
+    with values outside :math:`[a, b]` redrawn until they are within
+    the bounds. The method used for generating the random values works
+    best when :math:`a \\leq \\text{mean} \\leq b`.
+    NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the
+    bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0
+    and the result is subsequently scaled and shifted by the mean and std args.
+    Args:
+        tensor: an n-dimensional `torch.Tensor`
+        mean: the mean of the normal distribution
+        std: the standard deviation of the normal distribution
+        a: the minimum cutoff value
+        b: the maximum cutoff value
+    """
+    with torch.no_grad():  # keep the in-place fill out of autograd tracking
+        _trunc_normal_(tensor, 0, 1.0, a, b)  # sample with mean 0 / std 1.0 and the cutoffs [a, b]
+        tensor.mul_(std).add_(mean)  # then scale by std and shift by mean, in place
+
+
+def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"):  # fills `tensor` with samples whose variance is scale / denom
+    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
+    denom = fan_in  # default; any `mode` not matched below also ends up using fan_in (no error is raised)
+    if mode == "fan_in":
+        denom = fan_in
+    elif mode == "fan_out":
+        denom = fan_out
+    elif mode == "fan_avg":
+        denom = (fan_in + fan_out) / 2
+
+    variance = scale / denom  # target variance of the samples
+
+    if distribution == "truncated_normal":
+        # constant is stddev of standard normal truncated to (-2, 2)
+        trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978)
+    elif distribution == "normal":
+        with torch.no_grad():
+            tensor.normal_(std=math.sqrt(variance))
+    elif distribution == "uniform":
+        bound = math.sqrt(3 * variance)  # the uniform distribution on [-bound, bound] has variance bound**2 / 3
+        with torch.no_grad():
+            tensor.uniform_(-bound, bound)
+    else:
+        raise ValueError(f"invalid distribution {distribution}")  # unlike `mode`, an unknown distribution is rejected
+
+
+def lecun_normal_(tensor):  # fan-in variance scaling (default scale 1.0) with a truncated-normal draw
+    variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal")
+
+
+def default_flax_embed_init(tensor):  # fan-in variance scaling (default scale 1.0) with an untruncated normal draw
+    variance_scaling_(tensor, mode="fan_in", distribution="normal")
+
+class SiglipVisionEmbeddings(nn.Module):  # declares the patch and position embedding parameters; no forward() is defined in this class
+    def __init__(self, config: SiglipVisionConfig):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+
+        self.patch_embedding = nn.Conv2d(  # stride equals kernel size, so patches do not overlap
+            in_channels=config.num_channels,
+            out_channels=self.embed_dim,
+            kernel_size=self.patch_size,
+            stride=self.patch_size,
+            padding="valid",
+        )
+
+        self.num_patches_per_side = self.image_size // self.patch_size  # floor division: any remainder of image_size is dropped
+        self.num_patches = self.num_patches_per_side**2  # square grid of patches
+        self.num_positions = self.num_patches  # one position slot per patch, no extra slot
+        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
+
+class SiglipAttention(nn.Module):  # only __init__ is defined here: the class declares the projection layers, no forward()
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    # Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+        if self.head_dim * self.num_heads != self.embed_dim:  # reject configs where the floor division above lost a remainder
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {self.num_heads})."
+            )
+        self.scale = self.head_dim**-0.5  # 1 / sqrt(head_dim)
+        self.dropout = config.attention_dropout
+
+        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)  # all four projections are square: embed_dim -> embed_dim
+        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
+
+# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Siglip
+class SiglipMLP(nn.Module):  # declares the two linear layers and the activation; no forward() is defined in this class
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.activation_fn = ACT2FN[config.hidden_act]  # activation looked up by name; an unknown name fails at this lookup
+        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)  # hidden_size -> intermediate_size
+        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)  # intermediate_size -> hidden_size
+
+
+
+# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->Siglip
+class SiglipEncoderLayer(nn.Module):  # declares attention, MLP and two LayerNorms; no forward() is defined in this class
+    def __init__(self, config: SiglipVisionConfig):
+        super().__init__()
+        self.embed_dim = config.hidden_size
+        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"  # recorded only; it does not affect which attention class is built below
+        self.self_attn = (
+            SiglipAttention(config)
+        )
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+        self.mlp = SiglipMLP(config)
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+
+class SiglipPreTrainedModel(PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = SiglipVisionConfig
+    base_model_prefix = "siglip"
+    supports_gradient_checkpointing = True
+
+    def _init_weights(self, module):
+        """Initialize the weights"""
+
+        if isinstance(module, SiglipVisionEmbeddings):
+            width = self.config.hidden_size
+            nn.init.normal_(module.position_embedding.weight, std=1 / np.sqrt(width))
+        elif isinstance(module, nn.Embedding):
+            default_flax_embed_init(module.weight)
+        elif isinstance(module, SiglipAttention):
+            nn.init.normal_(module.q_proj.weight)
+            nn.init.normal_(module.k_proj.weight)
+            nn.init.normal_(module.v_proj.weight)
+            nn.init.normal_(module.out_proj.weight)
+            nn.init.zeros_(module.q_proj.bias)
+            nn.init.zeros_(module.k_proj.bias)
+            nn.init.zeros_(module.v_proj.bias)
+            nn.init.zeros_(module.out_proj.bias)
+        elif isinstance(module, SiglipMLP):
+            nn.init.normal_(module.fc1.weight)
+            nn.init.normal_(module.fc2.weight)
+            nn.init.normal_(module.fc1.bias, std=1e-6)
+            nn.init.normal_(module.fc2.bias, std=1e-6)
+        elif isinstance(module, (nn.Linear, nn.Conv2d)):
+            lecun_normal_(module.weight)
+            if module.bias is not None:
+                nn.init.zeros_(module.bias)
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+
+SIGLIP_START_DOCSTRING = r"""
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+    Parameters:
+        config ([`SiglipVisionConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+SIGLIP_VISION_INPUTS_DOCSTRING = r"""
+    Args:
+        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
+            [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+# Copied from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->Siglip
+class SiglipEncoder(nn.Module):
+    """
+    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
+    [`SiglipEncoderLayer`].
+    Args:
+        config: SiglipVisionConfig
+    """
+
+    def __init__(self, config: SiglipVisionConfig):
+        super().__init__()
+        self.config = config
+        self.layers = nn.ModuleList([SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+class SiglipVisionTransformer(SiglipPreTrainedModel):
+    config_class = SiglipVisionConfig
+    main_input_name = "pixel_values"
+    _supports_flash_attn_2 = True
+
+    def __init__(self, config: SiglipVisionConfig):
+        super().__init__(config)
+        self.config = config
+        embed_dim = config.hidden_size
+
+        self.embeddings = SiglipVisionEmbeddings(config)
+        self.encoder = SiglipEncoder(config)
+        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self) -> nn.Module:
+        return self.embeddings.patch_embedding
+
+import argparse
 import json
 import re

-import torch
 import numpy as np
 from gguf import *
 from transformers.models.idefics2.modeling_idefics2 import Idefics2VisionTransformer, Idefics2VisionConfig
@@ -94,6 +501,7 @@ default_image_mean = [0.48145466, 0.4578275, 0.40821073]
 default_image_std = [0.26862954, 0.26130258, 0.27577711]
 ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None)
 ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)
+ap.add_argument('--minicpmv_version', type=int, help='minicpmv_version: MiniCPM-V-2 use 1; MiniCPM-V-2.5 use 2; MiniCPM-V-2.6 use 3', default=2)

 # with proper
 args = ap.parse_args()
@@ -135,6 +543,15 @@ if args.use_f32:
 #     model = CLIPModel.from_pretrained(dir_model)
 #     processor = CLIPProcessor.from_pretrained(dir_model)

+minicpmv_version = args.minicpmv_version
+emb_dim = 4096
+if minicpmv_version == 1:
+    emb_dim = 2304
+elif minicpmv_version == 2:
+    emb_dim = 4096
+elif minicpmv_version == 3:
+    emb_dim = 3584
+
 default_vision_config = {
        "hidden_size": 1152,
        "image_size": 980,
@@ -144,8 +561,12 @@ default_vision_config = {
        "num_hidden_layers": 27,
        "patch_size": 14,
    }
+
 vision_config = Idefics2VisionConfig(**default_vision_config)
 model = Idefics2VisionTransformer(vision_config)
+if minicpmv_version == 3:
+    vision_config = SiglipVisionConfig(**default_vision_config)
+    model = SiglipVisionTransformer(vision_config)

 processor = None
 # if model.attn_pool is not None:
@@ -158,6 +579,7 @@ fname_middle = None
 has_text_encoder = True
 has_vision_encoder = True
 has_minicpmv_projector = False
+
 if args.text_only:
    fname_middle = "text-"
    has_vision_encoder = False
@@ -165,6 +587,7 @@ elif args.minicpmv_projector is not None:
    fname_middle = "mmproj-"
    has_text_encoder = False
    has_minicpmv_projector = True
+    # keep the --minicpmv_version given on the command line: emb_dim and the vision model were already derived from it
 elif args.vision_only:
    fname_middle = "vision-"
    has_text_encoder = False
@@ -189,6 +612,7 @@ elif has_minicpmv_projector:
    fout.add_description("image encoder for MiniCPM-V")
    # add projector type
    fout.add_string("clip.projector_type", "resampler")
+    fout.add_int32("clip.minicpmv_version", minicpmv_version)
 else:
    fout.add_description("two-tower CLIP model")

@@ -274,11 +698,11 @@ def _replace_name_resampler(s, v):
    if re.match("resampler.pos_embed", s):
        return {
            s: v,
-            re.sub("pos_embed", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(4096, (70, 70))),
+            re.sub("pos_embed", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(emb_dim, (70, 70))),
        }
    if re.match("resampler.proj", s):
        return {
-            re.sub("proj", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(4096, (70, 70))),
+            re.sub("proj", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(emb_dim, (70, 70))),
            re.sub("proj", "proj.weight", s): v.transpose(-1, -2).contiguous(),
        }
    if re.match("resampler.attn.in_proj_.*", s):
--- a/examples/llava/minicpmv-surgery.py
+++ b/examples/llava/minicpmv-surgery.py
@@ -4,7 +4,7 @@ import torch
 from transformers import AutoModel, AutoTokenizer

 ap = argparse.ArgumentParser()
-ap.add_argument("-m", "--model", help="Path to MiniCPM-V-2.5 model")
+ap.add_argument("-m", "--model", help="Path to MiniCPM-V model")
 args = ap.parse_args()

 # find the model part that includes the the multimodal projector weights
@@ -29,7 +29,6 @@ if len(clip_tensors) > 0:
            f.write("{}\n")

 config = model.llm.config
-config._name_or_path = "openbmb/MiniCPM-Llama3-V-2.5"
 config.auto_map = {
    "AutoConfig": "configuration_minicpm.MiniCPMConfig",
    "AutoModel": "modeling_minicpm.MiniCPMModel",
@@ -40,7 +39,6 @@ config.auto_map = {
 model.llm.save_pretrained(f"{args.model}/model")
 tok = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)
 tok.save_pretrained(f"{args.model}/model")
-# os.system(f"cp {args.model}/modeling_minicpm.py {args.model}/MiniCPM_l3/modeling_minicpm.py")

 print("Done!")
 print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.")
--- a/examples/llava/requirements.txt
+++ b/examples/llava/requirements.txt
@@ -2,4 +2,4 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
 pillow~=10.2.0
 torch~=2.2.1
-torchvision==0.17.1
+torchvision~=0.17.1
--- a/examples/lookahead/lookahead.cpp
+++ b/examples/lookahead/lookahead.cpp
@@ -1,7 +1,6 @@
 #include "common.h"
 #include "llama.h"

-#include <cmath>
 #include <cstdio>
 #include <string>
 #include <vector>
@@ -118,7 +117,7 @@ int main(int argc, char ** argv) {
    llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1);

    // target model sampling context
-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
+    struct llama_sampling * smpl = llama_sampling_init(model, params.sparams);

    // verification n-grams
    std::vector<ngram_data> ngrams_cur(G);
@@ -159,9 +158,9 @@ int main(int argc, char ** argv) {

    // sample first token
    {
-        id = llama_sampling_sample(ctx_sampling, ctx, NULL, 0);
+        id = llama_sampling_sample(smpl, ctx, 0);

-        llama_sampling_accept(ctx_sampling, ctx, id, true);
+        llama_sampling_accept(smpl, id, true);

        {
            const std::string token_str = llama_token_to_piece(ctx, id);
@@ -284,9 +283,9 @@ int main(int argc, char ** argv) {
            }

            // sample the next token
-            id = llama_sampling_sample(ctx_sampling, ctx, NULL, i_batch);
+            id = llama_sampling_sample(smpl, ctx, i_batch);

-            llama_sampling_accept(ctx_sampling, ctx, id, true);
+            llama_sampling_accept(smpl, id, true);

            // print
            {
@@ -361,7 +360,7 @@ int main(int argc, char ** argv) {
                if (v == 0) {
                    // sample from the last level
                    for (int i = 0; i < W; i++) {
-                        tokens_j[N - 2][i] = llama_sampling_sample(ctx_sampling, ctx, NULL, ngrams_cur.size()*(N-1) + W*(N - 2) + i);
+                        tokens_j[N - 2][i] = llama_sampling_sample(smpl, ctx, ngrams_cur.size()*(N-1) + W*(N - 2) + i);
                    }
                } else {
                    for (int i = 0; i < W; i++) {
@@ -468,10 +467,10 @@ int main(int argc, char ** argv) {
    LOG_TEE("n_predict = %d\n", n_predict);
    LOG_TEE("n_accept  = %d\n", n_accept);

-    llama_print_timings(ctx);
+    llama_print_timings(ctx, smpl);

    llama_kv_cache_view_free(&kvc_view);
-    llama_sampling_free(ctx_sampling);
+    llama_sampling_free(smpl);

    llama_batch_free(batch);

--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@@ -3,13 +3,11 @@
 #include "common.h"
 #include "ngram-cache.h"

-#include <cmath>
 #include <cstdint>
 #include <cstdio>
 #include <fstream>
 #include <string>
 #include <vector>
-#include <unordered_map>

 int main(int argc, char ** argv){
    gpt_params params;
@@ -106,7 +104,7 @@ int main(int argc, char ** argv){

    bool has_eos = false;

-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
+    struct llama_sampling * smpl = llama_sampling_init(model, params.sparams);

    std::vector<llama_token> draft;

@@ -130,9 +128,9 @@ int main(int argc, char ** argv){
        int i_dft = 0;
        while (true) {
            // sample from the target model
-            llama_token id = llama_sampling_sample(ctx_sampling, ctx, NULL, i_dft);
+            llama_token id = llama_sampling_sample(smpl, ctx, i_dft);

-            llama_sampling_accept(ctx_sampling, ctx, id, true);
+            llama_sampling_accept(smpl, id, true);

            const std::string token_str = llama_token_to_piece(ctx, id);

@@ -241,9 +239,9 @@ int main(int argc, char ** argv){
    LOG_TEE("accept       = %.3f%%\n", 100.0f * n_accept / n_drafted);

    LOG_TEE("\ntarget:\n");
-    llama_print_timings(ctx);
+    llama_print_timings(ctx, smpl);

-    llama_sampling_free(ctx_sampling);
+    llama_sampling_free(smpl);
    llama_batch_free(batch_tgt);

    llama_free(ctx);
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -33,6 +33,7 @@

 static llama_context           ** g_ctx;
 static llama_model             ** g_model;
+static llama_sampling          ** g_smpl;
 static gpt_params               * g_params;
 static std::vector<llama_token> * g_input_tokens;
 static std::ostringstream       * g_output_ss;
@@ -105,7 +106,7 @@ static void sigint_handler(int signo) {
        } else {
            console::cleanup();
            printf("\n");
-            llama_print_timings(*g_ctx);
+            llama_print_timings(*g_ctx, *g_smpl);
            write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
            _exit(130);
        }
@@ -121,8 +122,7 @@ static void llama_log_callback_logTee(ggml_log_level level, const char * text, v

 static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, std::string role, std::string content) {
    llama_chat_msg new_msg{role, content};
-    auto formatted = llama_chat_format_single(
-        model, g_params->chat_template, chat_msgs, new_msg, role == "user");
+    auto formatted = llama_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, role == "user");
    chat_msgs.push_back({role, content});
    LOG("formatted: %s\n", formatted.c_str());
    return formatted;
@@ -137,7 +137,7 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    llama_sampling_params & sparams = params.sparams;
+    auto & sparams = params.sparams;

 #ifndef LOG_DISABLE_LOGS
    log_set_target(log_filename_generator("main", "log"));
@@ -183,27 +183,23 @@ int main(int argc, char ** argv) {
        LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
    }

-    LOG_TEE("%s: build = %d (%s)\n",      __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
-    LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
+    print_build_info();

-    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = time(NULL);
-    }
-
-    LOG_TEE("%s: seed  = %u\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
+    LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);

    LOG("%s: llama backend init\n", __func__);
    llama_backend_init();
    llama_numa_init(params.numa);

-    llama_model * model;
-    llama_context * ctx;
-    llama_context * ctx_guidance = NULL;
+    llama_model * model = nullptr;
+    llama_context * ctx = nullptr;
+    llama_sampling * smpl = nullptr;
+
    std::vector<llama_chat_msg> chat_msgs;
+
    g_model = &model;
    g_ctx = &ctx;
+    g_smpl = &smpl;

    // load the model and apply lora adapter, if any
    LOG("%s: load the model and apply lora adapter, if any\n", __func__);
@@ -211,16 +207,43 @@ int main(int argc, char ** argv) {

    model = llama_init.model;
    ctx = llama_init.context;
-    if (sparams.cfg_scale > 1.f) {
-        struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
-        ctx_guidance = llama_new_context_with_model(model, lparams);
-    }

    if (model == NULL) {
        LOG_TEE("%s: error: unable to load model\n", __func__);
        return 1;
    }

+    LOG("%s: llama threadpool init = n_threads = %d\n",
+        __func__,
+        (int) params.cpuparams.n_threads
+    );
+    struct ggml_threadpool_params tpp_batch =
+            ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
+    struct ggml_threadpool_params tpp =
+            ggml_threadpool_params_from_cpu_params(params.cpuparams);
+
+    set_process_priority(params.cpuparams.priority);
+
+    struct ggml_threadpool * threadpool_batch = NULL;
+    if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
+        threadpool_batch = ggml_threadpool_new(&tpp_batch);
+        if (!threadpool_batch) {
+            LOG_TEE("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
+            exit(1);
+        }
+
+        // Start the non-batch threadpool in the paused state
+        tpp.paused = true;
+    }
+
+    struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
+    if (!threadpool) {
+        LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
+        exit(1);
+    }
+
+    llama_attach_threadpool(ctx, threadpool, threadpool_batch);
+
    const int n_ctx_train = llama_n_ctx_train(model);
    const int n_ctx = llama_n_ctx(ctx);
    LOG("n_ctx: %d\n", n_ctx);
@@ -267,9 +290,9 @@ int main(int argc, char ** argv) {
        }
    }

-    const bool add_bos = llama_should_add_bos_token(model);
+    const bool add_bos = llama_add_bos_token(model);
    if (!llama_model_has_encoder(model)) {
-        GGML_ASSERT(llama_add_eos_token(model) != 1);
+        GGML_ASSERT(!llama_add_eos_token(model));
    }
    LOG("add_bos: %d\n", add_bos);

@@ -303,24 +326,6 @@ int main(int argc, char ** argv) {
    }

    // Tokenize negative prompt
-    std::vector<llama_token> guidance_inp;
-    int guidance_offset = 0;
-    int original_prompt_len = 0;
-    if (ctx_guidance) {
-        LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
-
-        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true, true);
-        LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
-
-        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true, true);
-        LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
-
-        original_prompt_len = original_inp.size();
-        guidance_offset = (int)guidance_inp.size() - original_prompt_len;
-        LOG("original_prompt_len: %s", log_tostr(original_prompt_len));
-        LOG("guidance_offset:     %s", log_tostr(guidance_offset));
-    }
-
    if ((int) embd_inp.size() > n_ctx - 4) {
        LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
        return 1;
@@ -352,8 +357,8 @@ int main(int argc, char ** argv) {
    }

    LOGLN(
-            "recalculate the cached logits (check): embd_inp.empty() %s, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu, embd_inp.size() %zu",
-            log_tostr(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size(), embd_inp.size());
+            "recalculate the cached logits (check): embd_inp.empty() %s, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu",
+            log_tostr(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size());

    // if we will use the cache for the full prompt without reaching the end of the cache, force
    // reevaluation of the last token to recalculate the cached logits
@@ -387,15 +392,6 @@ int main(int argc, char ** argv) {
            LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
        }

-        if (ctx_guidance) {
-            LOG_TEE("\n");
-            LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
-            LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
-            for (int i = 0; i < (int) guidance_inp.size(); i++) {
-                LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
-            }
-        }
-
        if (params.n_keep > add_bos) {
            LOG_TEE("%s: static prompt based on n_keep: '", __func__);
            for (int i = 0; i < params.n_keep; i++) {
@@ -461,8 +457,8 @@ int main(int argc, char ** argv) {
            }
        }
    }
-    LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
-    LOG_TEE("sampling order: \n%s\n", llama_sampling_order_print(sparams).c_str());
+    LOG_TEE("sampling params: \n%s\n", sparams.print_all().c_str());
+    LOG_TEE("sampling order:  \n%s\n", sparams.print_samplers().c_str());
    LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);

    // group-attention state
@@ -509,7 +505,6 @@ int main(int argc, char ** argv) {
    int n_remain           = params.n_predict;
    int n_consumed         = 0;
    int n_session_consumed = 0;
-    int n_past_guidance    = 0;

    std::vector<int>   input_tokens;  g_input_tokens  = &input_tokens;
    std::vector<int>   output_tokens; g_output_tokens = &output_tokens;
@@ -521,7 +516,6 @@ int main(int argc, char ** argv) {
    display = params.display_prompt;

    std::vector<llama_token> embd;
-    std::vector<llama_token> embd_guidance;

    // tokenized antiprompts
    std::vector<std::vector<llama_token>> antiprompt_ids;
@@ -531,8 +525,8 @@ int main(int argc, char ** argv) {
        antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true));
    }

-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
-    if (!ctx_sampling) {
+    smpl = llama_sampling_init(model, sparams);
+    if (!smpl) {
        fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
        exit(1);
    }
@@ -578,7 +572,7 @@ int main(int argc, char ** argv) {
                // if we run out of context:
                // - take the n_keep first tokens from the original prompt (via n_past)
                // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
-                if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) >= n_ctx) {
+                if (n_past + (int) embd.size() >= n_ctx) {
                    if (params.n_predict == -2) {
                        LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
                        break;
@@ -595,11 +589,7 @@ int main(int argc, char ** argv) {

                    n_past -= n_discard;

-                    if (ctx_guidance) {
-                        n_past_guidance -= n_discard;
-                    }
-
-                    LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);
+                    LOG("after swap: n_past = %d\n", n_past);

                    LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());

@@ -652,46 +642,6 @@ int main(int argc, char ** argv) {
                }
            }

-            // evaluate tokens in batches
-            // embd is typically prepared beforehand to fit within a batch, but not always
-            if (ctx_guidance) {
-                int input_size = 0;
-                llama_token * input_buf = NULL;
-
-                if (n_past_guidance < (int) guidance_inp.size()) {
-                    // Guidance context should have the same data with these modifications:
-                    //
-                    // * Replace the initial prompt
-                    // * Shift everything by guidance_offset
-                    embd_guidance = guidance_inp;
-                    if (embd.begin() + original_prompt_len < embd.end()) {
-                        embd_guidance.insert(
-                            embd_guidance.end(),
-                            embd.begin() + original_prompt_len,
-                            embd.end()
-                        );
-                    }
-
-                    input_buf  = embd_guidance.data();
-                    input_size = embd_guidance.size();
-
-                    LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str());
-                } else {
-                    input_buf  = embd.data();
-                    input_size = embd.size();
-                }
-
-                for (int i = 0; i < input_size; i += params.n_batch) {
-                    int n_eval = std::min(input_size - i, params.n_batch);
-                    if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0))) {
-                        LOG_TEE("%s : failed to eval\n", __func__);
-                        return 1;
-                    }
-
-                    n_past_guidance += n_eval;
-                }
-            }
-
            for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
                int n_eval = (int) embd.size() - i;
                if (n_eval > params.n_batch) {
@@ -721,7 +671,6 @@ int main(int argc, char ** argv) {
        }

        embd.clear();
-        embd_guidance.clear();

        if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
            // optionally save the session on first sample (for faster prompt loading next time)
@@ -732,11 +681,11 @@ int main(int argc, char ** argv) {
                LOG("saved session to %s\n", path_session.c_str());
            }

-            const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
+            const llama_token id = llama_sampling_sample(smpl, ctx, -1);

-            llama_sampling_accept(ctx_sampling, ctx, id, /* apply_grammar= */ true);
+            llama_sampling_accept(smpl, id, /* apply_grammar= */ true);

-            LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
+            // LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, smpl->prev.to_vector()).c_str());

            embd.push_back(id);

@@ -755,7 +704,7 @@ int main(int argc, char ** argv) {

                // push the prompt in the sampling context in order to apply repetition penalties later
                // for the prompt, we don't apply grammar rules
-                llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], /* apply_grammar= */ false);
+                llama_sampling_accept(smpl, embd_inp[n_consumed], /* apply_grammar= */ false);

                ++n_consumed;
                if ((int) embd.size() >= params.n_batch) {
@@ -798,7 +747,7 @@ int main(int argc, char ** argv) {
            // check for reverse prompt in the last n_prev tokens
            if (!params.antiprompt.empty()) {
                const int n_prev = 32;
-                const std::string last_output = llama_sampling_prev_str(ctx_sampling, ctx, n_prev);
+                const std::string last_output = llama_sampling_prev_str(smpl, ctx, n_prev);

                is_antiprompt = false;
                // Check if each of the reverse prompts appears at the end of the output.
@@ -820,7 +769,7 @@ int main(int argc, char ** argv) {
                }

                // check for reverse prompt using special tokens
-                llama_token last_token = llama_sampling_last(ctx_sampling);
+                llama_token last_token = llama_sampling_last(smpl);
                for (std::vector<llama_token> ids : antiprompt_ids) {
                    if (ids.size() == 1 && last_token == ids[0]) {
                        if (params.interactive) {
@@ -837,7 +786,7 @@ int main(int argc, char ** argv) {
            }

            // deal with end of generation tokens in interactive mode
-            if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) {
+            if (llama_token_is_eog(model, llama_sampling_last(smpl))) {
                LOG("found an EOG token\n");

                if (params.interactive) {
@@ -858,7 +807,7 @@ int main(int argc, char ** argv) {

            // if current token is not EOG, we add it to current assistant message
            if (params.conversation) {
-                auto id = llama_sampling_last(ctx_sampling);
+                auto id = llama_sampling_last(smpl);
                assistant_ss << llama_token_to_piece(ctx, id, false);
            }

@@ -954,7 +903,7 @@ int main(int argc, char ** argv) {

            if (n_past > 0) {
                if (is_interacting) {
-                    llama_sampling_reset(ctx_sampling);
+                    llama_sampling_reset(smpl);
                }
                is_interacting = false;
            }
@@ -979,16 +928,18 @@ int main(int argc, char ** argv) {
        llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
    }

-    llama_print_timings(ctx);
+    llama_print_timings(ctx, smpl);
    write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);

-    if (ctx_guidance) { llama_free(ctx_guidance); }
    llama_free(ctx);
    llama_free_model(model);

-    llama_sampling_free(ctx_sampling);
+    llama_sampling_free(smpl);
    llama_backend_free();

+    ggml_threadpool_free(threadpool);
+    ggml_threadpool_free(threadpool_batch);
+
 #ifndef LOG_DISABLE_LOGS
    LOG_TEE("Log end\n");
 #endif // LOG_DISABLE_LOGS
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -50,8 +50,8 @@ static std::vector<std::string> k_prompts = {

 struct client {
    ~client() {
-        if (ctx_sampling) {
-            llama_sampling_free(ctx_sampling);
+        if (smpl) {
+            llama_sampling_free(smpl);
        }
    }

@@ -72,7 +72,7 @@ struct client {
    std::string prompt;
    std::string response;

-    struct llama_sampling_context * ctx_sampling = nullptr;
+    struct llama_sampling * smpl = nullptr;
 };

 static void print_date_time() {
@@ -161,7 +161,7 @@ int main(int argc, char ** argv) {
    for (size_t i = 0; i < clients.size(); ++i) {
        auto & client = clients[i];
        client.id = i;
-        client.ctx_sampling = llama_sampling_init(params.sparams);
+        client.smpl = llama_sampling_init(model, params.sparams);
    }

    std::vector<llama_token> tokens_system;
@@ -253,7 +253,7 @@ int main(int argc, char ** argv) {
                    client.prompt   = client.input + "\nAssistant:";
                    client.response = "";

-                    llama_sampling_reset(client.ctx_sampling);
+                    llama_sampling_reset(client.smpl);

                    // do not prepend BOS because we have a system prompt!
                    std::vector<llama_token> tokens_prompt;
@@ -341,9 +341,9 @@ int main(int argc, char ** argv) {
                //printf("client %d, seq %d, token %d, pos %d, batch %d\n",
                //        client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch);

-                const llama_token id = llama_sampling_sample(client.ctx_sampling, ctx, NULL, client.i_batch - i);
+                const llama_token id = llama_sampling_sample(client.smpl, ctx, client.i_batch - i);

-                llama_sampling_accept(client.ctx_sampling, ctx, id, true);
+                llama_sampling_accept(client.smpl, id, true);

                if (client.n_decoded == 1) {
                    // start measuring generation time after the first token to make sure all concurrent clients
@@ -371,7 +371,7 @@ int main(int argc, char ** argv) {
                    }

                    // delete only the generated part of the sequence, i.e. keep the system prompt in the cache
-                    llama_kv_cache_seq_rm(ctx, client.id + 1, -1, -1);
+                    llama_kv_cache_seq_rm(ctx,    client.id + 1, -1, -1);
                    llama_kv_cache_seq_cp(ctx, 0, client.id + 1, -1, -1);

                    const auto t_main_end = ggml_time_us();
@@ -413,7 +413,8 @@ int main(int argc, char ** argv) {

    LOG_TEE("\n");

-    llama_print_timings(ctx);
+    // TODO: print sampling/grammar timings for all clients
+    llama_print_timings(ctx, nullptr);

    llama_batch_free(batch);

--- a/examples/passkey/passkey.cpp
+++ b/examples/passkey/passkey.cpp
@@ -26,8 +26,6 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    srand(params.seed == LLAMA_DEFAULT_SEED ? time(NULL) : params.seed);
-
    int n_junk = params.n_junk;
    int n_keep = params.n_keep;
    int n_grp  = params.grp_attn_n;
@@ -80,12 +78,13 @@ int main(int argc, char ** argv) {
    GGML_ASSERT(ctx_params.n_batch % n_grp == 0 && "n_batch must be divisible by n_grp");

    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
-
    if (ctx == NULL) {
        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
        return 1;
    }

+    llama_sampling * smpl = llama_sampling_init(model, llama_sampling_default_params());
+
    // tokenize the prompt
    std::vector<llama_token> tokens_list;
    tokens_list = ::llama_tokenize(ctx, params.prompt, true);
@@ -217,20 +216,12 @@ int main(int argc, char ** argv) {
    while (n_cur <= n_len) {
        // sample the next token
        {
-            auto   n_vocab = llama_n_vocab(model);
-            auto * logits  = llama_get_logits_ith(ctx, batch.n_tokens - 1);
+            const auto * logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);

-            std::vector<llama_token_data> candidates;
-            candidates.reserve(n_vocab);
-
-            for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-                candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
-            }
-
-            llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+            llama_sampling_set_logits(smpl, logits);

            // sample the most likely token
-            const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
+            const llama_token new_token_id = llama_sampling_sample_greedy(smpl, nullptr);

            // is it an end of generation?
            if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
@@ -267,12 +258,13 @@ int main(int argc, char ** argv) {
    LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
            __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));

-    llama_print_timings(ctx);
+    llama_print_timings(ctx, nullptr);

    fprintf(stderr, "\n");

    llama_batch_free(batch);

+    llama_sampling_free(smpl);
    llama_free(ctx);
    llama_free_model(model);

--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -340,8 +340,8 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
    // Output: `perplexity: 13.5106 [114/114]`
    // BOS tokens will be added for each chunk before eval

-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
-    GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
+    const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
+    GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));

    fprintf(stderr, "%s: tokenizing the input ..\n", __func__);

@@ -480,8 +480,8 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
    // Output: `perplexity: 13.5106 [114/114]`
    // BOS tokens will be added for each chunk before eval

-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
-    GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
+    const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
+    GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));

    std::ofstream logits_stream;
    if (!params.logits_file.empty()) {
@@ -1733,8 +1733,8 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
    const int n_batch = params.n_batch;
    const int num_batches = (n_ctx + n_batch - 1)/n_batch;
    const int nv = 2*((n_vocab + 1)/2) + 4;
-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
-    GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
+    const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
+    GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));

    std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv);
    std::vector<float>    kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
@@ -2007,13 +2007,7 @@ int main(int argc, char ** argv) {

    print_build_info();

-    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = time(NULL);
-    }
-
-    fprintf(stderr, "%s: seed  = %u\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
+    LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);

    llama_backend_init();
    llama_numa_init(params.numa);
@@ -2054,7 +2048,7 @@ int main(int argc, char ** argv) {
        results = perplexity(ctx, params, n_ctx);
    }

-    llama_print_timings(ctx);
+    llama_print_timings(ctx, nullptr);
    write_logfile(ctx, params, model, results);

    llama_free(ctx);
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -1,7 +1,7 @@
-#define LLAMA_API_INTERNAL
 #include "common.h"
 #include "ggml.h"
 #include "llama.h"
+#include "llama-impl.h"

 #include <algorithm>
 #include <cassert>
@@ -319,8 +319,7 @@ int main(int argc, char ** argv) {
        }

        auto cparams = llama_context_default_params();
-        cparams.n_ctx      = 256;
-        cparams.seed       = 1;
+        cparams.n_ctx = 256;

        ctx = llama_new_context_with_model(model, cparams);

--- a/examples/quantize/README.md
+++ b/examples/quantize/README.md
@@ -34,7 +34,7 @@ Run the quantized model:

 ```bash
 # start inference on a gguf model
-./llama-cli -m ./models/mymodel/ggml-model-Q4_K_M.gguf -n 128
+./llama-cli -m ./models/mymodel/ggml-model-Q4_K_M.gguf -cnv -p "You are a helpful assistant"
 ```

 When running the larger models, make sure you have enough disk space to store all the intermediate files.
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -104,7 +104,7 @@ static void usage(const char * executable) {
    printf("  --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
    printf("  --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
    printf("  --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
-    printf("  --keep-split: will generate quatized model in the same shards as input");
+    printf("  --keep-split: will generate quantized model in the same shards as input\n");
    printf("  --override-kv KEY=TYPE:VALUE\n");
    printf("      Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
    printf("Note: --include-weights and --exclude-weights cannot be used together\n");
--- a/examples/retrieval/retrieval.cpp
+++ b/examples/retrieval/retrieval.cpp
@@ -253,6 +253,8 @@ int main(int argc, char ** argv) {
        chunks[i].tokens.clear();
    }

+    struct llama_batch query_batch = llama_batch_init(n_batch, 0, 1);
+
    // start loop, receive query and return top k similar chunks based on cosine similarity
    std::string query;
    while (true) {
@@ -260,7 +262,6 @@ int main(int argc, char ** argv) {
        std::getline(std::cin, query);
        std::vector<int32_t> query_tokens = llama_tokenize(ctx, query, true);

-        struct llama_batch query_batch = llama_batch_init(n_batch, 0, 1);
        batch_add_seq(query_batch, query_tokens, 0);

        std::vector<float> query_emb(n_embd, 0);
@@ -293,7 +294,8 @@ int main(int argc, char ** argv) {
    }

    // clean up
-    llama_print_timings(ctx);
+    llama_print_timings(ctx, nullptr);
+    llama_batch_free(query_batch);
    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -3,12 +3,12 @@

 #include <vector>
 #include <cstdio>
-#include <chrono>

 int main(int argc, char ** argv) {
    gpt_params params;

    params.prompt = "The quick brown fox";
+    params.sparams.seed = 1234;

    if (!gpt_params_parse(argc, argv, params)) {
        gpt_params_print_usage(argc, argv, params);
@@ -38,6 +38,11 @@ int main(int argc, char ** argv) {
        return 1;
    }

+    llama_sampling_params sparams = llama_sampling_default_params();
+    sparams.seed = params.sparams.seed;
+
+    llama_sampling * smpl = llama_sampling_init(model, sparams);
+
    // tokenize prompt
    auto tokens = llama_tokenize(ctx, params.prompt, true);

@@ -64,16 +69,11 @@ int main(int argc, char ** argv) {
    printf("\nfirst run: %s", params.prompt.c_str());

    for (auto i = 0; i < params.n_predict; i++) {
-        auto * logits = llama_get_logits(ctx);
-        auto n_vocab = llama_n_vocab(model);
+        const auto * logits = llama_get_logits(ctx);

-        std::vector<llama_token_data> candidates;
-        candidates.reserve(n_vocab);
-        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-            candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
-        }
-        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-        auto next_token = llama_sample_token(ctx, &candidates_p);
+        llama_sampling_set_logits(smpl, logits);
+
+        auto next_token = llama_sampling_sample_dist(smpl, nullptr);
        auto next_token_str = llama_token_to_piece(ctx, next_token);

        printf("%s", next_token_str.c_str());
@@ -96,6 +96,8 @@ int main(int argc, char ** argv) {
    // make new context
    auto * ctx2 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));

+    llama_sampling * smpl2 = llama_sampling_init(model, sparams);
+
    printf("\nsecond run: %s", params.prompt.c_str());

    // load state (rng, logits, embedding and kv_cache) from file
@@ -124,15 +126,11 @@ int main(int argc, char ** argv) {

    // second run
    for (auto i = 0; i < params.n_predict; i++) {
-        auto * logits = llama_get_logits(ctx2);
-        auto n_vocab = llama_n_vocab(model);
-        std::vector<llama_token_data> candidates;
-        candidates.reserve(n_vocab);
-        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-            candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
-        }
-        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-        auto next_token = llama_sample_token(ctx2, &candidates_p);
+        const auto * logits = llama_get_logits(ctx2);
+
+        llama_sampling_set_logits(smpl2, logits);
+
+        auto next_token = llama_sampling_sample_dist(smpl2, nullptr);
        auto next_token_str = llama_token_to_piece(ctx2, next_token);

        printf("%s", next_token_str.c_str());
@@ -157,7 +155,9 @@ int main(int argc, char ** argv) {
    }

    // make new context
-    auto* ctx3 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));
+    auto * ctx3 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));
+
+    llama_sampling * smpl3 = llama_sampling_init(model, sparams);

    printf("\nsingle seq run: %s", params.prompt.c_str());

@@ -215,15 +215,11 @@ int main(int argc, char ** argv) {

    // third run with seq 1 instead of 0
    for (auto i = 0; i < params.n_predict; i++) {
-        auto * logits = llama_get_logits(ctx3);
-        auto n_vocab = llama_n_vocab(model);
-        std::vector<llama_token_data> candidates;
-        candidates.reserve(n_vocab);
-        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-            candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
-        }
-        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-        auto next_token = llama_sample_token(ctx3, &candidates_p);
+        const auto * logits = llama_get_logits(ctx3);
+
+        llama_sampling_set_logits(smpl3, logits);
+
+        auto next_token = llama_sampling_sample_dist(smpl3, nullptr);
        auto next_token_str = llama_token_to_piece(ctx3, next_token);

        printf("%s", next_token_str.c_str());
@@ -240,6 +236,10 @@ int main(int argc, char ** argv) {

    printf("\n");

+    llama_sampling_free(smpl);
+    llama_sampling_free(smpl2);
+    llama_sampling_free(smpl3);
+
    llama_free(ctx3);
    llama_free_model(model);

--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -247,6 +247,51 @@ logging:
         --log-append             Don't truncate the old log file.
 ```

+Available environment variables (if specified, these variables will override parameters specified in arguments):
+
+- `LLAMA_CACHE`: cache directory, used by `--hf-repo`
+- `HF_TOKEN`: Hugging Face access token, used when accessing a gated model with `--hf-repo`
+- `LLAMA_ARG_MODEL`: equivalent to `-m`
+- `LLAMA_ARG_MODEL_URL`: equivalent to `-mu`
+- `LLAMA_ARG_MODEL_ALIAS`: equivalent to `-a`
+- `LLAMA_ARG_HF_REPO`: equivalent to `--hf-repo`
+- `LLAMA_ARG_HF_FILE`: equivalent to `--hf-file`
+- `LLAMA_ARG_THREADS`: equivalent to `-t`
+- `LLAMA_ARG_CTX_SIZE`: equivalent to `-c`
+- `LLAMA_ARG_N_PARALLEL`: equivalent to `-np`
+- `LLAMA_ARG_BATCH`: equivalent to `-b`
+- `LLAMA_ARG_UBATCH`: equivalent to `-ub`
+- `LLAMA_ARG_N_GPU_LAYERS`: equivalent to `-ngl`
+- `LLAMA_ARG_THREADS_HTTP`: equivalent to `--threads-http`
+- `LLAMA_ARG_CHAT_TEMPLATE`: equivalent to `--chat-template`
+- `LLAMA_ARG_N_PREDICT`: equivalent to `-n`
+- `LLAMA_ARG_ENDPOINT_METRICS`: if set to `1`, it will enable metrics endpoint (equivalent to `--metrics`)
+- `LLAMA_ARG_ENDPOINT_SLOTS`: if set to `0`, it will **disable** slots endpoint (equivalent to `--no-slots`). This feature is enabled by default.
+- `LLAMA_ARG_EMBEDDINGS`: if set to `1`, it will enable embeddings endpoint (equivalent to `--embeddings`)
+- `LLAMA_ARG_FLASH_ATTN`: if set to `1`, it will enable flash attention (equivalent to `-fa`)
+- `LLAMA_ARG_CONT_BATCHING`: if set to `0`, it will **disable** continuous batching (equivalent to `--no-cont-batching`). This feature is enabled by default.
+- `LLAMA_ARG_DEFRAG_THOLD`: equivalent to `-dt`
+- `LLAMA_ARG_HOST`: equivalent to `--host`
+- `LLAMA_ARG_PORT`: equivalent to `--port`
+
+Example usage of docker compose with environment variables:
+
+```yml
+services:
+  llamacpp-server:
+    image: ghcr.io/ggerganov/llama.cpp:server
+    ports:
+      - 8080:8080
+    volumes:
+      - ./models:/models
+    environment:
+      # alternatively, you can use "LLAMA_ARG_MODEL_URL" to download the model
+      LLAMA_ARG_MODEL: /models/my_model.gguf
+      LLAMA_ARG_CTX_SIZE: 4096
+      LLAMA_ARG_N_PARALLEL: 2
+      LLAMA_ARG_ENDPOINT_METRICS: 1  # to disable, either remove or set to 0
+      LLAMA_ARG_PORT: 8080
+```

 ## Build

@@ -368,15 +413,16 @@ node index.js

 ## API Endpoints

-### GET `/health`: Returns the current state of the server
+### GET `/health`: Returns health check result

-  - 503 -> `{"status": "loading model"}` if the model is still being loaded.
-  - 500 -> `{"status": "error"}` if the model failed to load.
-  - 200 -> `{"status": "ok", "slots_idle": 1, "slots_processing": 2 }` if the model is successfully loaded and the server is ready for further requests mentioned below.
-  - 200 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slots are currently available.
-  - 503 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if the query parameter `fail_on_no_slot` is provided and no slots are currently available.
+**Response format**

-  If the query parameter `include_slots` is passed, `slots` field will contain internal slots data except if `--slots-endpoint-disable` is set.
+- HTTP status code 503
+  - Body: `{"error": {"code": 503, "message": "Loading model", "type": "unavailable_error"}}`
+  - Explanation: the model is still being loaded.
+- HTTP status code 200
+  - Body: `{"status": "ok" }`
+  - Explanation: the model is successfully loaded and the server is ready.

 ### POST `/completion`: Given a `prompt`, it returns the predicted completion.

@@ -424,8 +470,6 @@ node index.js

    `frequency_penalty`: Repeat alpha frequency penalty. Default: `0.0`, which is disabled.

-    `penalty_prompt`: This will replace the `prompt` for the purpose of the penalty evaluation. Can be either `null`, a string or an array of numbers representing tokens. Default: `null`, which is to use the original `prompt`.
-
    `mirostat`: Enable Mirostat sampling, controlling perplexity during text generation. Default: `0`, where `0` is disabled, `1` is Mirostat, and `2` is Mirostat 2.0.

    `mirostat_tau`: Set the Mirostat target entropy, parameter tau. Default: `5.0`
@@ -639,10 +683,16 @@ Given a ChatML-formatted json description in `messages`, it returns the predicte
    }'
    ```

-### GET `/slots`: Returns the current slots processing state. Can be disabled with `--slots-endpoint-disable`.
+### GET `/slots`: Returns the current slots processing state
+
+This endpoint can be disabled with `--no-slots`.
+
+If query param `?fail_on_no_slot=1` is set, this endpoint will respond with status code 503 if there are no available slots.

 **Response format**

+Example:
+
 ```json
 [
    {
@@ -672,7 +722,6 @@ Given a ChatML-formatted json description in `messages`, it returns the predicte
            "stopping_word": ""
        },
        "penalize_nl": true,
-        "penalty_prompt_tokens": [],
        "presence_penalty": 0.0,
        "prompt": "Say hello to llama.cpp",
        "repeat_last_n": 64,
@@ -696,13 +745,18 @@ Given a ChatML-formatted json description in `messages`, it returns the predicte
        "tfs_z": 1.0,
        "top_k": 40,
        "top_p": 0.949999988079071,
-        "typical_p": 1.0,
-        "use_penalty_prompt_tokens": false
+        "typical_p": 1.0
    }
 ]
 ```

-### GET `/metrics`: Prometheus compatible metrics exporter endpoint if `--metrics` is enabled:
+Possible values for `slot[i].state` are:
+- `0`: SLOT_STATE_IDLE
+- `1`: SLOT_STATE_PROCESSING
+
+### GET `/metrics`: Prometheus compatible metrics exporter
+
+This endpoint is only accessible if `--metrics` is set.

 Available metrics:
 - `llamacpp:prompt_tokens_total`: Number of prompt tokens processed.
@@ -767,6 +821,10 @@ Available metrics:

 ### GET `/lora-adapters`: Get list of all LoRA adapters

+This endpoint returns the loaded LoRA adapters. You can add adapters using `--lora` when starting the server, for example: `--lora my_adapter_1.gguf --lora my_adapter_2.gguf ...`
+
+By default, all adapters will be loaded with scale set to 1. To initialize all adapters with scale set to 0 instead, add `--lora-init-without-apply`.
+
 If an adapter is disabled, the scale will be set to 0.

 **Response format**
--- a/examples/server/public/index.js
+++ b/examples/server/public/index.js
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -23,6 +23,8 @@ from prometheus_client import parser

 # pyright: reportRedeclaration=false

+DEFAULT_TIMEOUT_SECONDS = aiohttp.ClientTimeout(total=600)
+
@step("a server listening on {server_fqdn}:{server_port}")
 def step_server_config(context, server_fqdn: str, server_port: str):
    context.server_fqdn = server_fqdn
@@ -205,27 +207,20 @@ def step_start_server(context):
 async def step_wait_for_the_server_to_be_started(context, expecting_status: Literal['healthy', 'ready', 'idle', 'busy'] | str):
    match expecting_status:
        case 'healthy':
-            await wait_for_health_status(context, context.base_url, 200, 'ok',
-                                         timeout=30)
+            await wait_for_slots_status(context, context.base_url, 200,
+                                        timeout=30)

        case 'ready' | 'idle':
-            await wait_for_health_status(context, context.base_url, 200, 'ok',
-                                         timeout=30,
-                                         params={'fail_on_no_slot': 0, 'include_slots': 0},
-                                         slots_idle=context.n_slots,
-                                         slots_processing=0,
-                                         expected_slots=[{'id': slot_id, 'state': 0}
-                                                         for slot_id in
-                                                         range(context.n_slots if context.n_slots else 1)])
+            await wait_for_slots_status(context, context.base_url, 200,
+                                        timeout=30,
+                                        params={'fail_on_no_slot': 1},
+                                        slots_idle=context.n_slots,
+                                        slots_processing=0)
        case 'busy':
-            await wait_for_health_status(context, context.base_url, 503,
-                                         'no slot available',
-                                         params={'fail_on_no_slot': 0, 'include_slots': 0},
-                                         slots_idle=0,
-                                         slots_processing=context.n_slots,
-                                         expected_slots=[{'id': slot_id, 'state': 1}
-                                                         for slot_id in
-                                                         range(context.n_slots if context.n_slots else 1)])
+            await wait_for_slots_status(context, context.base_url, 503,
+                                        params={'fail_on_no_slot': 1},
+                                        slots_idle=0,
+                                        slots_processing=context.n_slots)
        case _:
            assert False, "unknown status"

@@ -696,7 +691,7 @@ def step_tokenize_set_add_special(context):
@async_run_until_complete
 async def step_tokenize(context):
    context.tokenized_text = context_text(context)
-    async with aiohttp.ClientSession() as session:
+    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
        tokenize_args = {
            "content": context.tokenized_text,
        }
@@ -713,7 +708,7 @@ async def step_tokenize(context):
@async_run_until_complete
 async def step_detokenize(context):
    assert len(context.tokens) > 0
-    async with aiohttp.ClientSession() as session:
+    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
        async with session.post(f'{context.base_url}/detokenize',
                                json={
                                    "tokens": context.tokens,
@@ -742,7 +737,7 @@ def step_strings_for_tokenization(context):
@step('an OPTIONS request is sent from {origin}')
@async_run_until_complete
 async def step_options_request(context, origin):
-    async with aiohttp.ClientSession() as session:
+    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
        headers = {'Authorization': f'Bearer {context.user_api_key}', 'Origin': origin}
        async with session.options(f'{context.base_url}/v1/chat/completions',
                                    headers=headers) as response:
@@ -758,7 +753,7 @@ def step_check_options_header_value(context, cors_header, cors_header_value):
@step('prometheus metrics are exposed')
@async_run_until_complete
 async def step_prometheus_metrics_exported(context):
-    async with aiohttp.ClientSession() as session:
+    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
        async with await session.get(f'{context.base_url}/metrics') as metrics_response:
            assert metrics_response.status == 200
            assert metrics_response.headers['Content-Type'] == "text/plain; version=0.0.4"
@@ -825,13 +820,13 @@ async def concurrent_requests(context, f_completion, *args, **kwargs):
    for prompt_no in range(context.n_prompts):
        shifted_args = [context.prompts.pop(), seeds[prompt_no], *args]
        context.concurrent_tasks.append(asyncio.create_task(f_completion(*shifted_args, **kwargs)))
-    await asyncio.sleep(0.1)
+    await asyncio.sleep(0.01)


@step('the slot {slot_id:d} is saved with filename "{filename}"')
@async_run_until_complete
 async def step_save_slot(context, slot_id, filename):
-    async with aiohttp.ClientSession() as session:
+    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
        async with session.post(f'{context.base_url}/slots/{slot_id}?action=save',
                                json={"filename": filename},
                                headers={"Content-Type": "application/json"}) as response:
@@ -841,7 +836,7 @@ async def step_save_slot(context, slot_id, filename):
@step('the slot {slot_id:d} is restored with filename "{filename}"')
@async_run_until_complete
 async def step_restore_slot(context, slot_id, filename):
-    async with aiohttp.ClientSession() as session:
+    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
        async with session.post(f'{context.base_url}/slots/{slot_id}?action=restore',
                                json={"filename": filename},
                                headers={"Content-Type": "application/json"}) as response:
@@ -851,7 +846,7 @@ async def step_restore_slot(context, slot_id, filename):
@step('the slot {slot_id:d} is erased')
@async_run_until_complete
 async def step_erase_slot(context, slot_id):
-    async with aiohttp.ClientSession() as session:
+    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
        async with session.post(f'{context.base_url}/slots/{slot_id}?action=erase',
                                headers={"Content-Type": "application/json"}) as response:
            context.response = response
@@ -860,7 +855,7 @@ async def step_erase_slot(context, slot_id):
@step('switch {on_or_off} lora adapter {lora_id:d}')
@async_run_until_complete
 async def toggle_lora_adapter(context, on_or_off: str, lora_id: int):
-    async with aiohttp.ClientSession() as session:
+    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
        async with session.post(f'{context.base_url}/lora-adapters',
                                json=[{'id': lora_id, 'scale': 1 if on_or_off == 'on' else 0}],
                                headers={"Content-Type": "application/json"}) as response:
@@ -896,7 +891,7 @@ async def request_completion(prompt,
            print(f"Set user_api_key: {user_api_key}")
        headers['Authorization'] = f'Bearer {user_api_key}'

-    async with aiohttp.ClientSession() as session:
+    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
        async with session.post(f'{base_url}/completion',
                                json={
                                    "input_prefix": prompt_prefix,
@@ -909,8 +904,7 @@ async def request_completion(prompt,
                                    "temperature": temperature if temperature is not None else 0.8,
                                    "n_probs": 2,
                                },
-                                headers=headers,
-                                timeout=3600) as response:
+                                headers=headers) as response:
            if expect_api_error is None or not expect_api_error:
                assert response.status == 200
                assert response.headers['Access-Control-Allow-Origin'] == origin
@@ -968,7 +962,7 @@ async def oai_chat_completions(user_prompt,
    if async_client:
        origin = 'llama.cpp'
        headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin}
-        async with aiohttp.ClientSession() as session:
+        async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
            async with session.post(f'{base_url}{base_path}',
                                    json=payload,
                                    headers=headers) as response:
@@ -1055,7 +1049,7 @@ async def oai_chat_completions(user_prompt,


 async def request_embedding(content, seed, base_url=None) -> list[list[float]]:
-    async with aiohttp.ClientSession() as session:
+    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
        async with session.post(f'{base_url}/embedding',
                                json={
                                    "content": content,
@@ -1075,14 +1069,13 @@ async def request_oai_embeddings(input, seed,
        headers=[]
        if user_api_key is not None:
            headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin}
-        async with aiohttp.ClientSession() as session:
+        async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
            async with session.post(f'{base_url}/v1/embeddings',
                                    json={
                                        "input": input,
                                        "model": model,
                                    },
-                                    headers=headers,
-                                    timeout=3600) as response:
+                                    headers=headers) as response:
                assert response.status == 200, f"received status code not expected: {response.status}"
                assert response.headers['Access-Control-Allow-Origin'] == origin
                assert response.headers['Content-Type'] == "application/json; charset=utf-8"
@@ -1187,44 +1180,35 @@ async def gather_tasks_results(context):
    return n_completions


-async def wait_for_health_status(context,
-                                 base_url,
-                                 expected_http_status_code,
-                                 expected_health_status,
-                                 timeout=3,
-                                 params=None,
-                                 slots_idle=None,
-                                 slots_processing=None,
-                                 expected_slots=None):
+async def wait_for_slots_status(context,
+                                base_url,
+                                expected_http_status_code,
+                                timeout=3,
+                                params=None,
+                                slots_idle=None,
+                                slots_processing=None):
    if context.debug:
-        print(f"Starting checking for health for expected_health_status={expected_health_status}")
+        print(f"Starting checking for health for expected_http_status_code={expected_http_status_code}")
    interval = 0.5
    counter = 0
    if 'GITHUB_ACTIONS' in os.environ:
        timeout *= 2

-    async with aiohttp.ClientSession() as session:
+    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
        while True:
-            async with await session.get(f'{base_url}/health', params=params) as health_response:
-                status_code = health_response.status
-                health = await health_response.json()
+            async with await session.get(f'{base_url}/slots', params=params) as slots_response:
+                status_code = slots_response.status
+                slots = await slots_response.json()
                if context.debug:
-                    print(f"HEALTH - response for expected health status='{expected_health_status}' on "
-                          f"'{base_url}/health'?{params} is {health}\n")
-                if (status_code == expected_http_status_code
-                        and health['status'] == expected_health_status
-                        and (slots_idle is None or health['slots_idle'] == slots_idle)
-                        and (slots_processing is None or health['slots_processing'] == slots_processing)):
-                    if expected_slots is not None:
-                        assert_slots_status(health['slots'], expected_slots)
-                    return
-                if (status_code == expected_http_status_code
-                        and health['status'] == expected_health_status
-                        and (slots_idle is None or health['slots_idle'] == slots_idle)
-                        and (slots_processing is None or health['slots_processing'] == slots_processing)):
-                    if expected_slots is not None:
-                        assert_slots_status(health['slots'], expected_slots)
+                    print(f"slots responses {slots}\n")
+                if status_code == 503 and status_code == expected_http_status_code:
                    return
+                if status_code == 200 and status_code == expected_http_status_code:
+                    n_slots_idle = sum(1 if slot["state"] == 0 else 0 for slot in slots)
+                    n_slots_processing = sum(1 if slot["state"] != 0 else 0 for slot in slots)
+                    if ((slots_idle is None or slots_idle == n_slots_idle)
+                        and (slots_processing is None or slots_processing == n_slots_processing)):
+                        return
            await asyncio.sleep(interval)

            counter += interval
@@ -1238,7 +1222,7 @@ async def wait_for_health_status(context,
                        if n_completions > 0:
                            return

-                assert False, f'{expected_health_status} timeout exceeded {counter}s>={timeout}'
+                assert False, f'slots check timeout exceeded {counter}s>={timeout}'


 def assert_embeddings(embeddings):
@@ -1253,7 +1237,7 @@ def assert_embeddings(embeddings):


 async def request_slots_status(context, expected_slots):
-    async with aiohttp.ClientSession() as session:
+    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
        async with await session.get(f'{context.base_url}/slots') as slots_response:
            assert slots_response.status == 200
            slots = await slots_response.json()
--- a/examples/server/tests/features/wrong_usages.feature
+++ b/examples/server/tests/features/wrong_usages.feature
@@ -8,9 +8,12 @@ Feature: Wrong usage of llama.cpp server
  Scenario: Infinite loop
    Given a server listening on localhost:8080
    And   a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
+    And   42 as server seed
+    And   2048 KV cache size
    # Uncomment below to fix the issue
    #And   64 server max tokens to predict
    Then  the server is starting
+    Then  the server is healthy
    Given a prompt:
      """
      Go to: infinite loop
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -3,6 +3,14 @@
 #include "llama.h"
 #include "common.h"

+#ifndef NDEBUG
+// crash the server in debug mode, otherwise send an http 500 error
+#define CPPHTTPLIB_NO_EXCEPTIONS 1
+#endif
+// increase max payload length to allow use of larger context size
+#define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
+#include "httplib.h"
+
 // Change JSON_ASSERT from assert() to GGML_ASSERT:
 #define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"
@@ -279,6 +287,18 @@ static size_t find_partial_stop_string(const std::string &stop, const std::strin
    return std::string::npos;
 }

+static bool json_is_array_of_numbers(const json & data) { // const-ref: do not deep-copy the JSON value on every call
+    if (data.is_array()) {
+        for (const auto & e : data) {
+            if (!e.is_number()) {
+                return false; // found a non-numeric element
+            }
+        }
+        return true; // every element is a number (an empty array also qualifies)
+    }
+    return false; // not an array at all
+}
+
 // TODO: reuse llama_detokenize
 template <class Iter>
 static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
@@ -343,6 +363,19 @@ static json probs_vector_to_json(const llama_context * ctx, const std::vector<co
    return out;
 }

+// send one server-sent event frame "<event>: <json>\n\n" to the sink; returns the result of sink.write()
+static bool server_sent_event(httplib::DataSink & sink, const char * event, const json & data) {
+    const std::string str =
+        std::string(event) + ": " +
+        data.dump(-1, ' ', false, json::error_handler_t::replace) + "\n\n"; // replace: substitute invalid UTF-8 instead of throwing
+
+    LOG_VERBOSE("data stream", {
+        { "to_send", str }
+    });
+
+    return sink.write(str.c_str(), str.size());
+}
+
 //
 // OAI utils
 //
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -55,6 +55,8 @@ int main(int argc, char ** argv) {
        return 1;
    }

+    llama_sampling * smpl = llama_sampling_init(model, llama_sampling_default_params());
+
    // tokenize the prompt

    std::vector<llama_token> tokens_list;
@@ -110,20 +112,12 @@ int main(int argc, char ** argv) {
    while (n_cur <= n_predict) {
        // sample the next token
        {
-            auto   n_vocab = llama_n_vocab(model);
-            auto * logits  = llama_get_logits_ith(ctx, batch.n_tokens - 1);
+            const auto * logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);

-            std::vector<llama_token_data> candidates;
-            candidates.reserve(n_vocab);
-
-            for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-                candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
-            }
-
-            llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+            llama_sampling_set_logits(smpl, logits);

            // sample the most likely token
-            const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
+            const llama_token new_token_id = llama_sampling_sample_greedy(smpl, nullptr);

            // is it an end of generation?
            if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
@@ -160,12 +154,13 @@ int main(int argc, char ** argv) {
    LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
            __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));

-    llama_print_timings(ctx);
+    llama_print_timings(ctx, nullptr);

    fprintf(stderr, "\n");

    llama_batch_free(batch);

+    llama_sampling_free(smpl);
    llama_free(ctx);
    llama_free_model(model);

--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -21,7 +21,7 @@ struct seq_draft {
    std::vector<llama_token> tokens;
    std::vector<std::vector<llama_token_data>> dists;

-    struct llama_sampling_context * ctx_sampling;
+    struct llama_sampling * smpl;
 };

 int main(int argc, char ** argv) {
@@ -37,16 +37,16 @@ int main(int argc, char ** argv) {
        return 1;
    }

+    // for probabilities to be computed even with temp = 0
+    params.sparams.n_probs = 16;
+
    // max number of parallel drafting sequences (i.e. tree branches)
    const int n_seq_dft = params.n_parallel;

    // probability threshold for splitting a draft branch (only for n_seq_dft > 1)
    const float p_split  = params.p_split;

-    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = time(NULL);
-    }
-    std::default_random_engine rng(params.seed);
+    std::default_random_engine rng(params.sparams.seed);
    std::uniform_real_distribution<> u_dist;

 #ifndef LOG_DISABLE_LOGS
@@ -73,10 +73,11 @@ int main(int argc, char ** argv) {
    // load the draft model
    params.model = params.model_draft;
    params.n_gpu_layers = params.n_gpu_layers_draft;
-    if (params.n_threads_draft > 0) {
-        params.n_threads = params.n_threads_draft;
+    if (params.draft_cpuparams.n_threads > 0) {
+        params.cpuparams.n_threads = params.draft_cpuparams.n_threads;
    }
-    params.n_threads_batch = params.n_threads_batch_draft;
+
+    params.cpuparams_batch.n_threads = params.draft_cpuparams_batch.n_threads;
    llama_init_result llama_init_dft = llama_init_from_gpt_params(params);
    model_dft = llama_init_dft.model;
    ctx_dft = llama_init_dft.context;
@@ -178,19 +179,15 @@ int main(int argc, char ** argv) {
    // used to determine end of generation
    bool has_eos = false;

-    // target model sampling context
-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
+    // target model sampling context (allocated here, released with llama_sampling_free below)
+    struct llama_sampling * smpl = llama_sampling_init(model_tgt, params.sparams);

    // draft sequence data
    std::vector<seq_draft> drafts(n_seq_dft);

-    params.sparams.grammar.clear(); // the draft samplers will copy the target sampler's grammar
-    if (params.sparams.temp == 0) {
-        params.sparams.temp = -1.0f; // force greedy sampling with probs for the draft model
-    }
-
    for (int s = 0; s < n_seq_dft; ++s) {
-        drafts[s].ctx_sampling = llama_sampling_init(params.sparams);
+        // allocate llama_sampling for each draft sequence
+        drafts[s].smpl = llama_sampling_init(model_dft, params.sparams);
    }

    llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1);
@@ -233,9 +230,15 @@ int main(int argc, char ** argv) {
                if (params.sparams.temp > 0) {
                    // stochastic verification

-                    llama_token_data_array dist_tgt = llama_sampling_prepare(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft], true, NULL);
-                    llama_sample_softmax(ctx_tgt, &dist_tgt);
-                    float p_tgt = 0, p_dft = 0;
+                    llama_sampling_set_logits(smpl, llama_get_logits_ith(ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]));
+
+                    auto & dist_tgt = *llama_sampling_get_candidates(smpl);
+
+                    llama_sampling_grammar(smpl, &dist_tgt);
+                    llama_sampling_softmax(smpl, &dist_tgt);
+
+                    float p_tgt = 0.0f;
+                    float p_dft = 0.0f;

                    // GGML_ASSERT(dist_tgt.size() == dist_dft.size());

@@ -277,7 +280,7 @@ int main(int argc, char ** argv) {
                            accept = true;
                            token_id = drafts[s].tokens[i_dft];
                            token_str = llama_token_to_piece(ctx_tgt, token_id);
-                            llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true);
+                            llama_sampling_accept(smpl, token_id, true);

                            LOG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str());
                            break;
@@ -331,8 +334,8 @@ int main(int argc, char ** argv) {
                        // all drafted tokens were rejected
                        // sample from the target model
                        LOG("all drafted tokens were rejected, sampling from residual distribution\n");
-                        token_id = llama_sample_token(ctx_tgt, &dist_tgt);
-                        llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true);
+                        token_id = llama_sampling_sample_dist(smpl, &dist_tgt);
+                        llama_sampling_accept(smpl, token_id, true);
                        token_str = llama_token_to_piece(ctx_tgt, token_id);
                    }

@@ -341,11 +344,11 @@ int main(int argc, char ** argv) {

                    // sample from the target model
                    LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
-                    token_id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]);
+                    token_id = llama_sampling_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]);

-                    llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true);
+                    llama_sampling_accept(smpl, token_id, true);

-                    //LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str());
+                    //LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, smpl->prev).c_str());

                    token_str = llama_token_to_piece(ctx_tgt, token_id);

@@ -433,7 +436,7 @@ int main(int argc, char ** argv) {
            break;
        }

-        llama_sampling_cp(ctx_sampling, drafts[0].ctx_sampling);
+        llama_sampling_cp(smpl, drafts[0].smpl);

        int n_seq_cur  = 1;
        int n_past_cur = n_past_dft;
@@ -462,20 +465,20 @@ int main(int argc, char ** argv) {
                    continue;
                }

-                llama_sampling_sample(drafts[s].ctx_sampling, ctx_dft, NULL, drafts[s].i_batch_dft);
+                llama_sampling_sample(drafts[s].smpl, ctx_dft, drafts[s].i_batch_dft);

-                const auto & cur_p = drafts[s].ctx_sampling->cur;
+                const auto * cur_p = llama_sampling_get_candidates(drafts[s].smpl);

-                for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p.size()); ++k) {
+                for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p->size); ++k) {
                    LOG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n",
-                            k, s, i, cur_p[k].id, cur_p[k].p, llama_token_to_piece(ctx_dft, cur_p[k].id).c_str());
+                            k, s, i, cur_p->data[k].id, cur_p->data[k].p, llama_token_to_piece(ctx_dft, cur_p->data[k].id).c_str());
                }

                std::vector<int> sa(1, s);

                // attempt to split the branch if the probability is high enough
                for (int f = 1; f < 8; ++f) {
-                    if (n_seq_cur < n_seq_dft && cur_p[f].p > p_split) {
+                    if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_split) {
                        LOG("splitting seq %3d into %3d\n", s, n_seq_cur);

                        llama_kv_cache_seq_rm(ctx_dft,    n_seq_cur, -1, -1);
@@ -502,7 +505,7 @@ int main(int argc, char ** argv) {
                        drafts[n_seq_cur].i_batch_dft = drafts[s].i_batch_dft;
                        drafts[n_seq_cur].i_batch_tgt = drafts[s].i_batch_tgt;

-                        llama_sampling_cp(drafts[s].ctx_sampling, drafts[n_seq_cur].ctx_sampling);
+                        llama_sampling_cp(drafts[s].smpl, drafts[n_seq_cur].smpl);

                        sa.push_back(n_seq_cur);

@@ -514,15 +517,15 @@ int main(int argc, char ** argv) {

                // add drafted token for each sequence
                for (int is = 0; is < (int) sa.size(); ++is) {
-                    const llama_token id = cur_p[is].id;
+                    const llama_token id = cur_p->data[is].id;

                    const int s = sa[is];

-                    llama_sampling_accept(drafts[s].ctx_sampling, ctx_dft, id, true);
+                    llama_sampling_accept(drafts[s].smpl, id, true);

                    drafts[s].tokens.push_back(id);
                    // save cur_p.data into drafts[s].dists
-                    drafts[s].dists.push_back(cur_p);
+                    drafts[s].dists.push_back({cur_p->data, cur_p->data + cur_p->size});

                    // add unique drafted tokens to the target batch
                    drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens);
@@ -593,14 +596,15 @@ int main(int argc, char ** argv) {
    LOG_TEE("accept    = %.3f%%\n", 100.0f * n_accept / n_drafted);

    LOG_TEE("\ndraft:\n");
-    llama_print_timings(ctx_dft);
+    // TODO: print sampling/grammar timings for all drafts
+    llama_print_timings(ctx_dft, nullptr);

    LOG_TEE("\ntarget:\n");
-    llama_print_timings(ctx_tgt);
+    llama_print_timings(ctx_tgt, smpl);

-    llama_sampling_free(ctx_sampling);
+    llama_sampling_free(smpl);
    for (int s = 0; s < n_seq_dft; ++s) {
-        llama_sampling_free(drafts[s].ctx_sampling);
+        llama_sampling_free(drafts[s].smpl);
    }

    llama_batch_free(batch_dft);
--- a/examples/tokenize/tokenize.cpp
+++ b/examples/tokenize/tokenize.cpp
@@ -362,7 +362,7 @@ int main(int raw_argc, char ** raw_argv) {
        prompt = stdin_buffer.str();
    }

-    const bool model_wants_add_bos = llama_should_add_bos_token(model);
+    const bool model_wants_add_bos = llama_add_bos_token(model);
    const bool add_bos = model_wants_add_bos && !no_bos;
    const bool parse_special = !no_parse_special;

--- a/flake.lock
+++ b/flake.lock
@@ -20,11 +20,11 @@
    },
    "nixpkgs": {
      "locked": {
-        "lastModified": 1723175592,
-        "narHash": "sha256-M0xJ3FbDUc4fRZ84dPGx5VvgFsOzds77KiBMW/mMTnI=",
+        "lastModified": 1724224976,
+        "narHash": "sha256-Z/ELQhrSd7bMzTO8r7NZgi9g5emh+aRKoCdaAv5fiO0=",
        "owner": "NixOS",
        "repo": "nixpkgs",
-        "rev": "5e0ca22929f3342b19569b21b2f3462f053e497b",
+        "rev": "c374d94f1536013ca8e92341b540eba4c22f9c62",
        "type": "github"
      },
      "original": {
--- a/flake.nix
+++ b/flake.nix
@@ -145,7 +145,9 @@
            # the same path you would with an overlay.
            legacyPackages = {
              llamaPackages = pkgs.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
-              llamaPackagesWindows = pkgs.pkgsCross.mingwW64.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
+              llamaPackagesWindows = pkgs.pkgsCross.mingwW64.callPackage .devops/nix/scope.nix {
+                inherit llamaVersion;
+              };
              llamaPackagesCuda = pkgsCuda.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
              llamaPackagesRocm = pkgsRocm.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
            };
@@ -157,6 +159,7 @@
                default = config.legacyPackages.llamaPackages.llama-cpp;
                vulkan = config.packages.default.override { useVulkan = true; };
                windows = config.legacyPackages.llamaPackagesWindows.llama-cpp;
+                python-scripts = config.legacyPackages.llamaPackages.python-scripts;
              }
              // lib.optionalAttrs pkgs.stdenv.isLinux {
                cuda = config.legacyPackages.llamaPackagesCuda.llama-cpp;
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -129,13 +129,13 @@ option(GGML_CUDA_NO_VMM                     "ggml: do not try to use CUDA VMM"
 option(GGML_CUDA_FA_ALL_QUANTS              "ggml: compile all quants for FlashAttention"     OFF)
 option(GGML_CUDA_USE_GRAPHS                 "ggml: use CUDA graphs (llama.cpp only)"          OFF)

-option(GGML_CURL                            "ggml: use libcurl to download model from an URL" OFF)
 option(GGML_HIPBLAS                         "ggml: use hipBLAS"                               OFF)
 option(GGML_HIP_UMA                         "ggml: use HIP unified memory architecture"       OFF)
 option(GGML_VULKAN                          "ggml: use Vulkan"                                OFF)
 option(GGML_VULKAN_CHECK_RESULTS            "ggml: run Vulkan op checks"                      OFF)
 option(GGML_VULKAN_DEBUG                    "ggml: enable Vulkan debug output"                OFF)
 option(GGML_VULKAN_MEMORY_DEBUG             "ggml: enable Vulkan memory debug output"         OFF)
+option(GGML_VULKAN_PERF                     "ggml: enable Vulkan perf output"                 OFF)
 option(GGML_VULKAN_VALIDATE                 "ggml: enable Vulkan validation"                  OFF)
 option(GGML_VULKAN_RUN_TESTS                "ggml: run Vulkan tests"                          OFF)
 option(GGML_KOMPUTE                         "ggml: use Kompute"                               OFF)
--- a/ggml/include/ggml-alloc.h
+++ b/ggml/include/ggml-alloc.h
@@ -7,8 +7,8 @@ extern "C" {
 #endif

 typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
-typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
-typedef struct ggml_backend * ggml_backend_t;
+typedef struct      ggml_backend_buffer * ggml_backend_buffer_t;
+typedef struct             ggml_backend * ggml_backend_t;

 // Tensor allocator
 struct ggml_tallocr {
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -63,6 +63,7 @@ extern "C" {
    GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
    GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);

+    // "offset" refers to the offset of the tensor data for setting/getting data
    GGML_API GGML_CALL void ggml_backend_tensor_set(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
    GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);

@@ -102,6 +103,7 @@ extern "C" {

    GGML_API GGML_CALL bool ggml_backend_is_cpu                (ggml_backend_t backend);
    GGML_API           void ggml_backend_cpu_set_n_threads     (ggml_backend_t backend_cpu, int n_threads);
+    GGML_API           void ggml_backend_cpu_set_threadpool    (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
    GGML_API           void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);

    // Create a backend buffer from an existing pointer
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -220,7 +220,7 @@
 #include <stdio.h>

 #define GGML_FILE_MAGIC   0x67676d6c // "ggml"
-#define GGML_FILE_VERSION 1
+#define GGML_FILE_VERSION 2

 #define GGML_QNT_VERSION        2    // bump this on quantization format changes
 #define GGML_QNT_VERSION_FACTOR 1000 // do not change this
@@ -231,6 +231,8 @@
 #define GGML_MAX_SRC            10
 #ifndef GGML_MAX_NAME
 #define GGML_MAX_NAME           64
 #endif
+// defined outside the GGML_MAX_NAME guard so it exists even when GGML_MAX_NAME is overridden by the build
+#define GGML_MAX_N_THREADS      512
 #define GGML_MAX_OP_PARAMS      64
 #define GGML_DEFAULT_N_THREADS  4
@@ -244,6 +246,8 @@
 #define GGML_EXIT_SUCCESS 0
 #define GGML_EXIT_ABORTED 1

+#define GGML_ROPE_TYPE_NEOX 2
+
 #define GGUF_MAGIC "GGUF"

 #define GGUF_VERSION 3
@@ -451,6 +455,8 @@ extern "C" {
        GGML_OP_SQR,
        GGML_OP_SQRT,
        GGML_OP_LOG,
+        GGML_OP_SIN,
+        GGML_OP_COS,
        GGML_OP_SUM,
        GGML_OP_SUM_ROWS,
        GGML_OP_MEAN,
@@ -488,9 +494,11 @@ extern "C" {
        GGML_OP_CLAMP,
        GGML_OP_CONV_TRANSPOSE_1D,
        GGML_OP_IM2COL,
+        GGML_OP_IM2COL_BACK,
        GGML_OP_CONV_TRANSPOSE_2D,
        GGML_OP_POOL_1D,
        GGML_OP_POOL_2D,
+        GGML_OP_POOL_2D_BACK,
        GGML_OP_UPSCALE, // nearest interpolate
        GGML_OP_PAD,
        GGML_OP_ARANGE,
@@ -506,6 +514,7 @@ extern "C" {
        GGML_OP_WIN_UNPART,
        GGML_OP_GET_REL_POS,
        GGML_OP_ADD_REL_POS,
+        GGML_OP_RWKV_WKV,

        GGML_OP_UNARY,

@@ -540,6 +549,7 @@ extern "C" {
        GGML_UNARY_OP_SILU,
        GGML_UNARY_OP_HARDSWISH,
        GGML_UNARY_OP_HARDSIGMOID,
+        GGML_UNARY_OP_EXP,

        GGML_UNARY_OP_COUNT,
    };
@@ -622,6 +632,29 @@ extern "C" {
    // If it returns true, the computation is aborted
    typedef bool (*ggml_abort_callback)(void * data);

+    // Scheduling priorities
+    enum ggml_sched_priority {
+        GGML_SCHED_PRIO_NORMAL,
+        GGML_SCHED_PRIO_MEDIUM,
+        GGML_SCHED_PRIO_HIGH,
+        GGML_SCHED_PRIO_REALTIME
+    };
+
+    // Threadpool params
+    // Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
+    struct ggml_threadpool_params {
+        bool                cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
+        int                 n_threads;                   // number of threads
+        enum ggml_sched_priority prio;                   // thread priority
+        uint32_t            poll;                        // polling level (0 - no polling, 100 - aggressive polling)
+        bool                strict_cpu;                  // strict cpu placement
+        bool                paused;                      // start in paused state
+    };
+
+    struct ggml_threadpool;     // forward declaration, see ggml.c
+
+    typedef struct  ggml_threadpool * ggml_threadpool_t;
+
    // the compute plan that needs to be prepared for ggml_graph_compute()
    // since https://github.com/ggerganov/ggml/issues/287
    struct ggml_cplan {
@@ -629,6 +662,7 @@ extern "C" {
        uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`

        int n_threads;
+        struct ggml_threadpool * threadpool;

        // abort ggml_graph_compute when true
        ggml_abort_callback abort_callback;
@@ -967,6 +1001,22 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

+    GGML_API struct ggml_tensor * ggml_sin(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_sin_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_cos(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_cos_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
    // return scalar
    GGML_API struct ggml_tensor * ggml_sum(
            struct ggml_context * ctx,
@@ -1117,6 +1167,14 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

+    GGML_API struct ggml_tensor * ggml_exp(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_exp_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
    // normalize along rows
    GGML_API struct ggml_tensor * ggml_norm(
            struct ggml_context * ctx,
@@ -1453,8 +1511,8 @@ extern "C" {
            struct ggml_tensor  * b);

    // rotary position embedding
-    // if mode & 1 == 1, skip n_past elements (NOT SUPPORTED)
-    // if mode & 2 == 1, GPT-NeoX style
+    // if (mode & 1) - skip n_past elements (NOT SUPPORTED)
+    // if (mode & GGML_ROPE_TYPE_NEOX) - GPT-NeoX style
    //
    // b is an int32 vector with size a->ne[2], it contains the positions
    GGML_API struct ggml_tensor * ggml_rope(
@@ -1564,34 +1622,49 @@ extern "C" {
            float                 min,
            float                 max);

+    // im2col
+    // converts data into a format that effectively results in a convolution when combined with matrix multiplication
    GGML_API struct ggml_tensor * ggml_im2col(
            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            int                  s0,
-            int                  s1,
-            int                  p0,
-            int                  p1,
-            int                  d0,
-            int                  d1,
-            bool                 is_2D,
-            enum ggml_type       dst_type);
+            struct ggml_tensor  * a,  // convolution kernel
+            struct ggml_tensor  * b,  // data
+            int                   s0, // stride dimension 0
+            int                   s1, // stride dimension 1
+            int                   p0, // padding dimension 0
+            int                   p1, // padding dimension 1
+            int                   d0, // dilation dimension 0
+            int                   d1, // dilation dimension 1
+            bool                  is_2D,
+            enum ggml_type        dst_type);
+
+    GGML_API struct ggml_tensor * ggml_im2col_back(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,  // convolution kernel
+        struct ggml_tensor  * b,  // gradient of im2col output
+        int64_t             * ne, // shape of im2col input
+        int                   s0, // stride dimension 0
+        int                   s1, // stride dimension 1
+        int                   p0, // padding dimension 0
+        int                   p1, // padding dimension 1
+        int                   d0, // dilation dimension 0
+        int                   d1, // dilation dimension 1
+        bool                  is_2D);

    GGML_API struct ggml_tensor * ggml_conv_depthwise_2d(
            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            int                  s0,
-            int                  s1,
-            int                  p0,
-            int                  p1,
-            int                  d0,
-            int                  d1);
+            struct ggml_tensor  * a,  // convolution kernel
+            struct ggml_tensor  * b,  // data
+            int                  s0,  // stride dimension 0
+            int                  s1,  // stride dimension 1
+            int                  p0,  // padding dimension 0
+            int                  p1,  // padding dimension 1
+            int                  d0,  // dilation dimension 0
+            int                  d1); // dilation dimension 1

    GGML_API struct ggml_tensor * ggml_conv_1d(
            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
+            struct ggml_tensor  * a,   // convolution kernel
+            struct ggml_tensor  * b,   // data
            int                   s0,  // stride
            int                   p0,  // padding
            int                   d0); // dilation
@@ -1600,29 +1673,29 @@ extern "C" {
    // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
    GGML_API struct ggml_tensor* ggml_conv_1d_ph(
            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            int                   s,
-            int                   d);
+            struct ggml_tensor  * a,  // convolution kernel
+            struct ggml_tensor  * b,  // data
+            int                   s,  // stride
+            int                   d); // dilation

    GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            int                   s0,
-            int                   p0,
-            int                   d0);
+            struct ggml_tensor  * a,   // convolution kernel
+            struct ggml_tensor  * b,   // data
+            int                   s0,  // stride
+            int                   p0,  // padding
+            int                   d0); // dilation

    GGML_API struct ggml_tensor * ggml_conv_2d(
            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            int                   s0,
-            int                   s1,
-            int                   p0,
-            int                   p1,
-            int                   d0,
-            int                   d1);
+            struct ggml_tensor  * a,   // convolution kernel
+            struct ggml_tensor  * b,   // data
+            int                   s0,  // stride dimension 0
+            int                   s1,  // stride dimension 1
+            int                   p0,  // padding dimension 0
+            int                   p1,  // padding dimension 1
+            int                   d0,  // dilation dimension 0
+            int                   d1); // dilation dimension 1


    // kernel size is a->ne[0] x a->ne[1]
@@ -1684,6 +1757,18 @@ extern "C" {
            float                 p0,
            float                 p1);

+    GGML_API struct ggml_tensor * ggml_pool_2d_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * af, // "a"/input used in forward pass
+            enum ggml_op_pool     op,
+            int                   k0,
+            int                   k1,
+            int                   s0,
+            int                   s1,
+            float                 p0,
+            float                 p1);
+
    // nearest interpolate
    // multiplies ne0 and ne1 by scale factor
    // used in stable-diffusion
@@ -1758,7 +1843,8 @@ extern "C" {
            struct ggml_tensor  * v,
            struct ggml_tensor  * mask,
            float                 scale,
-            float                 max_bias);
+            float                 max_bias,
+            float                 logit_softcap);

    GGML_API void ggml_flash_attn_ext_set_prec(
            struct ggml_tensor * a,
@@ -1775,10 +1861,8 @@ extern "C" {

    GGML_API struct ggml_tensor * ggml_ssm_conv(
            struct ggml_context * ctx,
-            struct ggml_tensor  * s,
-            struct ggml_tensor  * x,
-            struct ggml_tensor  * c,
-            struct ggml_tensor  * sq);
+            struct ggml_tensor  * sx,
+            struct ggml_tensor  * c);

    GGML_API struct ggml_tensor * ggml_ssm_scan(
            struct ggml_context * ctx,
@@ -1787,8 +1871,7 @@ extern "C" {
            struct ggml_tensor  * dt,
            struct ggml_tensor  * A,
            struct ggml_tensor  * B,
-            struct ggml_tensor  * C,
-            struct ggml_tensor  * sq);
+            struct ggml_tensor  * C);

    // partition into non-overlapping windows with padding if needed
    // example:
@@ -1840,6 +1923,15 @@ extern "C" {
            struct ggml_tensor  * pw,
            struct ggml_tensor  * ph);

+    GGML_API struct ggml_tensor * ggml_rwkv_wkv(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * k,
+            struct ggml_tensor  * v,
+            struct ggml_tensor  * r,
+            struct ggml_tensor  * tf,
+            struct ggml_tensor  * td,
+            struct ggml_tensor  * state);
+
    // custom operators

    typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
@@ -2010,10 +2102,23 @@ extern "C" {
    GGML_API size_t ggml_graph_overhead(void);
    GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);

+    GGML_API struct ggml_threadpool_params   ggml_threadpool_params_default(int n_threads);
+    GGML_API void                            ggml_threadpool_params_init  (struct ggml_threadpool_params *p, int n_threads);
+    GGML_API bool                            ggml_threadpool_params_match (const struct ggml_threadpool_params *p0, const struct ggml_threadpool_params *p1);
+    GGML_API struct ggml_threadpool*         ggml_threadpool_new          (struct ggml_threadpool_params  * params);
+    GGML_API void                            ggml_threadpool_free         (struct ggml_threadpool * threadpool);
+    GGML_API int                             ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
+    GGML_API void                            ggml_threadpool_pause        (struct ggml_threadpool * threadpool);
+    GGML_API void                            ggml_threadpool_resume       (struct ggml_threadpool * threadpool);
+
    // ggml_graph_plan() has to be called before ggml_graph_compute()
    // when plan.work_size > 0, caller must allocate memory for plan.work_data
-    GGML_API struct ggml_cplan ggml_graph_plan   (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
-    GGML_API enum ggml_status  ggml_graph_compute(      struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+    GGML_API struct ggml_cplan ggml_graph_plan(
+                  const struct ggml_cgraph * cgraph,
+                                       int   n_threads, /* = GGML_DEFAULT_N_THREADS */
+                    struct ggml_threadpool * threadpool /* = NULL */ );
+    GGML_API enum ggml_status  ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+
    // same as ggml_graph_compute() but the work data is allocated as a part of the context
    // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
    GGML_API enum ggml_status  ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -549,6 +549,13 @@ if (GGML_SYCL)
    file(GLOB   GGML_SOURCES_SYCL "ggml-sycl/*.cpp")
    list(APPEND GGML_SOURCES_SYCL "ggml-sycl.cpp")

+    find_package(DNNL)
+    message("-- DNNL found:" ${DNNL_FOUND})
+    if (GGML_SYCL_TARGET STREQUAL "INTEL")
+        add_compile_definitions(GGML_SYCL_DNNL=${DNNL_FOUND})
+    else()
+        add_compile_definitions(GGML_SYCL_DNNL=0)
+    endif()
    if (WIN32)
        find_package(IntelSYCL REQUIRED)
        find_package(MKL REQUIRED)
@@ -561,6 +568,9 @@ if (GGML_SYCL)
            set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl pthread m dl onemkl)
        endif()
    endif()
+    if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
+        list(APPEND GGML_EXTRA_LIBS DNNL::dnnl)
+    endif()
 endif()

 if (GGML_RPC)
@@ -602,6 +612,10 @@ if (GGML_VULKAN)
            add_compile_definitions(GGML_VULKAN_MEMORY_DEBUG)
        endif()

+        if (GGML_VULKAN_PERF)
+            add_compile_definitions(GGML_VULKAN_PERF)
+        endif()
+
        if (GGML_VULKAN_VALIDATE)
            add_compile_definitions(GGML_VULKAN_VALIDATE)
        endif()
@@ -1233,7 +1247,7 @@ endif()

 # Data types, macros and functions related to controlling CPU affinity and
 # some memory allocation are available on Linux through GNU extensions in libc
-if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+if (CMAKE_SYSTEM_NAME MATCHES "Linux" OR CMAKE_SYSTEM_NAME MATCHES "Android")
    add_compile_definitions(_GNU_SOURCE)
 endif()

--- a/ggml/src/ggml-aarch64.c
+++ b/ggml/src/ggml-aarch64.c
@@ -337,33 +337,18 @@ static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict ds
 }

 size_t quantize_q4_0_4x4(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
-    if (!quant_weights) {
-        return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 4);
-    }
-    else {
-        assert(false);
-        return 0;
-    }
+    UNUSED(quant_weights);
+    return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 4);
 }

 size_t quantize_q4_0_4x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
-    if (!quant_weights) {
-        return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 8);
-    }
-    else {
-        assert(false);
-        return 0;
-    }
+    UNUSED(quant_weights);
+    return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 8);
 }

 size_t quantize_q4_0_8x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
-    if (!quant_weights) {
-        return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 8, 8);
-    }
-    else {
-        assert(false);
-        return 0;
-    }
+    UNUSED(quant_weights);
+    return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 8, 8);
 }

 void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
--- a/ggml/src/ggml-backend.c
+++ b/ggml/src/ggml-backend.c
@@ -722,9 +722,11 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
 #endif

 struct ggml_backend_cpu_context {
-    int n_threads;
-    void * work_data;
-    size_t work_size;
+    int                 n_threads;
+    ggml_threadpool_t   threadpool;
+
+    void *              work_data;
+    size_t              work_size;

    ggml_abort_callback abort_callback;
    void *              abort_callback_data;
@@ -759,7 +761,7 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg

    struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));

-    cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
+    cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
    cpu_plan->cgraph = *cgraph; // FIXME: deep copy

    if (cpu_plan->cplan.work_size > 0) {
@@ -796,7 +798,7 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backe
 GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;

-    struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
+    struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);

    if (cpu_ctx->work_size < cplan.work_size) {
        free(cpu_ctx->work_data);
@@ -873,6 +875,7 @@ ggml_backend_t ggml_backend_cpu_init(void) {
    }

    ctx->n_threads           = GGML_DEFAULT_N_THREADS;
+    ctx->threadpool          = NULL;
    ctx->work_data           = NULL;
    ctx->work_size           = 0;
    ctx->abort_callback      = NULL;
@@ -903,6 +906,18 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
    ctx->n_threads = n_threads;
 }

+void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) {
+    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
+
+    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
+
+    if (ctx->threadpool && ctx->threadpool != threadpool) {
+        // already had a different threadpool, pause/suspend it before switching
+        ggml_threadpool_pause(ctx->threadpool);
+    }
+    ctx->threadpool = threadpool;
+}
+
 void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));

@@ -1018,10 +1033,6 @@ static bool ggml_is_view_op(enum ggml_op op) {
 #define GGML_SCHED_MAX_BACKENDS 16
 #endif

-#ifndef GGML_SCHED_MAX_SPLITS
-#define GGML_SCHED_MAX_SPLITS 2048
-#endif
-
 #ifndef GGML_SCHED_MAX_SPLIT_INPUTS
 #define GGML_SCHED_MAX_SPLIT_INPUTS GGML_MAX_SRC
 #endif
@@ -1125,7 +1136,8 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, co
 }

 #if 0
-static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
+#define GGML_SCHED_MAX_SPLITS_DEBUG 4096
+static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS_DEBUG*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
 #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
 #define GET_CAUSE(node) causes[hash_id(node)]
 #else
@@ -1549,7 +1561,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                    sched->splits = realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split));
                    GGML_ASSERT(sched->splits != NULL);
                }
-                GGML_ASSERT(i_split < GGML_SCHED_MAX_SPLITS);
                split = &sched->splits[i_split];
                split->backend_id = node_backend_id;
                split->i_start = i;
@@ -1865,13 +1876,14 @@ ggml_backend_sched_t ggml_backend_sched_new(
    sched->hv_tensor_backend_ids = malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
    sched->hv_tensor_copies      = malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));

-    const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
+    const size_t ggml_sched_max_splits = graph_size; // at most there is one split for each node in the graph
+    const size_t nodes_size = graph_size + ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2;
    sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
    sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
    sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
    sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));

-    sched->context_buffer_size = GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false);
+    sched->context_buffer_size = ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false);
    sched->context_buffer = malloc(sched->context_buffer_size);

    const int initial_splits_capacity = 16;
--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
@@ -2881,7 +2881,7 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast,
                             beta_slow, corr_dims);

-    const bool is_neox = mode & 2;
+    const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;

    // init cos/sin cache
    ggml_cann_pool_alloc sin_allocator(
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -9,8 +9,10 @@
 #include "ggml-cuda/binbcast.cuh"
 #include "ggml-cuda/clamp.cuh"
 #include "ggml-cuda/concat.cuh"
+#include "ggml-cuda/conv-transpose-1d.cuh"
 #include "ggml-cuda/convert.cuh"
 #include "ggml-cuda/cpy.cuh"
+#include "ggml-cuda/cross-entropy-loss.cuh"
 #include "ggml-cuda/diagmask.cuh"
 #include "ggml-cuda/dmmv.cuh"
 #include "ggml-cuda/fattn.cuh"
@@ -29,7 +31,6 @@
 #include "ggml-cuda/tsembd.cuh"
 #include "ggml-cuda/unary.cuh"
 #include "ggml-cuda/upscale.cuh"
-#include "ggml-cuda/conv-transpose-1d.cuh"

 #include <algorithm>
 #include <array>
@@ -2181,6 +2182,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
        case GGML_OP_ADD:
            ggml_cuda_op_add(ctx, dst);
            break;
+        case GGML_OP_SUB:
+            ggml_cuda_op_sub(ctx, dst);
+            break;
        case GGML_OP_ACC:
            ggml_cuda_op_acc(ctx, dst);
            break;
@@ -2267,6 +2271,12 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
        case GGML_OP_SQRT:
            ggml_cuda_op_sqrt(ctx, dst);
            break;
+        case GGML_OP_SIN:
+            ggml_cuda_op_sin(ctx, dst);
+            break;
+        case GGML_OP_COS:
+            ggml_cuda_op_cos(ctx, dst);
+            break;
        case GGML_OP_CLAMP:
            ggml_cuda_op_clamp(ctx, dst);
            break;
@@ -2303,6 +2313,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
        case GGML_OP_FLASH_ATTN_EXT:
            ggml_cuda_flash_attn_ext(ctx, dst);
            break;
+        case GGML_OP_CROSS_ENTROPY_LOSS:
+            ggml_cuda_cross_entropy_loss(ctx, dst);
+            break;
        default:
            return false;
    }
@@ -2610,6 +2623,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
                assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
                for (int j = 0; j < GGML_MAX_SRC; j++) {
                    if (node->src[j] != nullptr) {
+                        assert(node->src[j]->buffer);
                        assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
                    }
                }
@@ -2853,12 +2867,15 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
        case GGML_OP_TRANSPOSE:
        case GGML_OP_NORM:
        case GGML_OP_ADD:
+        case GGML_OP_SUB:
        case GGML_OP_MUL:
        case GGML_OP_DIV:
        case GGML_OP_RMS_NORM:
        case GGML_OP_SCALE:
        case GGML_OP_SQR:
        case GGML_OP_SQRT:
+        case GGML_OP_SIN:
+        case GGML_OP_COS:
        case GGML_OP_CLAMP:
        case GGML_OP_CONT:
        case GGML_OP_DIAG_MASK_INF:
@@ -2890,6 +2907,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
            }
            return ggml_cuda_info().devices[cuda_ctx->device].cc >= CC_VOLTA &&
                op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16;
+        case GGML_OP_CROSS_ENTROPY_LOSS:
+            return true;
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
        default:
            return false;
--- a/ggml/src/ggml-cuda/binbcast.cu
+++ b/ggml/src/ggml-cuda/binbcast.cu
@@ -9,6 +9,10 @@ static __device__ __forceinline__ float op_add(const float a, const float b) {
    return a + b;
 }

+static __device__ __forceinline__ float op_sub(const float a, const float b) {
+    return a - b;
+}
+
 static __device__ __forceinline__ float op_mul(const float a, const float b) {
    return a * b;
 }
@@ -271,6 +275,10 @@ void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
 }

+void ggml_cuda_op_sub(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_sub>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
+}
+
 void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
 }
--- a/ggml/src/ggml-cuda/binbcast.cuh
+++ b/ggml/src/ggml-cuda/binbcast.cuh
@@ -2,5 +2,6 @@

 void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_sub(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--- a/Show More
+++ b/Show More