cmake : use ggml-metal.metal from source dir to build default.metallib

2026-04-30 16:47:31 +03:00 · 2024-09-05 12:17:56 -04:00
127 changed files with 5809 additions and 9787 deletions
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -375,7 +375,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3

      - name: Dependencies
        id: depends
@@ -401,7 +401,7 @@ jobs:
    continue-on-error: true

    steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v2

      - name: add oneAPI to apt
        shell: bash
@@ -442,7 +442,7 @@ jobs:
    continue-on-error: true

    steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v2

      - name: add oneAPI to apt
        shell: bash
@@ -546,7 +546,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v1

      - name: Dependencies
        id: depends
@@ -576,7 +576,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v1

      - name: Dependencies
        id: depends
@@ -610,7 +610,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v1

      - name: Dependencies
        id: depends
@@ -857,7 +857,7 @@ jobs:
        run: |
          mkdir build
          cd build
-          cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON
+          cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON
          cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1)) -t ggml
          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}

@@ -969,14 +969,14 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3

      - name: Install
        id: depends
        run: |
          $ErrorActionPreference = "Stop"
          write-host "Downloading AMD HIP SDK Installer"
-          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
+          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-23.Q4-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
          write-host "Installing AMD HIP SDK"
          Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
          write-host "Completed AMD HIP SDK installation"
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -37,9 +37,9 @@ jobs:
          - { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" }
          - { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" }
          - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
-          # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
-          #- { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          #- { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          - { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          - { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          # Note: the full-rocm image is failing due to a "no space left on device" error. It is disabled for now to allow the workflow to complete.
          #- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          - { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" }
          - { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" }
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -173,7 +173,6 @@ jobs:
        if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
        run: |
          cd examples/server/tests
-          $env:PYTHONIOENCODING = ":replace"
          behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp

      - name: Slow tests
--- a/.gitignore
+++ b/.gitignore
@@ -61,7 +61,6 @@ llama-batched-swift
 /rpc-server
 out/
 tmp/
-autogen-*.md

 # Deprecated

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -139,16 +139,10 @@ set(LLAMA_BIN_INSTALL_DIR     ${CMAKE_INSTALL_BINDIR}     CACHE PATH "Location o
 # determining _precisely_ which defines are necessary for the llama-config
 # package.
 #
-set(GGML_TRANSIENT_DEFINES)
 get_target_property(GGML_DIRECTORY ggml SOURCE_DIR)
 get_directory_property(GGML_DIR_DEFINES DIRECTORY ${GGML_DIRECTORY} COMPILE_DEFINITIONS)
-if (GGML_DIR_DEFINES)
-    list(APPEND GGML_TRANSIENT_DEFINES ${GGML_DIR_DEFINES})
-endif()
 get_target_property(GGML_TARGET_DEFINES ggml COMPILE_DEFINITIONS)
-if (GGML_TARGET_DEFINES)
-    list(APPEND GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES})
-endif()
+set(GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES} ${GGML_DIR_DEFINES})
 get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)

 set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h)
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -32,8 +32,8 @@

    {
        "name": "arm64-windows-msvc", "hidden": true,
-        "architecture": { "value": "arm64",    "strategy": "external" },
-        "toolset":      { "value": "host=x64", "strategy": "external" },
+        "architecture": { "value": "arm64",       "strategy": "external" },
+        "toolset":      { "value": "host=x86_64", "strategy": "external" },
        "cacheVariables": {
            "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake"
        }
@@ -41,8 +41,8 @@

    {
        "name": "arm64-windows-llvm", "hidden": true,
-        "architecture": { "value": "arm64",    "strategy": "external" },
-        "toolset":      { "value": "host=x64", "strategy": "external" },
+        "architecture": { "value": "arm64",       "strategy": "external" },
+        "toolset":      { "value": "host=x86_64", "strategy": "external" },
        "cacheVariables": {
            "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-llvm.cmake"
        }
--- a/34
+++ b/34
@@ -39,12 +39,10 @@ BUILD_TARGETS = \
 	llama-tokenize \
 	llama-vdot \
 	llama-cvector-generator \
-	llama-gen-docs \
 	tests/test-c.o

 # Binaries only useful for tests
 TEST_TARGETS = \
-	tests/test-arg-parser \
 	tests/test-autorelease \
 	tests/test-backend-ops \
 	tests/test-chat-template \
@@ -434,7 +432,7 @@ endif
 # TODO: probably these flags need to be tweaked on some architectures
 #       feel free to update the Makefile for your architecture and send a pull request or issue

-ifndef RISCV_CROSS_COMPILE
+ifndef RISCV

 ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
 	# Use all CPU extensions that are available:
@@ -514,12 +512,7 @@ ifneq ($(filter loongarch64%,$(UNAME_M)),)
 	MK_CXXFLAGS += -mlasx
 endif

-ifneq ($(filter riscv64%,$(UNAME_M)),)
-	MK_CFLAGS   += -march=rv64gcv -mabi=lp64d
-	MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
-endif
-
-else # RISC-V CROSS COMPILATION
+else
 	MK_CFLAGS   += -march=rv64gcv -mabi=lp64d
 	MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
 endif
@@ -930,11 +923,11 @@ OBJ_LLAMA = \

 OBJ_COMMON = \
 	common/common.o \
-	common/arg.o \
 	common/console.o \
 	common/ngram-cache.o \
 	common/sampling.o \
 	common/train.o \
+	common/grammar-parser.o \
 	common/build-info.o \
 	common/json-schema-to-grammar.o

@@ -1163,11 +1156,6 @@ common/common.o: \
 	include/llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

-common/arg.o: \
-	common/arg.cpp \
-	common/arg.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
 common/sampling.o: \
 	common/sampling.cpp \
 	common/sampling.h \
@@ -1179,6 +1167,11 @@ common/console.o: \
 	common/console.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

+common/grammar-parser.o: \
+	common/grammar-parser.cpp \
+	common/grammar-parser.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
 common/json-schema-to-grammar.o: \
 	common/json-schema-to-grammar.cpp \
 	common/json-schema-to-grammar.h
@@ -1440,7 +1433,6 @@ llama-server: \
 	examples/server/system-prompts.js.hpp \
 	examples/server/prompt-formats.js.hpp \
 	examples/server/json-schema-to-grammar.mjs.hpp \
-	examples/server/loading.html.hpp \
 	common/json.hpp \
 	common/stb_image.h \
 	$(OBJ_ALL)
@@ -1456,11 +1448,6 @@ examples/server/%.hpp: examples/server/public/% Makefile
 		echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
 	) > $@

-llama-gen-docs: examples/gen-docs/gen-docs.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
 libllava.a: examples/llava/llava.cpp \
 	examples/llava/llava.h \
 	examples/llava/clip.cpp \
@@ -1518,11 +1505,6 @@ run-benchmark-matmult: llama-benchmark-matmult

 .PHONY: run-benchmark-matmult swift

-tests/test-arg-parser: tests/test-arg-parser.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
 tests/test-llama-grammar: tests/test-llama-grammar.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
--- a/README.md
+++ b/README.md
@@ -17,7 +17,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

 ## Hot topics

- Huggingface GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
+- *add hot topics here*

 ----

@@ -89,7 +89,6 @@ Typically finetunes of the base models below are supported as well.
 - [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)
 - [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct)
 - [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a)
- [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat)

 (instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))

@@ -164,7 +163,6 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
 - [AIKit](https://github.com/sozercan/aikit) (MIT)
 - [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL)
- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)

 *(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*

@@ -173,7 +171,6 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML
 - [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
 - [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage
- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with prebuild Mobile and Web platform wrappers and a model example)

 **Infrastructure:**

--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -54,12 +54,12 @@ add_library(${TARGET} STATIC
    base64.hpp
    common.h
    common.cpp
-    arg.h
-    arg.cpp
    sampling.h
    sampling.cpp
    console.h
    console.cpp
+    grammar-parser.h
+    grammar-parser.cpp
    json.hpp
    json-schema-to-grammar.cpp
    train.h
--- a/common/arg.cpp
+++ b/common/arg.cpp
--- a/common/arg.h
+++ b/common/arg.h
@@ -1,77 +0,0 @@
-#pragma once
-
-#include "common.h"
-
-#include <set>
-#include <string>
-#include <vector>
-
-//
-// CLI argument parsing
-//
-
-struct llama_arg {
-    std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
-    std::vector<const char *> args;
-    const char * value_hint   = nullptr; // help text or example for arg value
-    const char * value_hint_2 = nullptr; // for second arg value
-    const char * env          = nullptr;
-    std::string help;
-    bool is_sparam = false; // is current arg a sampling param?
-    void (*handler_void)   (gpt_params & params) = nullptr;
-    void (*handler_string) (gpt_params & params, const std::string &) = nullptr;
-    void (*handler_str_str)(gpt_params & params, const std::string &, const std::string &) = nullptr;
-    void (*handler_int)    (gpt_params & params, int) = nullptr;
-
-    llama_arg(
-        const std::initializer_list<const char *> & args,
-        const char * value_hint,
-        const std::string & help,
-        void (*handler)(gpt_params & params, const std::string &)
-    ) : args(args), value_hint(value_hint), help(help), handler_string(handler) {}
-
-    llama_arg(
-        const std::initializer_list<const char *> & args,
-        const char * value_hint,
-        const std::string & help,
-        void (*handler)(gpt_params & params, int)
-    ) : args(args), value_hint(value_hint), help(help), handler_int(handler) {}
-
-    llama_arg(
-        const std::initializer_list<const char *> & args,
-        const std::string & help,
-        void (*handler)(gpt_params & params)
-    ) : args(args), help(help), handler_void(handler) {}
-
-    // support 2 values for arg
-    llama_arg(
-        const std::initializer_list<const char *> & args,
-        const char * value_hint,
-        const char * value_hint_2,
-        const std::string & help,
-        void (*handler)(gpt_params & params, const std::string &, const std::string &)
-    ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
-
-    llama_arg & set_examples(std::initializer_list<enum llama_example> examples);
-    llama_arg & set_env(const char * env);
-    llama_arg & set_sparam();
-    bool in_example(enum llama_example ex);
-    bool get_value_from_env(std::string & output);
-    bool has_value_from_env();
-    std::string to_string();
-};
-
-struct gpt_params_context {
-    enum llama_example ex = LLAMA_EXAMPLE_COMMON;
-    gpt_params & params;
-    std::vector<llama_arg> options;
-    void(*print_usage)(int, char **) = nullptr;
-    gpt_params_context(gpt_params & params) : params(params) {}
-};
-
-// parse input arguments from CLI
-// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
-bool gpt_params_parse(int argc, char ** argv, gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
-
-// function to be used by test-arg-parser
-gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
--- a/common/common.cpp
+++ b/common/common.cpp
--- a/common/common.h
+++ b/common/common.h
@@ -4,11 +4,18 @@

 #include "llama.h"

+#include "sampling.h"
+
 #define LOG_NO_FILE_LINE_FUNCTION
 #include "log.h"

+#include <cmath>
 #include <string>
 #include <vector>
+#include <random>
+#include <thread>
+#include <unordered_map>
+#include <tuple>

 #ifdef _WIN32
 #define DIRECTORY_SEPARATOR '\\'
@@ -47,6 +54,19 @@ struct llama_control_vector_load_info;
 // CPU utils
 //

+int32_t cpu_get_num_physical_cores();
+int32_t cpu_get_num_math();
+
+//
+// CLI argument parsing
+//
+
+// dimensionality reduction methods, used by cvector-generator
+enum dimre_method {
+    DIMRE_METHOD_PCA,
+    DIMRE_METHOD_MEAN,
+};
+
 struct cpu_params {
    int      n_threads                   = -1;
    bool     cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
@@ -56,94 +76,9 @@ struct cpu_params {
    uint32_t poll                        = 50;      // Polling (busywait) level (0 - no polling, 100 - mostly polling)
 };

-int32_t cpu_get_num_physical_cores();
-int32_t cpu_get_num_math();
-
-//
-// Common params
-//
-
-enum llama_example {
-    LLAMA_EXAMPLE_COMMON,
-    LLAMA_EXAMPLE_SPECULATIVE,
-    LLAMA_EXAMPLE_MAIN,
-    LLAMA_EXAMPLE_INFILL,
-    LLAMA_EXAMPLE_EMBEDDING,
-    LLAMA_EXAMPLE_PERPLEXITY,
-    LLAMA_EXAMPLE_RETRIEVAL,
-    LLAMA_EXAMPLE_PASSKEY,
-    LLAMA_EXAMPLE_IMATRIX,
-    LLAMA_EXAMPLE_BENCH,
-    LLAMA_EXAMPLE_SERVER,
-    LLAMA_EXAMPLE_CVECTOR_GENERATOR,
-    LLAMA_EXAMPLE_EXPORT_LORA,
-    LLAMA_EXAMPLE_LLAVA,
-    LLAMA_EXAMPLE_LOOKUP,
-    LLAMA_EXAMPLE_PARALLEL,
-
-    LLAMA_EXAMPLE_COUNT,
-};
-
-enum gpt_sampler_type {
-    GPT_SAMPLER_TYPE_NONE        = 0,
-    GPT_SAMPLER_TYPE_TOP_K       = 1,
-    GPT_SAMPLER_TYPE_TOP_P       = 2,
-    GPT_SAMPLER_TYPE_MIN_P       = 3,
-    GPT_SAMPLER_TYPE_TFS_Z       = 4,
-    GPT_SAMPLER_TYPE_TYPICAL_P   = 5,
-    GPT_SAMPLER_TYPE_TEMPERATURE = 6,
-};
-
-// dimensionality reduction methods, used by cvector-generator
-enum dimre_method {
-    DIMRE_METHOD_PCA,
-    DIMRE_METHOD_MEAN,
-};
-
-// sampler parameters
-struct gpt_sampler_params {
-    uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
-
-    int32_t n_prev            = 64;    // number of previous tokens to remember
-    int32_t n_probs           = 0;     // if greater than 0, output the probabilities of top n_probs tokens.
-    int32_t min_keep          = 0;     // 0 = disabled, otherwise samplers should return at least min_keep tokens
-    int32_t top_k             = 40;    // <= 0 to use vocab size
-    float   top_p             = 0.95f; // 1.0 = disabled
-    float   min_p             = 0.05f; // 0.0 = disabled
-    float   tfs_z             = 1.00f; // 1.0 = disabled
-    float   typ_p             = 1.00f; // typical_p, 1.0 = disabled
-    float   temp              = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
-    float   dynatemp_range    = 0.00f; // 0.0 = disabled
-    float   dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
-    int32_t penalty_last_n    = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    float   penalty_repeat    = 1.00f; // 1.0 = disabled
-    float   penalty_freq      = 0.00f; // 0.0 = disabled
-    float   penalty_present   = 0.00f; // 0.0 = disabled
-    int32_t mirostat          = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
-    float   mirostat_tau      = 5.00f; // target entropy
-    float   mirostat_eta      = 0.10f; // learning rate
-    bool    penalize_nl       = false; // consider newlines as a repeatable token
-    bool    ignore_eos        = false;
-    bool    no_perf           = false; // disable performance metrics
-
-    std::vector<enum gpt_sampler_type> samplers = {
-        GPT_SAMPLER_TYPE_TOP_K,
-        GPT_SAMPLER_TYPE_TFS_Z,
-        GPT_SAMPLER_TYPE_TYPICAL_P,
-        GPT_SAMPLER_TYPE_TOP_P,
-        GPT_SAMPLER_TYPE_MIN_P,
-        GPT_SAMPLER_TYPE_TEMPERATURE
-    };
-
-    std::string grammar; // optional BNF-like grammar to constrain sampling
-
-    std::vector<llama_logit_bias> logit_bias; // logit biases to apply
-
-    // print the parameters into a string
-    std::string print() const;
-};
-
 struct gpt_params {
+    uint32_t seed                 = LLAMA_DEFAULT_SEED; // RNG seed
+
    int32_t n_predict             =    -1; // new tokens to predict
    int32_t n_ctx                 =     0; // context size
    int32_t n_batch               =  2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
@@ -185,25 +120,26 @@ struct gpt_params {
    enum llama_pooling_type      pooling_type      = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
    enum llama_attention_type    attention_type    = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings

-    struct gpt_sampler_params sparams;
+    // sampling parameters
+    struct llama_sampling_params sparams;

-    std::string model                = ""; // model path                                                    // NOLINT
-    std::string model_draft          = ""; // draft model for speculative decoding                          // NOLINT
-    std::string model_alias          = "unknown"; // model alias                                            // NOLINT
-    std::string model_url            = ""; // model url to download                                         // NOLINT
-    std::string hf_token             = ""; // HF token                                                      // NOLINT
-    std::string hf_repo              = ""; // HF repo                                                       // NOLINT
-    std::string hf_file              = ""; // HF file                                                       // NOLINT
-    std::string prompt               = "";                                                                  // NOLINT
-    std::string prompt_file          = ""; // store the external prompt file name                           // NOLINT
-    std::string path_prompt_cache    = ""; // path to file for saving/loading prompt eval state             // NOLINT
-    std::string input_prefix         = ""; // string to prefix user inputs with                             // NOLINT
-    std::string input_suffix         = ""; // string to suffix user inputs with                             // NOLINT
-    std::string logdir               = ""; // directory in which to save YAML log files                     // NOLINT
-    std::string lookup_cache_static  = ""; // path of static ngram cache file for lookup decoding           // NOLINT
-    std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding          // NOLINT
-    std::string logits_file          = ""; // file for saving *all* logits                                  // NOLINT
-    std::string rpc_servers          = ""; // comma separated list of RPC servers                           // NOLINT
+    std::string model                = ""; // model path
+    std::string model_draft          = ""; // draft model for speculative decoding
+    std::string model_alias          = "unknown"; // model alias
+    std::string model_url            = ""; // model url to download
+    std::string hf_token             = ""; // HF token
+    std::string hf_repo              = ""; // HF repo
+    std::string hf_file              = ""; // HF file
+    std::string prompt               = "";
+    std::string prompt_file          = ""; // store the external prompt file name
+    std::string path_prompt_cache    = ""; // path to file for saving/loading prompt eval state
+    std::string input_prefix         = ""; // string to prefix user inputs with
+    std::string input_suffix         = ""; // string to suffix user inputs with
+    std::string logdir               = ""; // directory in which to save YAML log files
+    std::string lookup_cache_static  = ""; // path of static ngram cache file for lookup decoding
+    std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
+    std::string logits_file          = ""; // file for saving *all* logits
+    std::string rpc_servers          = ""; // comma separated list of RPC servers

    std::vector<std::string> in_files;   // all input files
    std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
@@ -247,14 +183,15 @@ struct gpt_params {
    bool simple_io         = false; // improves compatibility with subprocesses and limited consoles
    bool cont_batching     = true;  // insert new sequences for decoding on-the-fly
    bool flash_attn        = false; // flash attention
-    bool no_perf           = false; // disable performance metrics

    bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
+    bool ignore_eos        = false; // ignore generated EOS tokens
    bool logits_all        = false; // return logits for all tokens in the batch
    bool use_mmap          = true;  // use mmap for faster loads
    bool use_mlock         = false; // use mlock to keep model in memory
    bool verbose_prompt    = false; // print prompt tokens before generation
    bool display_prompt    = true;  // print prompt before generation
+    bool infill            = false; // use infill mode
    bool dump_kv_cache     = false; // dump the KV cache contents for debugging purposes
    bool no_kv_offload     = false; // disable KV offloading
    bool warmup            = true;  // warmup run
@@ -264,7 +201,7 @@ struct gpt_params {
    std::string cache_type_v = "f16"; // KV cache data type for the V

    // multimodal models (see examples/llava)
-    std::string mmproj = "";        // path to multimodal projector                                         // NOLINT
+    std::string mmproj = "";        // path to multimodal projector
    std::vector<std::string> image; // path to image file(s)

    // embedding
@@ -280,15 +217,15 @@ struct gpt_params {
    int     n_threads_http = -1;           // number of threads to process HTTP requests (TODO: support threadpool)

    std::string hostname      = "127.0.0.1";
-    std::string public_path   = "";                                                                         // NOLINT
-    std::string chat_template = "";                                                                         // NOLINT
-    std::string system_prompt = "";                                                                         // NOLINT
+    std::string public_path   = "";
+    std::string chat_template = "";
+    std::string system_prompt = "";
    bool enable_chat_template = true;

    std::vector<std::string> api_keys;

-    std::string ssl_file_key  = "";                                                                         // NOLINT
-    std::string ssl_file_cert = "";                                                                         // NOLINT
+    std::string ssl_file_key  = "";
+    std::string ssl_file_cert = "";

    bool endpoint_slots   = true;
    bool endpoint_metrics = false;
@@ -338,11 +275,16 @@ struct gpt_params {
    bool spm_infill = false; // suffix/prefix/middle pattern for infill

    std::string lora_outfile = "ggml-lora-merged-f16.gguf";
-
-    // batched-bench params
-    bool batched_bench_output_jsonl = false;
 };

+void gpt_params_parse_from_env(gpt_params & params);
+void gpt_params_handle_model_default(gpt_params & params);
+
+bool gpt_params_parse_ex   (int argc, char ** argv, gpt_params & params);
+bool gpt_params_parse      (int argc, char ** argv, gpt_params & params);
+bool gpt_params_find_arg   (int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
+void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);
+
 std::string gpt_params_get_system_info(const gpt_params & params);

 bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);
--- a/common/grammar-parser.cpp
+++ b/common/grammar-parser.cpp
@@ -0,0 +1,539 @@
+#include "grammar-parser.h"
+#include <cstdint>
+#include <cwchar>
+#include <string>
+#include <utility>
+#include <stdexcept>
+#include <exception>
+
+namespace grammar_parser {
+    // Decodes one UTF-8 sequence at src; returns (code point, pointer just past the bytes consumed).
+    // NOTE: assumes valid utf8 (but checks for overrun); copied from llama.cpp
+    static std::pair<uint32_t, const char *> decode_utf8(const char * src) {
+        static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 }; // sequence length, indexed by the top 4 bits of the first byte
+        uint8_t  first_byte = static_cast<uint8_t>(*src);
+        uint8_t  highbits   = first_byte >> 4;
+        int      len        = lookup[highbits];
+        uint8_t  mask       = (1 << (8 - len)) - 1; // keeps the low (8 - len) bits of the first byte
+        uint32_t value      = first_byte & mask;
+        const char * end    = src + len; // may overrun! (the loop below also stops at a NUL byte)
+        const char * pos    = src + 1;
+        for ( ; pos < end && *pos; pos++) {
+            value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F); // each following byte contributes its low 6 bits
+        }
+        return std::make_pair(value, pos);
+    }
+
+    static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) { // id of the symbol named [src, src + len)
+        uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
+        auto result = state.symbol_ids.emplace(std::string(src, len), next_id); // inserts only if the name is not present yet
+        return result.first->second; // the existing id, or next_id if the name was just inserted
+    }
+
+    static uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) { // registers a synthesized symbol "<base_name>_<id>" and returns its id
+        uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
+        state.symbol_ids[base_name + '_' + std::to_string(next_id)] = next_id;
+        return next_id;
+    }
+
+    static void add_rule( // stores rule at index rule_id, growing state.rules when needed
+            parse_state & state,
+            uint32_t      rule_id,
+            const std::vector<llama_grammar_element> & rule) {
+        if (state.rules.size() <= rule_id) {
+            state.rules.resize(rule_id + 1); // slots created in between stay empty until their rule is added
+        }
+        state.rules[rule_id] = rule;
+    }
+
+    static bool is_digit_char(char c) { // true for ASCII '0'..'9'
+        return '0' <= c && c <= '9';
+    }
+
+    static bool is_word_char(char c) { // true for ASCII letters, digits and '-'
+        return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || is_digit_char(c);
+    }
+
+    static std::pair<uint32_t, const char *> parse_hex(const char * src, int size) { // parses exactly `size` hex digits; returns (value, pointer past them)
+        const char * pos   = src;
+        const char * end   = src + size;
+        uint32_t     value = 0;
+        for ( ; pos < end && *pos; pos++) {
+            value <<= 4;
+            char c = *pos;
+            if ('a' <= c && c <= 'f') {
+                value += c - 'a' + 10;
+            } else if ('A' <= c && c <= 'F') {
+                value += c - 'A' + 10;
+            } else if ('0' <= c && c <= '9') {
+                value += c - '0';
+            } else {
+                break; // not a hex digit: stop early, reported below
+            }
+        }
+        if (pos != end) { // fewer than `size` hex digits were available
+            throw std::runtime_error("expecting " + std::to_string(size) + " hex chars at " + src);
+        }
+        return std::make_pair(value, pos);
+    }
+
+    static const char * parse_space(const char * src, bool newline_ok) { // skips spaces, tabs and '#' comments (and CR/LF when newline_ok); returns the first position not skipped
+        const char * pos = src;
+        while (*pos == ' ' || *pos == '\t' || *pos == '#' ||
+                (newline_ok && (*pos == '\r' || *pos == '\n'))) {
+            if (*pos == '#') {
+                while (*pos && *pos != '\r' && *pos != '\n') { // a comment runs up to, but not including, the line break
+                    pos++;
+                }
+            } else {
+                pos++;
+            }
+        }
+        return pos;
+    }
+
+    static const char * parse_name(const char * src) { // returns the end of the run of word chars starting at src
+        const char * pos = src;
+        while (is_word_char(*pos)) {
+            pos++;
+        }
+        if (pos == src) { // an empty name is an error
+            throw std::runtime_error(std::string("expecting name at ") + src);
+        }
+        return pos;
+    }
+
+    static const char * parse_int(const char * src) { // returns the end of the run of digits starting at src (the value is not converted here)
+        const char * pos = src;
+        while (is_digit_char(*pos)) {
+            pos++;
+        }
+        if (pos == src) { // at least one digit is required
+            throw std::runtime_error(std::string("expecting integer at ") + src);
+        }
+        return pos;
+    }
+
+    static std::pair<uint32_t, const char *> parse_char(const char * src) { // parses one character or escape; returns (code point, pointer past it)
+        if (*src == '\\') {
+            switch (src[1]) {
+                case 'x': return parse_hex(src + 2, 2); // \xHH
+                case 'u': return parse_hex(src + 2, 4); // \uHHHH
+                case 'U': return parse_hex(src + 2, 8); // \UHHHHHHHH
+                case 't': return std::make_pair('\t', src + 2);
+                case 'r': return std::make_pair('\r', src + 2);
+                case 'n': return std::make_pair('\n', src + 2);
+                case '\\':
+                case '"':
+                case '[':
+                case ']':
+                    return std::make_pair(src[1], src + 2); // the escaped character stands for itself
+                default:
+                    throw std::runtime_error(std::string("unknown escape at ") + src);
+            }
+        } else if (*src) {
+            return decode_utf8(src); // unescaped: one UTF-8 encoded character
+        }
+        throw std::runtime_error("unexpected end of input"); // reached only when *src is NUL
+    }
+
+    const char * parse_alternates( // forward declaration: parse_sequence and parse_alternates call each other
+            parse_state       & state,
+            const char        * src,
+            const std::string & rule_name,
+            uint32_t            rule_id,
+            bool                is_nested);
+
+    static const char * parse_sequence( // parses one sequence of items (a single alternative) into out_elements; returns where parsing stopped
+            parse_state                        & state,
+            const char                         * src,
+            const std::string                  & rule_name,
+            std::vector<llama_grammar_element> & out_elements,
+            bool                                 is_nested) {
+        size_t last_sym_start = out_elements.size(); // index in out_elements where the most recent item begins
+        const char * pos = src;
+
+        auto handle_repetitions = [&](int min_times, int max_times) { // rewrites the most recent item as a repetition; max_times < 0 means no upper bound
+
+            if (last_sym_start == out_elements.size()) {
+                throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos);
+            }
+
+            // apply transformation to previous symbol (last_sym_start to end) according to
+            // the following rewrite rules:
+            // S{m,n} --> S S S (m times) S'(n-m)
+            //            S'(x)   ::= S S'(x-1) |
+            //            (... n-m definitions of these S' rules ...)
+            //            S'(1)   ::= S |
+            // S{m,} -->  S S S (m times) S'
+            //            S'     ::= S S' |
+            // S*     --> S{0,}
+            //        --> S'     ::= S S' |
+            // S+     --> S{1,}
+            //        --> S S'
+            //            S'     ::= S S' |
+            // S?     --> S{0,1}
+            //        --> S'
+            //            S'     ::= S |
+
+            std::vector<llama_grammar_element> previous_elements(out_elements.begin() + last_sym_start, out_elements.end());
+            if (min_times == 0) {
+                out_elements.resize(last_sym_start); // drop the item: with no mandatory copy it only appears via the synthesized rule
+            } else {
+                // Repeat the previous elements (min_times - 1) times
+                for (int i = 1; i < min_times; i++) {
+                    out_elements.insert(out_elements.end(), previous_elements.begin(), previous_elements.end());
+                }
+            }
+
+            uint32_t last_rec_rule_id = 0;
+            auto n_opt = max_times < 0 ? 1 : max_times - min_times; // NOTE(review): negative when max_times < min_times, so S{5,2} is silently treated as S{5}
+
+            std::vector<llama_grammar_element> rec_rule(previous_elements);
+            for (int i = 0; i < n_opt; i++) {
+                rec_rule.resize(previous_elements.size()); // reset to just "S" before building the next S' rule
+                uint32_t rec_rule_id = generate_symbol_id(state, rule_name);
+                if (i > 0 || max_times < 0) {
+                    rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, max_times < 0 ? rec_rule_id : last_rec_rule_id}); // unbounded: refer to itself; bounded: chain to the previously built rule
+                }
+                rec_rule.push_back({LLAMA_GRETYPE_ALT, 0}); // trailing empty alternative makes the rule optional
+                rec_rule.push_back({LLAMA_GRETYPE_END, 0});
+                add_rule(state, rec_rule_id, rec_rule);
+                last_rec_rule_id = rec_rule_id;
+            }
+            if (n_opt > 0) {
+                out_elements.push_back({LLAMA_GRETYPE_RULE_REF, last_rec_rule_id});
+            }
+        };
+
+        while (*pos) {
+            if (*pos == '"') { // literal string
+                pos++;
+                last_sym_start = out_elements.size();
+                while (*pos != '"') {
+                    if (!*pos) {
+                        throw std::runtime_error("unexpected end of input");
+                    }
+                    auto char_pair = parse_char(pos);
+                         pos       = char_pair.second;
+                    out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
+                }
+                pos = parse_space(pos + 1, is_nested);
+            } else if (*pos == '[') { // char range(s)
+                pos++;
+                enum llama_gretype start_type = LLAMA_GRETYPE_CHAR;
+                if (*pos == '^') {
+                    pos++;
+                    start_type = LLAMA_GRETYPE_CHAR_NOT;
+                }
+                last_sym_start = out_elements.size();
+                while (*pos != ']') {
+                    if (!*pos) {
+                        throw std::runtime_error("unexpected end of input");
+                    }
+                    auto char_pair = parse_char(pos);
+                         pos       = char_pair.second;
+                    enum llama_gretype type = last_sym_start < out_elements.size()
+                        ? LLAMA_GRETYPE_CHAR_ALT
+                        : start_type;
+
+                    out_elements.push_back({type, char_pair.first});
+                    if (pos[0] == '-' && pos[1] != ']') { // "a-z" range; a '-' right before ']' is taken as a literal character
+                        if (!pos[1]) {
+                            throw std::runtime_error("unexpected end of input");
+                        }
+                        auto endchar_pair = parse_char(pos + 1);
+                             pos          = endchar_pair.second;
+                        out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
+                    }
+                }
+                pos = parse_space(pos + 1, is_nested);
+            } else if (is_word_char(*pos)) { // rule reference
+                const char * name_end    = parse_name(pos);
+                uint32_t     ref_rule_id = get_symbol_id(state, pos, name_end - pos);
+                pos = parse_space(name_end, is_nested);
+                last_sym_start = out_elements.size();
+                out_elements.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id});
+            } else if (*pos == '(') { // grouping
+                // parse nested alternates into synthesized rule
+                pos = parse_space(pos + 1, true);
+                uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
+                pos = parse_alternates(state, pos, rule_name, sub_rule_id, true);
+                last_sym_start = out_elements.size();
+                // output reference to synthesized rule
+                out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
+                if (*pos != ')') {
+                    throw std::runtime_error(std::string("expecting ')' at ") + pos);
+                }
+                pos = parse_space(pos + 1, is_nested);
+            } else if (*pos == '.') { // any char
+                last_sym_start = out_elements.size();
+                out_elements.push_back({LLAMA_GRETYPE_CHAR_ANY, 0});
+                pos = parse_space(pos + 1, is_nested);
+            } else if (*pos == '*') {
+                pos = parse_space(pos + 1, is_nested);
+                handle_repetitions(0, -1);
+            } else if (*pos == '+') {
+                pos = parse_space(pos + 1, is_nested);
+                handle_repetitions(1, -1);
+            } else if (*pos == '?') {
+                pos = parse_space(pos + 1, is_nested);
+                handle_repetitions(0, 1);
+            } else if (*pos == '{') {
+                pos = parse_space(pos + 1, is_nested);
+
+                if (!is_digit_char(*pos)) {
+                    throw std::runtime_error(std::string("expecting an int at ") + pos);
+                }
+                const char * int_end = parse_int(pos);
+                int min_times = std::stoi(std::string(pos, int_end - pos)); // stoi, not stoul: a count that does not fit in int throws std::out_of_range instead of wrapping
+                pos = parse_space(int_end, is_nested);
+
+                int max_times = -1;
+
+                if (*pos == '}') {
+                    max_times = min_times;
+                    pos = parse_space(pos + 1, is_nested);
+                } else if (*pos == ',') {
+                    pos = parse_space(pos + 1, is_nested);
+
+                    if (is_digit_char(*pos)) {
+                        const char * int_end = parse_int(pos);
+                        max_times = std::stoi(std::string(pos, int_end - pos)); // same as above: reject out-of-range counts instead of wrapping to a negative ("unbounded") value
+                        pos = parse_space(int_end, is_nested);
+                    }
+
+                    if (*pos != '}') {
+                        throw std::runtime_error(std::string("expecting '}' at ") + pos);
+                    }
+                    pos = parse_space(pos + 1, is_nested);
+                } else {
+                    throw std::runtime_error(std::string("expecting ',' at ") + pos);
+                }
+                handle_repetitions(min_times, max_times);
+            } else {
+                break; // anything else (e.g. '|', ')' or a line break) ends the sequence
+            }
+        }
+        return pos;
+    }
+
+    const char * parse_alternates( // parses '|'-separated sequences and stores them as rule rule_id; returns where parsing stopped
+            parse_state       & state,
+            const char        * src,
+            const std::string & rule_name,
+            uint32_t            rule_id,
+            bool                is_nested) {
+        std::vector<llama_grammar_element> rule;
+        const char * pos = parse_sequence(state, src, rule_name, rule, is_nested);
+        while (*pos == '|') {
+            rule.push_back({LLAMA_GRETYPE_ALT, 0}); // separator between alternatives
+            pos = parse_space(pos + 1, true);
+            pos = parse_sequence(state, pos, rule_name, rule, is_nested);
+        }
+        rule.push_back({LLAMA_GRETYPE_END, 0}); // every stored rule is terminated by END
+        add_rule(state, rule_id, rule);
+        return pos;
+    }
+
+    static const char * parse_rule(parse_state & state, const char * src) { // parses one "name ::= alternates" definition; returns the start of the next one
+        const char * name_end = parse_name(src);
+        const char * pos      = parse_space(name_end, false);
+        size_t       name_len = name_end - src;
+        uint32_t     rule_id  = get_symbol_id(state, src, name_len);
+        const std::string name(src, name_len);
+
+        if (!(pos[0] == ':' && pos[1] == ':' && pos[2] == '=')) {
+            throw std::runtime_error(std::string("expecting ::= at ") + pos);
+        }
+        pos = parse_space(pos + 3, true);
+
+        pos = parse_alternates(state, pos, name, rule_id, false);
+
+        if (*pos == '\r') {
+            pos += pos[1] == '\n' ? 2 : 1; // consume "\r\n" or a lone "\r"
+        } else if (*pos == '\n') {
+            pos++;
+        } else if (*pos) { // a rule must be followed by a line break or the end of input
+            throw std::runtime_error(std::string("expecting newline or end at ") + pos);
+        }
+        return parse_space(pos, true);
+    }
+
+    parse_state parse(const char * src) { // parses a whole grammar; on any error logs to stderr and returns a default (empty) parse_state
+        try {
+            parse_state state;
+            const char * pos = parse_space(src, true);
+            while (*pos) {
+                pos = parse_rule(state, pos);
+            }
+            // Validate the state to ensure that all rules are defined
+            for (const auto & rule : state.rules) {
+                if (rule.empty()) { // slot created by add_rule's resize but never filled
+                    throw std::runtime_error("Undefined rule");
+                }
+                for (const auto & elem : rule) {
+                    if (elem.type == LLAMA_GRETYPE_RULE_REF) {
+                        // Ensure that the rule at that location exists
+                        if (elem.value >= state.rules.size() || state.rules[elem.value].empty()) {
+                            // Get the name of the rule that is missing
+                            for (const auto & kv : state.symbol_ids) { // NOTE(review): if no symbol has this id, nothing is thrown here
+                                if (kv.second == elem.value) {
+                                    throw std::runtime_error("Undefined rule identifier '" + kv.first + "'");
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+            return state;
+        } catch (const std::exception & err) {
+            fprintf(stderr, "%s: error parsing grammar: %s\n", __func__, err.what());
+            return parse_state(); // callers detect failure through the empty result
+        }
+    }
+
+    static void print_grammar_char(FILE * file, uint32_t c) { // writes c to file: printable ASCII as-is, anything else as <U+XXXX>
+        if (0x20 <= c && c < 0x7f) { // 0x7f (DEL) is a control character, so it takes the escaped form below
+            fprintf(file, "%c", static_cast<char>(c));
+        } else {
+            // cop out of encoding UTF-8
+            fprintf(file, "<U+%04X>", c);
+        }
+    }
+
+    static bool is_char_element(llama_grammar_element elem) { // true for every CHAR* element type, false for END, ALT and RULE_REF
+        switch (elem.type) {
+            case LLAMA_GRETYPE_CHAR:           return true;
+            case LLAMA_GRETYPE_CHAR_NOT:       return true;
+            case LLAMA_GRETYPE_CHAR_ALT:       return true;
+            case LLAMA_GRETYPE_CHAR_RNG_UPPER: return true;
+            case LLAMA_GRETYPE_CHAR_ANY:       return true;
+            default:                           return false;
+        }
+    }
+
+    static void print_rule_binary(FILE * file, const std::vector<llama_grammar_element> & rule) { // debug dump of a rule as TYPE(value) pairs on one line; only referenced from commented-out code in print_grammar
+        for (auto elem : rule) {
+            switch (elem.type) { // first the element type name
+                case LLAMA_GRETYPE_END:            fprintf(file, "END");            break;
+                case LLAMA_GRETYPE_ALT:            fprintf(file, "ALT");            break;
+                case LLAMA_GRETYPE_RULE_REF:       fprintf(file, "RULE_REF");       break;
+                case LLAMA_GRETYPE_CHAR:           fprintf(file, "CHAR");           break;
+                case LLAMA_GRETYPE_CHAR_NOT:       fprintf(file, "CHAR_NOT");       break;
+                case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
+                case LLAMA_GRETYPE_CHAR_ALT:       fprintf(file, "CHAR_ALT");       break;
+                case LLAMA_GRETYPE_CHAR_ANY:       fprintf(file, "CHAR_ANY");       break;
+            }
+            switch (elem.type) { // then the value: numeric for structural elements, as a character for CHAR* elements
+                case LLAMA_GRETYPE_END:
+                case LLAMA_GRETYPE_ALT:
+                case LLAMA_GRETYPE_RULE_REF:
+                    fprintf(file, "(%u) ", elem.value);
+                    break;
+                case LLAMA_GRETYPE_CHAR:
+                case LLAMA_GRETYPE_CHAR_NOT:
+                case LLAMA_GRETYPE_CHAR_RNG_UPPER:
+                case LLAMA_GRETYPE_CHAR_ALT:
+                case LLAMA_GRETYPE_CHAR_ANY:
+                    fprintf(file, "(\"");
+                    print_grammar_char(file, elem.value);
+                    fprintf(file, "\") ");
+                    break;
+            }
+        }
+        fprintf(file, "\n");
+    }
+
+    static void print_rule( // writes rule rule_id to file as "name ::= ..." text; throws std::runtime_error on a malformed rule
+            FILE     * file,
+            uint32_t   rule_id,
+            const std::vector<llama_grammar_element> & rule,
+            const std::map<uint32_t, std::string>    & symbol_id_names) {
+        if (rule.empty() || rule.back().type != LLAMA_GRETYPE_END) {
+            throw std::runtime_error(
+                "malformed rule, does not end with LLAMA_GRETYPE_END: " + std::to_string(rule_id));
+        }
+        fprintf(file, "%s ::= ", symbol_id_names.at(rule_id).c_str());
+        for (size_t i = 0, end = rule.size() - 1; i < end; i++) { // the final END element is not visited
+            llama_grammar_element elem = rule[i];
+            switch (elem.type) {
+                case LLAMA_GRETYPE_END: // END anywhere but the last position is an error
+                    throw std::runtime_error(
+                        "unexpected end of rule: " + std::to_string(rule_id) + "," +
+                        std::to_string(i));
+                case LLAMA_GRETYPE_ALT:
+                    fprintf(file, "| ");
+                    break;
+                case LLAMA_GRETYPE_RULE_REF:
+                    fprintf(file, "%s ", symbol_id_names.at(elem.value).c_str());
+                    break;
+                case LLAMA_GRETYPE_CHAR:
+                    fprintf(file, "[");
+                    print_grammar_char(file, elem.value);
+                    break;
+                case LLAMA_GRETYPE_CHAR_NOT:
+                    fprintf(file, "[^");
+                    print_grammar_char(file, elem.value);
+                    break;
+                case LLAMA_GRETYPE_CHAR_RNG_UPPER:
+                    if (i == 0 || !is_char_element(rule[i - 1])) {
+                        throw std::runtime_error(
+                            "LLAMA_GRETYPE_CHAR_RNG_UPPER without preceding char: " +
+                            std::to_string(rule_id) + "," + std::to_string(i));
+                    }
+                    fprintf(file, "-");
+                    print_grammar_char(file, elem.value);
+                    break;
+                case LLAMA_GRETYPE_CHAR_ALT:
+                    if (i == 0 || !is_char_element(rule[i - 1])) {
+                        throw std::runtime_error(
+                            "LLAMA_GRETYPE_CHAR_ALT without preceding char: " +
+                            std::to_string(rule_id) + "," + std::to_string(i));
+                    }
+                    print_grammar_char(file, elem.value);
+                    break;
+                case LLAMA_GRETYPE_CHAR_ANY:
+                    fprintf(file, "."); // NOTE(review): no '[' is opened here, yet the block below may still print "] " — confirm intended
+                    break;
+            }
+            if (is_char_element(elem)) {
+                switch (rule[i + 1].type) { // i + 1 is in range: the loop stops before the final END
+                    case LLAMA_GRETYPE_CHAR_ALT:
+                    case LLAMA_GRETYPE_CHAR_RNG_UPPER:
+                    case LLAMA_GRETYPE_CHAR_ANY:
+                        break; // the next element continues the current char set: keep the bracket open
+                    default:
+                        fprintf(file, "] ");
+                }
+            }
+        }
+        fprintf(file, "\n");
+    }
+
+    void print_grammar(FILE * file, const parse_state & state) { // prints every rule of state to file; errors are reported on stderr, not rethrown
+        try {
+            std::map<uint32_t, std::string> symbol_id_names; // reverse of state.symbol_ids: rule id -> name
+            for (const auto & kv : state.symbol_ids) {
+                symbol_id_names[kv.second] = kv.first;
+            }
+            for (size_t i = 0, end = state.rules.size(); i < end; i++) {
+                // fprintf(file, "%zu: ", i);
+                // print_rule_binary(file, state.rules[i]);
+                print_rule(file, uint32_t(i), state.rules[i], symbol_id_names);
+                // fprintf(file, "\n");
+            }
+        } catch (const std::exception & err) {
+            fprintf(stderr, "\n%s: error printing grammar: %s\n", __func__, err.what());
+        }
+    }
+
+    std::vector<const llama_grammar_element *> parse_state::c_rules() { // returns one pointer per rule, in rule-id order
+        std::vector<const llama_grammar_element *> ret;
+        ret.reserve(rules.size());
+        for (const auto & rule : rules) {
+            ret.push_back(rule.data()); // points into this parse_state's own storage, so it is only usable while `rules` is alive and unmodified
+        }
+        return ret;
+    }
+}
--- a/common/grammar-parser.h
+++ b/common/grammar-parser.h
@@ -0,0 +1,29 @@
+// Implements a parser for an extended Backus-Naur form (BNF), producing the
+// binary context-free grammar format specified by llama.h. Supports character
+// ranges, grouping, and repetition operators. As an example, a grammar for
+// arithmetic might look like:
+//
+// root  ::= expr
+// expr  ::= term ([-+*/] term)*
+// term  ::= num | "(" space expr ")" space
+// num   ::= [0-9]+ space
+// space ::= [ \t\n]*
+
+#pragma once
+#include "llama.h"
+#include <vector>
+#include <map>
+#include <cstdint>
+#include <string>
+
+namespace grammar_parser {
+    struct parse_state { // result of parsing a grammar
+        std::map<std::string, uint32_t>                 symbol_ids; // symbol name -> rule id
+        std::vector<std::vector<llama_grammar_element>> rules;      // rule bodies, indexed by rule id
+
+        std::vector<const llama_grammar_element *> c_rules(); // one raw pointer per entry of `rules`
+    };
+
+    parse_state parse(const char * src); // parses grammar text into a parse_state
+    void print_grammar(FILE * file, const parse_state & state); // writes the rules of state to file as text
+}
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -1,450 +1,460 @@
+#define LLAMA_API_INTERNAL
 #include "sampling.h"
+#include <random>

-#include "common.h"
+struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) {
+    struct llama_sampling_context * result = new llama_sampling_context();

-#include <cmath>
-#include <unordered_map>
+    result->params  = params;
+    result->grammar = nullptr;

-// the ring buffer works similarly to std::deque, but with a fixed capacity
-// TODO: deduplicate with llama-impl.h
-template<typename T>
-struct ring_buffer {
-    ring_buffer(size_t cap) : capacity(cap), data(cap) {}
+    // if there is a grammar, parse it
+    if (!params.grammar.empty()) {
+        result->parsed_grammar = grammar_parser::parse(params.grammar.c_str());

-    T & front() {
-        if (sz == 0) {
-            throw std::runtime_error("ring buffer is empty");
-        }
-        return data[first];
-    }
-
-    const T & front() const {
-        if (sz == 0) {
-            throw std::runtime_error("ring buffer is empty");
-        }
-        return data[first];
-    }
-
-    T & back() {
-        if (sz == 0) {
-            throw std::runtime_error("ring buffer is empty");
-        }
-        return data[pos];
-    }
-
-    const T & back() const {
-        if (sz == 0) {
-            throw std::runtime_error("ring buffer is empty");
-        }
-        return data[pos];
-    }
-
-    void push_back(const T & value) {
-        if (sz == capacity) {
-            // advance the start when buffer is full
-            first = (first + 1) % capacity;
-        } else {
-            sz++;
-        }
-        data[pos] = value;
-        pos = (pos + 1) % capacity;
-    }
-
-    T pop_front() {
-        if (sz == 0) {
-            throw std::runtime_error("ring buffer is empty");
-        }
-        T value = data[first];
-        first = (first + 1) % capacity;
-        sz--;
-        return value;
-    }
-
-    const T & rat(size_t i) const {
-        if (i >= sz) {
-            throw std::runtime_error("ring buffer: index out of bounds");
-        }
-        return data[(first + sz - i - 1) % capacity];
-    }
-
-    std::vector<T> to_vector() const {
-        std::vector<T> result;
-        result.reserve(sz);
-        for (size_t i = 0; i < sz; i++) {
-            result.push_back(data[(first + i) % capacity]);
-        }
-        return result;
-    }
-
-    void clear() {
-        // here only reset the status of the buffer
-        sz = 0;
-        first = 0;
-        pos = 0;
-    }
-
-    bool empty() const {
-        return sz == 0;
-    }
-
-    size_t size() const {
-        return sz;
-    }
-
-    size_t capacity = 0;
-    size_t sz = 0;
-    size_t first = 0;
-    size_t pos = 0;
-    std::vector<T> data;
-};
-
-struct gpt_sampler {
-    gpt_sampler_params params;
-
-    struct llama_sampler * grmr;
-    struct llama_sampler * chain;
-
-    ring_buffer<llama_token> prev;
-
-    std::vector<llama_token_data> cur;
-
-    llama_token_data_array cur_p;
-
-    void set_logits(struct llama_context * ctx, int idx) {
-        const auto * logits = llama_get_logits_ith(ctx, idx);
-
-        const int n_vocab = llama_n_vocab(llama_get_model(ctx));
-
-        cur.resize(n_vocab);
-
-        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-            cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
+        // will be empty (default) if there are parse errors
+        if (result->parsed_grammar.rules.empty()) {
+            fprintf(stderr, "%s: failed to parse grammar\n", __func__);
+            delete result;
+            return nullptr;
        }

-        cur_p = { cur.data(), cur.size(), -1, false };
-    }
-};
+        // Ensure that there is a "root" node.
+        if (result->parsed_grammar.symbol_ids.find("root") == result->parsed_grammar.symbol_ids.end()) {
+            fprintf(stderr, "%s: grammar does not contain a 'root' symbol\n", __func__);
+            delete result;
+            return nullptr;
+        }

-std::string gpt_sampler_params::print() const {
+        std::vector<const llama_grammar_element *> grammar_rules(result->parsed_grammar.c_rules());
+
+        struct llama_grammar * grammar = llama_grammar_init(
+                grammar_rules.data(),
+                grammar_rules.size(), result->parsed_grammar.symbol_ids.at("root"));
+        if (grammar == nullptr) {
+            throw std::runtime_error("Failed to initialize llama_grammar");
+        }
+        result->grammar = grammar;
+    }
+
+    result->prev.resize(params.n_prev);
+
+    result->n_valid = 0;
+
+    llama_sampling_set_rng_seed(result, params.seed);
+
+    return result;
+}
+
+void llama_sampling_free(struct llama_sampling_context * ctx) { // releases ctx and the grammar it owns; a NULL ctx is a no-op
+    if (ctx != NULL && ctx->grammar != NULL) { // ctx can be NULL, e.g. when llama_sampling_init() failed and returned nullptr
+        llama_grammar_free(ctx->grammar);
+    }
+
+    delete ctx; // deleting a null pointer does nothing
+}
+
+void llama_sampling_reset(llama_sampling_context * ctx) { // rebuilds the grammar from parsed_grammar and clears the sampling history; throws std::runtime_error if the grammar cannot be re-created
+    if (ctx->grammar != NULL) {
+        llama_grammar_free(ctx->grammar);
+        ctx->grammar = NULL;
+    }
+
+    if (!ctx->parsed_grammar.rules.empty()) { // a grammar was parsed earlier: start again from a fresh instance of it
+        std::vector<const llama_grammar_element *> grammar_rules(ctx->parsed_grammar.c_rules());
+
+        struct llama_grammar * grammar = llama_grammar_init(
+                grammar_rules.data(),
+                grammar_rules.size(), ctx->parsed_grammar.symbol_ids.at("root"));
+        if (grammar == nullptr) {
+            throw std::runtime_error("Failed to initialize llama_grammar");
+        }
+        ctx->grammar = grammar;
+    }
+
+    std::fill(ctx->prev.begin(), ctx->prev.end(), 0); // keeps the size of prev, zeroes every entry
+    ctx->cur.clear();
+    ctx->n_valid = 0;
+}
+
+void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) { // seeds ctx->rng
+    if (seed == LLAMA_DEFAULT_SEED) { // the default seed value requests a fresh seed from std::random_device
+        seed = std::random_device{}();
+    }
+    ctx->rng.seed(seed);
+}
+
+void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst) { // copies the grammar and prev from src to dst; no other field of dst is touched here
+    if (dst->grammar) { // release the grammar dst currently holds
+        llama_grammar_free(dst->grammar);
+        dst->grammar = nullptr;
+    }
+
+    if (src->grammar) {
+        dst->grammar = llama_grammar_copy(src->grammar);
+    }
+
+    dst->prev = src->prev;
+}
+
+llama_token llama_sampling_last(llama_sampling_context * ctx) { // returns the last entry of ctx->prev
+    return ctx->prev.back(); // NOTE(review): presumably invalid when prev is empty (n_prev == 0) — confirm callers
+}
+
+std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n) { // concatenates the text pieces of the last n entries of prev
+    const int size = ctx_sampling->prev.size();
+
+    n = std::min(n, size); // never read more entries than prev holds
+
+    std::string result;
+
+    for (int i = size - n; i < size; i++) {
+        result += llama_token_to_piece(ctx_main, ctx_sampling->prev[i]);
+    }
+
+    return result;
+}
+
+std::string llama_sampling_print(const llama_sampling_params & params) {
    char result[1024];

    snprintf(result, sizeof(result),
            "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
            "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
            "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
-            penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
-            top_k, tfs_z, top_p, min_p, typ_p, temp,
-            mirostat, mirostat_eta, mirostat_tau);
+            params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present,
+            params.top_k, params.tfs_z, params.top_p, params.min_p, params.typical_p, params.temp,
+            params.mirostat, params.mirostat_eta, params.mirostat_tau);

    return std::string(result);
 }

-struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) {
-    llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
-
-    lparams.no_perf = params.no_perf;
-
-    auto * result = new gpt_sampler {
-        /* .params = */ params,
-        /* .grmr   = */ llama_sampler_init_grammar(model, params.grammar.c_str(), "root"),
-        /* .chain  = */ llama_sampler_chain_init(lparams),
-        /* .prev   = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
-        /* .cur    = */ {},
-        /* .cur_p  = */ {},
-    };
-
-    llama_sampler_chain_add(result->chain,
-            llama_sampler_init_logit_bias(
-                llama_n_vocab(model),
-                params.logit_bias.size(),
-                params.logit_bias.data()));
-
-    llama_sampler_chain_add(result->chain,
-            llama_sampler_init_penalties(
-                llama_n_vocab  (model),
-                llama_token_eos(model),
-                llama_token_nl (model),
-                params.penalty_last_n,
-                params.penalty_repeat,
-                params.penalty_freq,
-                params.penalty_present,
-                params.penalize_nl,
-                params.ignore_eos));
-
-    if (params.temp > 0.0f) {
-        if (params.mirostat == 0) {
-            for (const auto & cnstr : params.samplers) {
-                switch (cnstr) {
-                    case GPT_SAMPLER_TYPE_TOP_K:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_k    (params.top_k));
-                        break;
-                    case GPT_SAMPLER_TYPE_TOP_P:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_p    (params.top_p, params.min_keep));
-                        break;
-                    case GPT_SAMPLER_TYPE_MIN_P:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_min_p    (params.min_p, params.min_keep));
-                        break;
-                    case GPT_SAMPLER_TYPE_TFS_Z:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep));
-                        break;
-                    case GPT_SAMPLER_TYPE_TYPICAL_P:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_typical  (params.typ_p, params.min_keep));
-                        break;
-                    case GPT_SAMPLER_TYPE_TEMPERATURE:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
-                        break;
-                    default:
-                        GGML_ASSERT(false && "unknown sampler type");
-                }
+std::string llama_sampling_order_print(const llama_sampling_params & params) {
+    std::string result = "CFG -> Penalties "; // fixed prefix, emitted regardless of the parameters
+    if (params.mirostat == 0) {
+        for (auto sampler_type : params.samplers_sequence) {
+            const auto sampler_type_name = llama_sampling_type_to_str(sampler_type);
+            if (!sampler_type_name.empty()) { // an empty name means the type is unknown to llama_sampling_type_to_str
+                result += "-> " + sampler_type_name + " ";
            }
-            llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
-            llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
-        } else if (params.mirostat == 1) {
-            llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
-            llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
-        } else if (params.mirostat == 2) {
-            llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
-            llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
-        } else {
-            GGML_ASSERT(false && "unknown mirostat version");
        }
    } else {
-        llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
-        llama_sampler_chain_add(result->chain, llama_sampler_init_greedy());
+        result += "-> mirostat "; // any non-zero mirostat mode is printed as one stage in place of samplers_sequence
    }

    return result;
 }

-void gpt_sampler_free(struct gpt_sampler * gsmpl) {
-    if (gsmpl) {
-        llama_sampler_free(gsmpl->grmr);
-
-        llama_sampler_free(gsmpl->chain);
-
-        delete gsmpl;
-    }
-}
-
-void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool accept_grammar) {
-    if (accept_grammar) {
-        llama_sampler_accept(gsmpl->grmr, token);
-    }
-
-    llama_sampler_accept(gsmpl->chain, token);
-
-    gsmpl->prev.push_back(token);
-}
-
-void gpt_sampler_reset(struct gpt_sampler * gsmpl) {
-    llama_sampler_reset(gsmpl->grmr);
-
-    llama_sampler_reset(gsmpl->chain);
-}
-
-struct gpt_sampler * gpt_sampler_clone(gpt_sampler * gsmpl) {
-    return new gpt_sampler {
-        /* .params = */ gsmpl->params,
-        /* .grmr   = */ llama_sampler_clone(gsmpl->grmr),
-        /* .chain  = */ llama_sampler_clone(gsmpl->chain),
-        /* .prev   = */ gsmpl->prev,
-        /* .cur    = */ gsmpl->cur,
-        /* .cur_p  = */ gsmpl->cur_p,
-    };
-}
-
-void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl) {
-    // TODO: measure grammar performance
-
-    if (gsmpl) {
-        llama_perf_sampler_print(gsmpl->chain);
-    }
-    if (ctx) {
-        llama_perf_context_print(ctx);
-    }
-}
-
-llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
-    gsmpl->set_logits(ctx, idx);
-
-    auto & grmr  = gsmpl->grmr;
-    auto & chain = gsmpl->chain;
-    auto & cur_p = gsmpl->cur_p; // initialized by set_logits
-
-    if (grammar_first) {
-        llama_sampler_apply(grmr, &cur_p);
-    }
-
-    llama_sampler_apply(chain, &cur_p);
-
-    GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");
-
-    const llama_token id = cur_p.data[cur_p.selected].id;
-
-    if (grammar_first) {
-        return id;
-    }
-
-    // check if it the sampled token fits the grammar
-    {
-        llama_token_data       single_token_data       = { id, 1.0f, 0.0f };
-        llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };
-
-        llama_sampler_apply(grmr, &single_token_data_array);
-
-        const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
-        if (is_valid) {
-            return id;
-        }
-    }
-
-    // resampling:
-    // if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
-    gsmpl->set_logits(ctx, idx);
-
-    llama_sampler_apply(grmr,  &cur_p);
-    llama_sampler_apply(chain, &cur_p);
-
-    GGML_ASSERT(cur_p.selected != -1 && "no selected token during re-sampling - check your sampling configuration");
-
-    return cur_p.data[cur_p.selected].id;
-}
-
-uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl) {
-    return llama_sampler_get_seed(gsmpl->chain);
-}
-
-// helpers
-
-llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl) {
-    return &gsmpl->cur_p;
-}
-
-llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl) {
-    return gsmpl->prev.rat(0);
-}
-
-std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) {
-    std::string result = "\tlogits ";
-
-    for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
-        const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
-        result += std::string("-> ") + llama_sampler_name(smpl) + " ";
-    }
-
-    return result;
-}
-
-std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main, int n) {
-    n = std::min(n, (int) gsmpl->prev.size());
-
-    if (n <= 0) {
-        return "";
-    }
-
-    std::string result;
-    result.reserve(8*n); // 8 is the average length of a token [citation needed], TODO: compute this from the vocab
-
-    for (int i = n - 1; i >= 0; i--) {
-        const llama_token id = gsmpl->prev.rat(i);
-
-        GGML_ASSERT(id != LLAMA_TOKEN_NULL && "null token in the sampling history - should not happen");
-
-        result += llama_token_to_piece(ctx_main, id);
-    }
-
-    return result;
-}
-
-char gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr) {
-    switch (cnstr) {
-        case GPT_SAMPLER_TYPE_TOP_K:       return 'k';
-        case GPT_SAMPLER_TYPE_TFS_Z:       return 'f';
-        case GPT_SAMPLER_TYPE_TYPICAL_P:   return 'y';
-        case GPT_SAMPLER_TYPE_TOP_P:       return 'p';
-        case GPT_SAMPLER_TYPE_MIN_P:       return 'm';
-        case GPT_SAMPLER_TYPE_TEMPERATURE: return 't';
-        default : return '?';
-    }
-}
-
-std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr) {
-    switch (cnstr) {
-        case GPT_SAMPLER_TYPE_TOP_K:       return "top_k";
-        case GPT_SAMPLER_TYPE_TFS_Z:       return "tfs_z";
-        case GPT_SAMPLER_TYPE_TYPICAL_P:   return "typ_p";
-        case GPT_SAMPLER_TYPE_TOP_P:       return "top_p";
-        case GPT_SAMPLER_TYPE_MIN_P:       return "min_p";
-        case GPT_SAMPLER_TYPE_TEMPERATURE: return "temperature";
+std::string llama_sampling_type_to_str(llama_sampler_type sampler_type) { // canonical name of a sampler type; "" for any value not listed below
+    switch (sampler_type) {
+        case llama_sampler_type::TOP_K:       return "top_k";
+        case llama_sampler_type::TFS_Z:       return "tfs_z";
+        case llama_sampler_type::TYPICAL_P:   return "typical_p";
+        case llama_sampler_type::TOP_P:       return "top_p";
+        case llama_sampler_type::MIN_P:       return "min_p";
+        case llama_sampler_type::TEMPERATURE: return "temperature";
        default : return "";
    }
 }

-std::vector<gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
-    std::unordered_map<std::string, gpt_sampler_type> sampler_canonical_name_map {
-        { "top_k",       GPT_SAMPLER_TYPE_TOP_K },
-        { "top_p",       GPT_SAMPLER_TYPE_TOP_P },
-        { "typ_p",       GPT_SAMPLER_TYPE_TYPICAL_P },
-        { "min_p",       GPT_SAMPLER_TYPE_MIN_P },
-        { "tfs_z",       GPT_SAMPLER_TYPE_TFS_Z },
-        { "temperature", GPT_SAMPLER_TYPE_TEMPERATURE },
+std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
+    const std::unordered_map<std::string, llama_sampler_type> sampler_canonical_name_map {
+        {"top_k",       llama_sampler_type::TOP_K},
+        {"top_p",       llama_sampler_type::TOP_P},
+        {"typical_p",   llama_sampler_type::TYPICAL_P},
+        {"min_p",       llama_sampler_type::MIN_P},
+        {"tfs_z",       llama_sampler_type::TFS_Z},
+        {"temperature", llama_sampler_type::TEMPERATURE}
    };

    // since samplers names are written multiple ways
    // make it ready for both system names and input names
-    std::unordered_map<std::string, gpt_sampler_type> sampler_alt_name_map {
-        { "top-k",       GPT_SAMPLER_TYPE_TOP_K },
-        { "top-p",       GPT_SAMPLER_TYPE_TOP_P },
-        { "nucleus",     GPT_SAMPLER_TYPE_TOP_P },
-        { "typical-p",   GPT_SAMPLER_TYPE_TYPICAL_P },
-        { "typical",     GPT_SAMPLER_TYPE_TYPICAL_P },
-        { "typ-p",       GPT_SAMPLER_TYPE_TYPICAL_P },
-        { "typ",         GPT_SAMPLER_TYPE_TYPICAL_P },
-        { "min-p",       GPT_SAMPLER_TYPE_MIN_P },
-        { "tfs-z",       GPT_SAMPLER_TYPE_TFS_Z },
-        { "tfs",         GPT_SAMPLER_TYPE_TFS_Z },
-        { "temp",        GPT_SAMPLER_TYPE_TEMPERATURE },
+    const std::unordered_map<std::string, llama_sampler_type> sampler_alt_name_map {
+        {"top-k",       llama_sampler_type::TOP_K},
+        {"top-p",       llama_sampler_type::TOP_P},
+        {"nucleus",     llama_sampler_type::TOP_P},
+        {"typical-p",   llama_sampler_type::TYPICAL_P},
+        {"typical",     llama_sampler_type::TYPICAL_P},
+        {"min-p",       llama_sampler_type::MIN_P},
+        {"tfs-z",       llama_sampler_type::TFS_Z},
+        {"tfs",         llama_sampler_type::TFS_Z},
+        {"temp",        llama_sampler_type::TEMPERATURE}
    };

-    std::vector<gpt_sampler_type> samplers;
-    samplers.reserve(names.size());
+    // resolve every requested name in order; names that match neither table are dropped
+    std::vector<llama_sampler_type> resolved;
+    resolved.reserve(names.size());
+
+    for (const auto & name : names) {
+        // the canonical spelling always wins
+        const auto canonical = sampler_canonical_name_map.find(name);
+        if (canonical != sampler_canonical_name_map.end()) {
+            resolved.push_back(canonical->second);
+            continue;
+        }
+
+        // alternative spellings are consulted only on request
+        if (!allow_alt_names) {
+            continue;
+        }
+        const auto alias = sampler_alt_name_map.find(name);
+        if (alias != sampler_alt_name_map.end()) {
+            resolved.push_back(alias->second);
+        }
+    }
+    return resolved;
+}

-    for (const auto & name : names) {
-        auto sampler = sampler_canonical_name_map.find(name);
-        if (sampler != sampler_canonical_name_map.end()) {
-            samplers.push_back(sampler->second);
+std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string) {
+    // one-letter code -> sampler type; characters without an entry are skipped
+    const std::unordered_map<char, llama_sampler_type> code_to_type {
+        {'k', llama_sampler_type::TOP_K},
+        {'p', llama_sampler_type::TOP_P},
+        {'y', llama_sampler_type::TYPICAL_P},
+        {'m', llama_sampler_type::MIN_P},
+        {'f', llama_sampler_type::TFS_Z},
+        {'t', llama_sampler_type::TEMPERATURE}
+    };
+
+    std::vector<llama_sampler_type> parsed;
+    parsed.reserve(names_string.size());
+    for (const char code : names_string) {
+        const auto hit = code_to_type.find(code);
+        if (hit == code_to_type.end()) { continue; }
+        parsed.push_back(hit->second);
+    }
+    return parsed;
+}
+
+// internal helper, intentionally not declared in the header: applies params.samplers_sequence to cur_p in order
+static void sampler_queue(
+                   struct llama_context * ctx_main,
+            const llama_sampling_params & params,
+                 llama_token_data_array & cur_p,
+                                 size_t   min_keep) {
+    for (const llama_sampler_type stage : params.samplers_sequence) {
+        switch (stage) {
+            case llama_sampler_type::TOP_K:
+                llama_sample_top_k(ctx_main, &cur_p, params.top_k, min_keep);
+                break;
+            case llama_sampler_type::TFS_Z:
+                llama_sample_tail_free(ctx_main, &cur_p, params.tfs_z, min_keep);
+                break;
+            case llama_sampler_type::TYPICAL_P:
+                llama_sample_typical(ctx_main, &cur_p, params.typical_p, min_keep);
+                break;
+            case llama_sampler_type::TOP_P:
+                llama_sample_top_p(ctx_main, &cur_p, params.top_p, min_keep);
+                break;
+            case llama_sampler_type::MIN_P:
+                llama_sample_min_p(ctx_main, &cur_p, params.min_p, min_keep);
+                break;
+            case llama_sampler_type::TEMPERATURE:
+                if (params.dynatemp_range > 0) {
+                    const float t_lo = std::max(0.0f, params.temp - params.dynatemp_range);
+                    const float t_hi = std::max(0.0f, params.temp + params.dynatemp_range);
+                    llama_sample_entropy(ctx_main, &cur_p, t_lo, t_hi, params.dynatemp_exponent);
+                } else {
+                    llama_sample_temp(ctx_main, &cur_p, params.temp);
+                }
+                break;
+            default: break; // unknown stages are ignored
+        }
+    }
+}
+
+static llama_token llama_sampling_sample_impl(
+                  struct llama_sampling_context * ctx_sampling,
+                  struct llama_context * ctx_main,
+                  struct llama_context * ctx_cfg,
+                  const int idx,
+                  bool is_resampling) { // is_resampling is true only on the recursive retry issued below after a grammar rejection
+    const llama_sampling_params & params = ctx_sampling->params;
+
+    const float   temp            = params.temp;
+    const int     mirostat        = params.mirostat;
+    const float   mirostat_tau    = params.mirostat_tau;
+    const float   mirostat_eta    = params.mirostat_eta;
+
+    std::vector<float> original_logits;
+    auto cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx, /* apply_grammar= */ is_resampling, &original_logits);
+    if (ctx_sampling->grammar != NULL && !is_resampling) {
+        GGML_ASSERT(!original_logits.empty());
+    }
+    llama_token id = 0;
+
+    if (temp < 0.0) {
+        // greedy sampling, with probs
+        llama_sample_softmax(ctx_main, &cur_p);
+        id = cur_p.data[0].id; // NOTE(review): relies on llama_sample_softmax leaving the most probable token at index 0 - confirm
+    } else if (temp == 0.0) {
+        // greedy sampling, no probs
+        id = llama_sample_token_greedy(ctx_main, &cur_p);
+    } else {
+        if (mirostat == 1) {
+            const int mirostat_m = 100;
+            llama_sample_temp(ctx_main, &cur_p, temp);
+            id = llama_sample_token_mirostat(ctx_main, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &ctx_sampling->mirostat_mu);
+        } else if (mirostat == 2) {
+            llama_sample_temp(ctx_main, &cur_p, temp);
+            id = llama_sample_token_mirostat_v2(ctx_main, &cur_p, mirostat_tau, mirostat_eta, &ctx_sampling->mirostat_mu);
        } else {
-            if (allow_alt_names) {
-                sampler = sampler_alt_name_map.find(name);
-                if (sampler != sampler_alt_name_map.end()) {
-                    samplers.push_back(sampler->second);
+            // temperature sampling
+            size_t min_keep = std::max(1, params.min_keep); // the min_keep handed to the samplers is at least 1
+
+            sampler_queue(ctx_main, params, cur_p, min_keep);
+
+            id = llama_sample_token_with_rng(ctx_main, &cur_p, ctx_sampling->rng);
+
+            //{
+            //    const int n_top = 10;
+            //    LOG("top %d candidates:\n", n_top);
+
+            //    for (int i = 0; i < n_top; i++) {
+            //        const llama_token id = cur_p.data[i].id;
+            //        (void)id; // To avoid a warning that id is unused when logging is disabled.
+            //        LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx_main, id).c_str(), cur_p.data[i].p);
+            //    }
+            //}
+
+            //LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx_main, id).c_str());
+        }
+    }
+
+    if (ctx_sampling->grammar != NULL && !is_resampling) {
+        // Get a pointer to the logits
+        float * logits = llama_get_logits_ith(ctx_main, idx);
+
+        // Create an array with a single token data element for the sampled id
+        llama_token_data single_token_data = {id, logits[id], 0.0f};
+        llama_token_data_array single_token_data_array = { &single_token_data, 1, false };
+
+        // Apply grammar constraints to the single token
+        llama_grammar_sample(ctx_sampling->grammar, ctx_main, &single_token_data_array);
+
+        // Check if the token is valid according to the grammar by seeing if its logit has been set to -INFINITY
+        bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
+
+        // If the token is not valid according to the grammar, perform resampling
+        if (!is_valid) {
+            LOG("Resampling because token %d: '%s' does not meet grammar rules\n", id, llama_token_to_piece(ctx_main, id).c_str());
+
+            // Restore logits from the copy
+            std::copy(original_logits.begin(), original_logits.end(), logits);
+
+            return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ true); // second pass applies the grammar to all candidates first
+        }
+    }
+
+    ctx_sampling->n_valid = temp == 0.0f ? 0 : cur_p.size; // 0 on the temp == 0 greedy path, otherwise the size of the remaining candidate array
+
+    return id;
+}
+
+static llama_token_data_array llama_sampling_prepare_impl(
+                  struct llama_sampling_context * ctx_sampling,
+                  struct llama_context * ctx_main,
+                  struct llama_context * ctx_cfg,
+                  const int idx,
+                  bool apply_grammar,
+                  std::vector<float> * original_logits) {
+    // Builds the candidate array for position idx: logit bias -> CFG guidance -> penalties -> optional grammar filter.
+    const llama_sampling_params & params = ctx_sampling->params;
+    const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
+
+    // a negative penalty_last_n selects the whole remembered history (n_prev)
+    const int32_t penalty_last_n  = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
+    const float   penalty_repeat  = params.penalty_repeat;
+    const float   penalty_freq    = params.penalty_freq;
+    const float   penalty_present = params.penalty_present;
+    const bool    penalize_nl     = params.penalize_nl;
+
+    auto & prev = ctx_sampling->prev;
+    auto & cur  = ctx_sampling->cur;
+
+    // raw logits of the requested position; bias and guidance below modify them in place
+    float * logits = llama_get_logits_ith(ctx_main, idx);
+
+    if (ctx_sampling->grammar != NULL && !apply_grammar) {
+        GGML_ASSERT(original_logits != NULL);
+        // snapshot the untouched logits so the caller can restore them before a grammar-constrained resample
+        *original_logits = {logits, logits + n_vocab};
+    }
+
+    // apply params.logit_bias map; ids outside [0, n_vocab) are skipped instead of being written out of bounds
+    for (const auto & bias : params.logit_bias) {
+        if (bias.first >= 0 && bias.first < n_vocab) { logits[bias.first] += bias.second; }
+    }
+
+    if (ctx_cfg) {
+        float * logits_guidance = llama_get_logits_ith(ctx_cfg, idx);
+        llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);
+    }
+
+    cur.resize(n_vocab);
+
+    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+        cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
+    }
+
+    llama_token_data_array cur_p = { cur.data(), cur.size(), false };
+
+    // apply penalties over the tail of either the prompt-supplied tokens or the sampling history
+    const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev;
+    const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n);
+    if (penalty_tokens_used_size > 0) {
+        const llama_token nl_token = llama_token_nl(llama_get_model(ctx_main));
+        const float nl_logit = (nl_token >= 0 && nl_token < n_vocab) ? logits[nl_token] : 0.0f; // guarded in case the model reports no usable newline token
+        llama_sample_repetition_penalties(ctx_main, &cur_p,
+                penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size,
+                penalty_tokens_used_size, penalty_repeat, penalty_freq, penalty_present);
+
+        if (!penalize_nl) {
+            for (size_t i = 0; i < cur_p.size; i++) {
+                if (cur_p.data[i].id == nl_token) {
+                    cur_p.data[i].logit = nl_logit;
+                    break;
                }
            }
        }
    }

-    return samplers;
-}
-
-std::vector<gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars) {
-    std::unordered_map<char, gpt_sampler_type> sampler_name_map = {
-        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_K),       GPT_SAMPLER_TYPE_TOP_K },
-        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TFS_Z),       GPT_SAMPLER_TYPE_TFS_Z },
-        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TYPICAL_P),   GPT_SAMPLER_TYPE_TYPICAL_P },
-        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_P),       GPT_SAMPLER_TYPE_TOP_P },
-        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_MIN_P),       GPT_SAMPLER_TYPE_MIN_P },
-        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TEMPERATURE), GPT_SAMPLER_TYPE_TEMPERATURE }
-    };
-
-    std::vector<gpt_sampler_type> samplers;
-    samplers.reserve(chars.size());
-
-    for (const auto & c : chars) {
-        const auto sampler = sampler_name_map.find(c);
-        if (sampler != sampler_name_map.end()) {
-            samplers.push_back(sampler->second);
-        }
+    // apply grammar checks before sampling logic
+    if (apply_grammar && ctx_sampling->grammar != NULL) {
+        llama_grammar_sample(ctx_sampling->grammar, ctx_main, &cur_p);
    }

-    return samplers;
+    return cur_p;
+}
+
+llama_token llama_sampling_sample(
+                  struct llama_sampling_context * ctx_sampling,
+                  struct llama_context * ctx_main,
+                  struct llama_context * ctx_cfg,
+                  const int idx) {
+    // Public entry point: start with is_resampling = false; the impl re-runs itself with true if the grammar rejects the sampled token
+    return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ false);
+}
+
+llama_token_data_array llama_sampling_prepare(
+                  struct llama_sampling_context * ctx_sampling,
+                  struct llama_context * ctx_main,
+                  struct llama_context * ctx_cfg,
+                  const int idx,
+                  bool apply_grammar,
+                  std::vector<float> * original_logits) {
+    return llama_sampling_prepare_impl(ctx_sampling,ctx_main, ctx_cfg, idx, apply_grammar, original_logits); // public wrapper: forwards every argument unchanged
+}
+
+void llama_sampling_accept(
+        struct llama_sampling_context * ctx_sampling,
+        struct llama_context * ctx_main,
+        llama_token id,
+        bool apply_grammar) {
+    // slide the history window by one; skip the erase when prev is empty, since erasing begin() == end() is undefined
+    if (!ctx_sampling->prev.empty()) { ctx_sampling->prev.erase(ctx_sampling->prev.begin()); }
+    ctx_sampling->prev.push_back(id);
+    if (ctx_sampling->grammar != NULL && apply_grammar) {
+        llama_grammar_accept_token(ctx_sampling->grammar, ctx_main, id);
+    }
 }
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -2,82 +2,159 @@

 #include "llama.h"

-#include "common.h"
+#include "grammar-parser.h"

+#include <random>
 #include <string>
+#include <unordered_map>
 #include <vector>

-// gpt_sampler extends llama_sampler with additional functionality:
+// sampler types; each enumerator's value is the one-letter code that llama_sampling_types_from_chars maps to it
+enum class llama_sampler_type : char {
+    TOP_K       = 'k',
+    TOP_P       = 'p',
+    MIN_P       = 'm',
+    TFS_Z       = 'f',
+    TYPICAL_P   = 'y',
+    TEMPERATURE = 't'
+};
+
+// sampling parameters
+typedef struct llama_sampling_params {
+    int32_t     n_prev                = 64;                 // number of previous tokens to remember
+    int32_t     n_probs               = 0;                  // if greater than 0, output the probabilities of top n_probs tokens.
+    int32_t     min_keep              = 0;                  // 0 = disabled, otherwise samplers should return at least min_keep tokens
+    int32_t     top_k                 = 40;                 // <= 0 to use vocab size
+    float       top_p                 = 0.95f;              // 1.0 = disabled
+    float       min_p                 = 0.05f;              // 0.0 = disabled
+    float       tfs_z                 = 1.00f;              // 1.0 = disabled
+    float       typical_p             = 1.00f;              // 1.0 = disabled
+    float       temp                  = 0.80f;              // <= 0.0 to sample greedily, 0.0 to not output probabilities
+    float       dynatemp_range        = 0.00f;              // 0.0 = disabled
+    float       dynatemp_exponent     = 1.00f;              // controls how entropy maps to temperature in dynamic temperature sampler
+    int32_t     penalty_last_n        = 64;                 // last n tokens to penalize (0 = disable penalty, -1 = context size)
+    float       penalty_repeat        = 1.00f;              // 1.0 = disabled
+    float       penalty_freq          = 0.00f;              // 0.0 = disabled
+    float       penalty_present       = 0.00f;              // 0.0 = disabled
+    int32_t     mirostat              = 0;                  // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    float       mirostat_tau          = 5.00f;              // target entropy
+    float       mirostat_eta          = 0.10f;              // learning rate
+    bool        penalize_nl           = false;              // consider newlines as a repeatable token
+    uint32_t    seed                  = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampling_context
+
+    std::vector<llama_sampler_type> samplers_sequence = { // order in which the samplers are applied on the plain (non-mirostat) temperature path
+        llama_sampler_type::TOP_K,
+        llama_sampler_type::TFS_Z,
+        llama_sampler_type::TYPICAL_P,
+        llama_sampler_type::TOP_P,
+        llama_sampler_type::MIN_P,
+        llama_sampler_type::TEMPERATURE
+    };
+
+    std::string grammar;  // optional BNF-like grammar to constrain sampling
+
+    // Classifier-Free Guidance
+    // https://arxiv.org/abs/2306.17806
+    std::string cfg_negative_prompt; // string to help guidance
+    float       cfg_scale     = 1.f; // how strong is guidance
+
+    std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
+
+    std::vector<llama_token> penalty_prompt_tokens;             // tokens the penalties are computed over when use_penalty_prompt_tokens is set
+    bool                     use_penalty_prompt_tokens = false; // false = penalize over the sampling history (prev) instead
+} llama_sampling_params;
+
+// general sampler context
+// TODO: move to llama.h
+struct llama_sampling_context {
+    // parameters that will be used for sampling
+    llama_sampling_params params;
+
+    // mirostat sampler state
+    float mirostat_mu;
+
+    llama_grammar * grammar; // NULL when no grammar constraint is active
+
+    // internal
+    grammar_parser::parse_state parsed_grammar;
+
+    // TODO: replace with ring-buffer
+    std::vector<llama_token>      prev; // history of accepted tokens, newest last
+    std::vector<llama_token_data> cur;  // candidate buffer, refilled from the logits by llama_sampling_prepare
+    size_t n_valid; // Number of correct top tokens with correct probabilities.
+
+    std::mt19937 rng; // random source handed to llama_sample_token_with_rng
+};
+
+#include "common.h"
+
+// Create a new sampling context instance.
+struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params);
+
+void llama_sampling_free(struct llama_sampling_context * ctx);
+
+// Reset the sampler context
+// - clear prev tokens
+// - reset grammar
+void llama_sampling_reset(llama_sampling_context * ctx);
+
+// Set the sampler seed
+void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed);
+
+// Copy the sampler context
+void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst);
+
+// Get the last sampled token
+llama_token llama_sampling_last(llama_sampling_context * ctx);
+
+// Get a string representation of the last sampled tokens
+std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n);
+
+// Print sampling parameters into a string
+std::string llama_sampling_print(const llama_sampling_params & params);
+
+// Print sampling order into a string
+std::string llama_sampling_order_print(const llama_sampling_params & params);
+
+std::string llama_sampling_type_to_str(llama_sampler_type sampler_type);
+
+std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
+std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string);
+
+// this is a common sampling function used across the examples for convenience
+// it can serve as a starting point for implementing your own sampling function
+// Note: When using multiple sequences, it is the caller's responsibility to call
+//       llama_sampling_reset when a sequence ends
 //
-//  - grammar support
-//  - custom sampler logic based on the parameters
-//  - history of the last accepted tokens
-//  - performance metrics
+// required:
+//  - ctx_main:     context to use for sampling
+//  - ctx_sampling: sampling-specific context
 //
-// This goal is to have a common implementation of the sampling logic shared across the examples.
-// For example, depending on the temperature, the sampling chain can be very simple (greedy) or more
-// complex (top-k, top-p, etc).
+// optional:
+//  - ctx_cfg:      context to use for classifier-free guidance
+//  - idx:          index of the logits to sample from, i.e. llama_get_logits_ith(ctx_main, idx)
 //
-// Another example is related to the grammar. In general, the grammar constraints applied on the full
-// vocabulary can be very taxing. To improve performance, the grammar can be applied only to the sampled
-// token in order to verify if it fits the grammar. And only if the token doesn't fit the grammar, the
-// grammar constraints are applied to the full vocabulary and the token is resampled.
-//
-// The gpt_sampler also maintains a container with the last accepted tokens. In the future, this can
-// be moved into the core llama library.
-//
-// For convenience, the gpt_sampler also maintains a container with the current candidate tokens.
-// This can be used to access the probabilities of the rest of the non-sampled tokens.
-//
-// TODO: measure grammar performance
+// returns:
+//  - token:      sampled token
+//  - candidates: not returned directly; the candidate tokens are left in ctx_sampling->cur
 //
+llama_token llama_sampling_sample(
+        struct llama_sampling_context * ctx_sampling,
+        struct llama_context * ctx_main,
+        struct llama_context * ctx_cfg,
+        int idx = -1);

-struct gpt_sampler;
+// Prepares and adjusts the set of token candidates for sampling based on penalties, biases, and sampling parameters.
+llama_token_data_array llama_sampling_prepare(
+        struct llama_sampling_context * ctx_sampling,
+        struct llama_context * ctx_main,
+        struct llama_context * ctx_cfg,
+        int idx = 0,
+        bool apply_grammar = true,
+        std::vector<float> * original_logits = nullptr);

-// llama_sampler API overloads
-
-struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params);
-
-void gpt_sampler_free(struct gpt_sampler * gsmpl);
-
-// if accept_grammar is true, the token is accepted both by the sampling chain and the grammar
-void                 gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool accept_grammar);
-void                 gpt_sampler_reset (struct gpt_sampler * gsmpl);
-struct gpt_sampler * gpt_sampler_clone (struct gpt_sampler * gsmpl);
-
-// arguments can be nullptr to skip printing
-void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl);
-
-// extended sampling implementation:
-//
-// - set logits
-// - apply the configured sampler chain
-// - check if the token fits the grammar (if any)
-// - if not: resample by first applying the grammar constraints and then sampling again (slower path)
-//
-// if grammar_first is true, the grammar is applied before the samplers (slower)
-// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
-//
-llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
-
-uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl);
-
-// helpers
-
-// access the internal list of current candidate tokens
-llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl);
-
-// get the last accepted token
-llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl);
-
-// print the sampler chain into a string
-std::string gpt_sampler_print(const struct gpt_sampler * gsmpl);
-
-// get a string representation of the last accepted tokens
-std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx, int n);
-
-char        gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr);
-std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr);
-
-std::vector<enum gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
-std::vector<enum gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars);
+void llama_sampling_accept(
+        struct llama_sampling_context * ctx_sampling,
+        struct llama_context * ctx_main,
+        llama_token id,
+        bool apply_grammar);
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -302,28 +302,12 @@ class Model:
                            gguf.MODEL_TENSOR.TIME_MIX_FIRST,
                            gguf.MODEL_TENSOR.TIME_MIX_W1,
                            gguf.MODEL_TENSOR.TIME_MIX_W2,
-                            gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1,
-                            gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2,
                        )
                    )
                    or not new_name.endswith(".weight")
                ):
                    data_qtype = gguf.GGMLQuantizationType.F32

-                if data_qtype is False and any(
-                    self.match_model_tensor_name(new_name, key, bid)
-                    for key in (
-                        gguf.MODEL_TENSOR.TOKEN_EMBD,
-                        gguf.MODEL_TENSOR.OUTPUT,
-                    )
-                ):
-                    if self.ftype in (
-                        gguf.LlamaFileType.MOSTLY_TQ1_0,
-                        gguf.LlamaFileType.MOSTLY_TQ2_0,
-                    ):
-                        # TODO: use Q4_K and Q6_K
-                        data_qtype = gguf.GGMLQuantizationType.F16
-
                # No override (data_qtype is False), or wants to be quantized (data_qtype is True)
                if isinstance(data_qtype, bool):
                    if self.ftype == gguf.LlamaFileType.ALL_F32:
@@ -334,10 +318,6 @@ class Model:
                        data_qtype = gguf.GGMLQuantizationType.BF16
                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
                        data_qtype = gguf.GGMLQuantizationType.Q8_0
-                    elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ1_0:
-                        data_qtype = gguf.GGMLQuantizationType.TQ1_0
-                    elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ2_0:
-                        data_qtype = gguf.GGMLQuantizationType.TQ2_0
                    else:
                        raise ValueError(f"Unknown file type: {self.ftype.name}")

@@ -626,9 +606,6 @@ class Model:
        if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae":
            # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct
            res = "exaone"
-        if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085":
-            # ref: https://huggingface.co/microsoft/phi-2
-            res = "phi-2"

        if res is None:
            logger.warning("\n")
@@ -1487,7 +1464,7 @@ class StableLMModel(Model):
                raise ValueError(f"Unprocessed norms: {norms}")


-@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
+@Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
 class LlamaModel(Model):
    model_arch = gguf.MODEL_ARCH.LLAMA

@@ -1646,16 +1623,15 @@ class BitnetModel(Model):
        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
        self.gguf_writer.add_rope_scaling_factor(1.0)

-    def weight_quant(self, weight: Tensor) -> Tensor:
+    def weight_quant(self, weight):
        dtype = weight.dtype
        weight = weight.float()
-        scale = weight.abs().mean().clamp(min=1e-5)
-        iscale = 1 / scale
-        # TODO: multiply by the scale directly instead of inverting it twice
-        # (this is also unnecessarily doubly inverted upstream)
-        # ref: https://huggingface.co/1bitLLM/bitnet_b1_58-3B/blob/af89e318d78a70802061246bf037199d2fb97020/utils_quant.py#L10
-        result = (weight * iscale).round().clamp(-1, 1) / iscale
-        return result.type(dtype)
+        s = 1 / weight.abs().mean().clamp(min=1e-5)
+        weight = (weight * s).round().clamp(-1, 1) / s
+        scale = weight.abs().max().unsqueeze(0)
+        weight = torch.where(weight.abs().less(1e-6), 0, weight).type(dtype)
+        weight = torch.sign(weight).type(dtype)
+        return weight.type(dtype), scale.type(torch.float32)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        new_name = self.map_tensor_name(name)
@@ -1670,9 +1646,11 @@ class BitnetModel(Model):
            gguf.MODEL_TENSOR.FFN_GATE,
        ]):
            # transform weight into 1/0/-1 (in fp32)
-            data_torch = self.weight_quant(data_torch)
-
-        yield (new_name, data_torch)
+            weight_torch, scale_torch = self.weight_quant(data_torch)
+            yield (new_name, weight_torch)
+            yield (new_name.removesuffix(".weight") + ".scale", scale_torch)
+        else:
+            yield (new_name, data_torch)


@Model.register("GrokForCausalLM")
@@ -2774,8 +2752,6 @@ class Rwkv6Model(Model):
        self.gguf_writer.add_tokenizer_model("rwkv")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_types(toktypes)
-        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
-        special_vocab.add_to_gguf(self.gguf_writer)

    def set_gguf_parameters(self):
        block_count = self.hparams["num_hidden_layers"]
@@ -4035,8 +4011,8 @@ def parse_args() -> argparse.Namespace:
        help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
    )
    parser.add_argument(
-        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="f16",
-        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
+        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
+        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
    )
    parser.add_argument(
        "--bigendian", action="store_true",
@@ -4123,8 +4099,6 @@ def main() -> None:
        "f16": gguf.LlamaFileType.MOSTLY_F16,
        "bf16": gguf.LlamaFileType.MOSTLY_BF16,
        "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
-        "tq1_0": gguf.LlamaFileType.MOSTLY_TQ1_0,
-        "tq2_0": gguf.LlamaFileType.MOSTLY_TQ2_0,
        "auto": gguf.LlamaFileType.GUESSED,
    }

--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -31,7 +31,6 @@ import re
 import requests
 import sys
 import json
-import shutil

 from hashlib import sha256
 from enum import IntEnum, auto
@@ -98,7 +97,6 @@ models = [
    {'name': "bloom",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom", },
    {'name': "gpt3-finnish",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", },
    {"name": "exaone",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
-    {"name": "phi-2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
 ]


@@ -127,27 +125,12 @@ def download_model(model):
    if tokt == TOKENIZER_TYPE.UGM:
        files.append("spiece.model")

-    if os.path.isdir(repo):
-        # If repo is a path on the file system, copy the directory
-        for file in files:
-            src_path = os.path.join(repo, file)
-            dst_path = f"models/tokenizers/{name}/{file}"
-            if os.path.isfile(dst_path):
-                logger.info(f"{name}: File {dst_path} already exists - skipping")
-                continue
-            if os.path.isfile(src_path):
-                shutil.copy2(src_path, dst_path)
-                logger.info(f"{name}: Copied {src_path} to {dst_path}")
-            else:
-                logger.warning(f"{name}: Source file {src_path} does not exist")
-    else:
-        # If repo is a URL, download the files
-        for file in files:
-            save_path = f"models/tokenizers/{name}/{file}"
-            if os.path.isfile(save_path):
-                logger.info(f"{name}: File {save_path} already exists - skipping")
-                continue
-            download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)
+    for file in files:
+        save_path = f"models/tokenizers/{name}/{file}"
+        if os.path.isfile(save_path):
+            logger.info(f"{name}: File {save_path} already exists - skipping")
+            continue
+        download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)


 for model in models:
--- a/convert_lora_to_gguf.py
+++ b/convert_lora_to_gguf.py
@@ -363,13 +363,7 @@ if __name__ == '__main__':
                    yield (name, cast(torch.Tensor, LoraTorchTensor(tensor.A, tensor.B)))

            def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-                dest = list(super().modify_tensors(data_torch, name, bid))
-                # some archs may have the same tensor for lm_head and output (tie word embeddings)
-                # in this case, adapters targeting lm_head will fail when using llama-export-lora
-                # therefore, we ignore them for now
-                # see: https://github.com/ggerganov/llama.cpp/issues/9065
-                if name == "lm_head.weight" and len(dest) == 0:
-                    raise ValueError("lm_head is present in adapter, but is ignored in base model")
+                dest = super().modify_tensors(data_torch, name, bid)
                for dest_name, dest_data in dest:
                    assert isinstance(dest_data, LoraTorchTensor)
                    lora_a, lora_b = dest_data.get_lora_A_B()
--- a/docs/build.md
+++ b/docs/build.md
@@ -380,9 +380,3 @@ For detailed info, such as model/device supports, CANN install, please refer to
 ### Android

 To read documentation for how to build on Android, [click here](./android.md)
-
-### Arm CPU optimized mulmat kernels
-
-Llama.cpp includes a set of optimized mulmat kernels for the Arm architecture, leveraging Arm® Neon™, int8mm and SVE instructions. These kernels are enabled at build time through the appropriate compiler cpu-type flags, such as `-DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve`. Note that these optimized kernels require the model to be quantized into one of the formats: `Q4_0_4_4` (Arm Neon), `Q4_0_4_8` (int8mm) or `Q4_0_8_8` (SVE). The SVE mulmat kernel specifically requires a vector width of 256 bits. When running on devices with a different vector width, it is recommended to use the `Q4_0_4_8` (int8mm) or `Q4_0_4_4` (Arm Neon) formats for better performance. Refer to [examples/quantize/README.md](../examples/quantize/README.md) for more information on the quantization formats.
-
-To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_LLAMAFILE=OFF` (`cmake`).
--- a/examples/batched-bench/README.md
+++ b/examples/batched-bench/README.md
@@ -49,12 +49,3 @@ There are 2 modes of operation:
 |   128 |    256 |    8 |   3072 |    0.751 |  1363.92 |   15.110 |   135.54 |   15.861 |   193.69 |
 |   128 |    256 |   16 |   6144 |    1.569 |  1304.93 |   18.073 |   226.64 |   19.642 |   312.80 |
 |   128 |    256 |   32 |  12288 |    3.409 |  1201.35 |   19.223 |   426.15 |   22.633 |   542.93 |
-
-### JSONL output
-
-Pass `--output-format jsonl` to output JSONL instead of Markdown, á la
-
-```json lines
-{"n_kv_max": 2048, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "is_pp_shared": 0, "n_gpu_layers": 99, "n_threads": 8, "n_threads_batch": 8, "pp": 128, "tg": 128, "pl": 1, "n_kv": 256, "t_pp": 0.233810, "speed_pp": 547.453064, "t_tg": 3.503684, "speed_tg": 36.532974, "t": 3.737494, "speed": 68.495094}
-{"n_kv_max": 2048, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "is_pp_shared": 0, "n_gpu_layers": 99, "n_threads": 8, "n_threads_batch": 8, "pp": 128, "tg": 128, "pl": 2, "n_kv": 512, "t_pp": 0.422602, "speed_pp": 605.770935, "t_tg": 11.106112, "speed_tg": 23.050371, "t": 11.528713, "speed": 44.410854}
-```
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -1,13 +1,36 @@
-#include "arg.h"
 #include "common.h"
 #include "llama.h"

 #include <algorithm>
+#include <cmath>
 #include <cstdio>
 #include <string>
 #include <vector>

-static void print_usage(int, char ** argv) {
+// parse a comma-separated list of integers in place: each ',' in p is overwritten with '\0' (mutates the input string)
+static std::vector<int> parse_list(char * p) {
+    std::vector<int> ret;
+
+    char * q = p; // start of the field currently being scanned
+
+    while (*p) {
+        if (*p == ',') {
+            *p = '\0'; // terminate the current field so that atoi stops at the separator
+            ret.push_back(std::atoi(q));
+            q = p + 1; // the next field begins right after the separator
+        }
+
+        ++p;
+    }
+
+    ret.push_back(std::atoi(q)); // trailing field (no ',' after it); always pushed, so the result is never empty
+
+    return ret;
+}
+
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+    gpt_params_print_usage(argc, argv, params);
+
    LOG_TEE("\nexample usage:\n");
    LOG_TEE("\n    %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
    LOG_TEE("\n");
@@ -16,7 +39,8 @@ static void print_usage(int, char ** argv) {
 int main(int argc, char ** argv) {
    gpt_params params;

-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) {
+    if (!gpt_params_parse(argc, argv, params)) {
+        print_usage(argc, argv, params);
        return 1;
    }

@@ -98,13 +122,12 @@ int main(int argc, char ** argv) {
        }
    }

-    if (!params.batched_bench_output_jsonl) {
-        LOG_TEE("\n");
-        LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
-        LOG_TEE("\n");
-        LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
-        LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
-    }
+    LOG_TEE("\n");
+    LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
+    LOG_TEE("\n");
+
+    LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP",     "TG",     "B",    "N_KV",     "T_PP s",   "S_PP t/s", "T_TG s",   "S_TG t/s", "T s",      "S t/s");
+    LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");

    for (        int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) {
        for (    int i_tg = 0; i_tg < (int) n_tg.size(); ++i_tg) {
@@ -172,22 +195,12 @@ int main(int argc, char ** argv) {
                const float speed_tg = pl*tg / t_tg;
                const float speed    = n_kv / t;

-                if(params.batched_bench_output_jsonl) {
-                    LOG_TEE(
-                        "{\"n_kv_max\": %d, \"n_batch\": %d, \"n_ubatch\": %d, \"flash_attn\": %d, \"is_pp_shared\": %d, \"n_gpu_layers\": %d, \"n_threads\": %u, \"n_threads_batch\": %u, "
-                        "\"pp\": %d, \"tg\": %d, \"pl\": %d, \"n_kv\": %d, \"t_pp\": %f, \"speed_pp\": %f, \"t_tg\": %f, \"speed_tg\": %f, \"t\": %f, \"speed\": %f}\n",
-                        n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch,
-                        pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed
-                    );
-                } else {
-                    LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
-                }
+                LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
            }
        }
    }

-    LOG_TEE("\n");
-    llama_perf_context_print(ctx);
+    llama_print_timings(ctx);

    llama_batch_free(batch);

--- a/examples/batched.swift/Sources/main.swift
+++ b/examples/batched.swift/Sources/main.swift
@@ -27,6 +27,7 @@ guard let model = llama_load_model_from_file(modelPath.cString(using: .utf8), mo
    print("Failed to load model")
    exit(1)
 }
+
 defer {
    llama_free_model(model)
 }
@@ -36,6 +37,7 @@ var tokens = tokenize(text: prompt, add_bos: true)
 let n_kv_req = UInt32(tokens.count) + UInt32((n_len - Int(tokens.count)) * n_parallel)

 var context_params = llama_context_default_params()
+context_params.seed = 1234
 context_params.n_ctx = n_kv_req
 context_params.n_batch = UInt32(max(n_len, n_parallel))
 context_params.n_threads = 8
@@ -46,26 +48,11 @@ guard context != nil else {
    print("Failed to initialize context")
    exit(1)
 }
+
 defer {
    llama_free(context)
 }

-var sparams = llama_sampler_chain_default_params()
-
-let smpl = llama_sampler_chain_init(sparams)
-guard smpl != nil else {
-    print("Failed to initialize sampling")
-    exit(1)
-}
-defer {
-    llama_sampler_free(smpl)
-}
-
-llama_sampler_chain_add(smpl, llama_sampler_init_top_k(40));
-llama_sampler_chain_add(smpl, llama_sampler_init_top_p(0.9, 1));
-llama_sampler_chain_add(smpl, llama_sampler_init_temp (0.4));
-llama_sampler_chain_add(smpl, llama_sampler_init_dist (1234));
-
 let n_ctx = llama_n_ctx(context)

 print("\nn_len = \(n_len), n_ctx = \(n_ctx), n_batch = \(context_params.n_batch), n_parallel = \(n_parallel), n_kv_req = \(n_kv_req)\n")
@@ -138,7 +125,32 @@ while n_cur <= n_len {
            continue
        }

-        let new_token_id = llama_sampler_sample(smpl, context, i_batch[i])
+        // sample the next token from the logits at batch index i_batch[i]: top-k, top-p, temperature, then llama_sample_token
+        var n_vocab = llama_n_vocab(model)
+        var logits = llama_get_logits_ith(context, i_batch[i])
+
+        // build exactly one candidate per vocabulary token; the array must start empty, because pre-filling it
+        // with n_vocab default entries and then appending would hand 2 * n_vocab candidates to the samplers
+        var candidates: [llama_token_data] = []
+        candidates.reserveCapacity(Int(n_vocab))
+        for token_id in 0 ..< n_vocab {
+            candidates.append(llama_token_data(id: token_id, logit: logits![Int(token_id)], p: 0.0))
+        }
+
+        var candidates_p: llama_token_data_array = .init(
+            data: &candidates,
+            size: candidates.count,
+            sorted: false
+        )
+
+        let top_k: Int32 = 40
+        let top_p: Float = 0.9
+        let temp: Float = 0.4
+
+        llama_sample_top_k(context, &candidates_p, top_k, 1)
+        llama_sample_top_p(context, &candidates_p, top_p, 1)
+        llama_sample_temp(context, &candidates_p, temp)
+        let new_token_id = llama_sample_token(context, &candidates_p)

        // is it an end of stream? -> mark the stream as finished
        if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
@@ -198,10 +210,9 @@ if n_parallel > 1 {

 let t_main_end = ggml_time_us()

-print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n\n")
+print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n")

-llama_perf_sampler_print(smpl)
-llama_perf_context_print(context)
+llama_print_timings(context)

 private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
    let utf8Count = text.utf8.count
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -1,13 +1,15 @@
-#include "arg.h"
 #include "common.h"
 #include "llama.h"

 #include <algorithm>
+#include <cmath>
 #include <cstdio>
 #include <string>
 #include <vector>

-static void print_usage(int, char ** argv) {
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+    gpt_params_print_usage(argc, argv, params);
+
    LOG_TEE("\nexample usage:\n");
    LOG_TEE("\n    %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
    LOG_TEE("\n");
@@ -19,7 +21,8 @@ int main(int argc, char ** argv) {
    params.prompt = "Hello my name is";
    params.n_predict = 32;

-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
+    if (!gpt_params_parse(argc, argv, params)) {
+        print_usage(argc, argv, params);
        return 1;
    }

@@ -62,15 +65,6 @@ int main(int argc, char ** argv) {

    llama_context * ctx = llama_new_context_with_model(model, ctx_params);

-    auto sparams = llama_sampler_chain_default_params();
-
-    llama_sampler * smpl = llama_sampler_chain_init(sparams);
-
-    llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sparams.top_k));
-    llama_sampler_chain_add(smpl, llama_sampler_init_top_p(params.sparams.top_p, params.sparams.min_keep));
-    llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sparams.temp));
-    llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sparams.seed));
-
    if (ctx == NULL) {
        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
        return 1;
@@ -170,7 +164,29 @@ int main(int argc, char ** argv) {
                continue;
            }

-            const llama_token new_token_id = llama_sampler_sample(smpl, ctx, i_batch[i]);
+            // sample the next token from the logits at batch index i_batch[i]: top-k, top-p, temperature, then llama_sample_token
+            auto   n_vocab = llama_n_vocab(model);
+            auto * logits  = llama_get_logits_ith(ctx, i_batch[i]);
+
+            std::vector<llama_token_data> candidates;
+            candidates.reserve(n_vocab);
+
+            for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+                candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f }); // one entry per vocabulary token, third field starts at 0
+            }
+
+            llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; // non-owning view over `candidates`
+
+            const int   top_k = 40;   // NOTE(review): sampling settings are hard-coded here; confirm that ignoring command-line values is intended
+            const float top_p = 0.9f;
+            const float temp  = 0.4f;
+
+            llama_sample_top_k(ctx, &candidates_p, top_k, 1);
+            llama_sample_top_p(ctx, &candidates_p, top_p, 1);
+            llama_sample_temp (ctx, &candidates_p, temp);
+
+            const llama_token new_token_id = llama_sample_token(ctx, &candidates_p);
+            // alternative left by the author: llama_sample_token_greedy(ctx, &candidates_p)

            // is it an end of generation? -> mark the stream as finished
            if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
@@ -228,15 +244,12 @@ int main(int argc, char ** argv) {
    LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
            __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));

-    LOG_TEE("\n");
-    llama_perf_sampler_print(smpl);
-    llama_perf_context_print(ctx);
+    llama_print_timings(ctx);

    fprintf(stderr, "\n");

    llama_batch_free(batch);

-    llama_sampler_free(smpl);
    llama_free(ctx);
    llama_free_model(model);

--- a/examples/benchmark/benchmark-matmult.cpp
+++ b/examples/benchmark/benchmark-matmult.cpp
@@ -183,7 +183,7 @@ int main(int argc, char ** argv)  {

    ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads);

-    TENSOR_DUMP(ggml_graph_node(gf, 0));
+    TENSOR_DUMP(gf->nodes[0]);

    printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));

@@ -224,7 +224,7 @@ int main(int argc, char ** argv)  {


    // Let's use the F32 result from above as a reference for the quantized multiplication
-    float sum_of_F32_reference = tensor_sum_elements(ggml_graph_node(gf, 0));
+    float sum_of_F32_reference = tensor_sum_elements(gf->nodes[0]);

    printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
    printf("=====================================================================================\n");
@@ -252,7 +252,7 @@ int main(int argc, char ** argv)  {

        // Check that the matrix multiplication result is in the right ballpark
        // We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different
-        float sum_of_Q4_result = tensor_sum_elements(ggml_graph_node(gf31, 0));
+        float sum_of_Q4_result = tensor_sum_elements(gf31->nodes[0]);
        float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference);
        float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; //  Let's accept an epsilon of 10^-6

--- a/examples/cvector-generator/cvector-generator.cpp
+++ b/examples/cvector-generator/cvector-generator.cpp
@@ -1,4 +1,3 @@
-#include "arg.h"
 #include "common.h"
 #include "llama.h"
 #include "ggml.h"
@@ -36,7 +35,9 @@ static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
    return ret;
 }

-static void print_usage(int, char ** argv) {
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+    gpt_params_print_usage(argc, argv, params);
+
    printf("\nexample usage:\n");
    printf("\n    CPU only:   %s -m ./llama-3.Q4_K_M.gguf\n", argv[0]);
    printf("\n    with GPU:   %s -m ./llama-3.Q4_K_M.gguf -ngl 99\n", argv[0]);
@@ -389,7 +390,8 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) {
 int main(int argc, char ** argv) {
    gpt_params params;

-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) {
+    if (!gpt_params_parse(argc, argv, params)) {
+        print_usage(argc, argv, params);
        return 1;
    }

--- a/examples/cvector-generator/pca.hpp
+++ b/examples/cvector-generator/pca.hpp
@@ -12,9 +12,12 @@

 #include <cstdio>
 #include <ctime>
-#include <random>
 #include <string>
+#include <tuple>
 #include <vector>
+#include <algorithm>
+#include <iostream>
+#include <fstream>

 #define DEBUG_POS 5

@@ -226,8 +229,8 @@ static ggml_status compute_piter(
        result.eigenvectors.resize(params.n_batch);
        result.distances.resize(params.n_batch);
        // get output nodes
-        for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
-            auto node = ggml_graph_node(gf, i);
+        for (int i = 0; i < gf->n_nodes; ++i) {
+            auto node = gf->nodes[i];
            int iter = -1;
            // find b_tensor (without copying data from device)
            if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) {
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -1,4 +1,3 @@
-#include "arg.h"
 #include "common.h"
 #include "llama.h"

@@ -80,7 +79,8 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
 int main(int argc, char ** argv) {
    gpt_params params;

-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) {
+    if (!gpt_params_parse(argc, argv, params)) {
+        gpt_params_print_usage(argc, argv, params);
        return 1;
    }

@@ -90,6 +90,14 @@ int main(int argc, char ** argv) {

    print_build_info();

+    if (params.seed == LLAMA_DEFAULT_SEED) {
+        params.seed = time(NULL);
+    }
+
+    fprintf(stderr, "%s: seed  = %u\n", __func__, params.seed);
+
+    std::mt19937 rng(params.seed);
+
    llama_backend_init();
    llama_numa_init(params.numa);

@@ -305,10 +313,8 @@ int main(int argc, char ** argv) {
        if (notArray) fprintf(stdout, "\n}\n");
    }

-    LOG_TEE("\n");
-    llama_perf_context_print(ctx);
-
    // clean up
+    llama_print_timings(ctx);
    llama_batch_free(batch);
    llama_free(ctx);
    llama_free_model(model);
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@@ -1,4 +1,3 @@
-#include "arg.h"
 #include "common.h"
 #include "llama.h"
 #include "ggml.h"
@@ -145,12 +144,15 @@ int main(int argc, char ** argv) {

    gpt_params params;

-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
+    if (!gpt_params_parse(argc, argv, params)) {
+        gpt_params_print_usage(argc, argv, params);
        return 1;
    }

    print_build_info();

+    std::mt19937 rng(params.seed);
+
    llama_backend_init();
    llama_numa_init(params.numa);

@@ -181,8 +183,7 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    LOG_TEE("\n");
-    llama_perf_context_print(ctx);
+    llama_print_timings(ctx);

    llama_free(ctx);
    llama_free_model(model);
--- a/examples/export-lora/export-lora.cpp
+++ b/examples/export-lora/export-lora.cpp
@@ -1,4 +1,3 @@
-#include "arg.h"
 #include "common.h"
 #include "ggml.h"
 #include "ggml-alloc.h"
@@ -370,7 +369,7 @@ struct lora_merge_ctx {

        // write data to output file
        {
-            auto * result = ggml_graph_node(gf, -1);
+            auto result = gf->nodes[gf->n_nodes - 1];
            size_t len = ggml_nbytes(result);
            if (read_buf.size() < len) {
                read_buf.resize(len);
@@ -392,7 +391,9 @@ struct lora_merge_ctx {
    }
 };

-static void print_usage(int, char ** argv) {
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+    gpt_params_print_usage(argc, argv, params);
+
    printf("\nexample usage:\n");
    printf("\n  %s -m base-model.gguf --lora lora-file.gguf -o merged-model-f16.gguf\n", argv[0]);
    printf("\nNOTE: output model is F16\n");
@@ -402,7 +403,8 @@ static void print_usage(int, char ** argv) {
 int main(int argc, char ** argv) {
    gpt_params params;

-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) {
+    if (!gpt_params_parse(argc, argv, params)) {
+        print_usage(argc, argv, params);
        return 1;
    }

--- a/examples/gbnf-validator/gbnf-validator.cpp
+++ b/examples/gbnf-validator/gbnf-validator.cpp
@@ -1,5 +1,9 @@
+#define LLAMA_API_INTERNAL
+
+#include "grammar-parser.h"
+#include "ggml.h"
+#include "llama.h"
 #include "unicode.h"
-#include "llama-grammar.h"

 #include <cstdio>
 #include <cstdlib>
@@ -8,28 +12,29 @@
 #include <string>
 #include <vector>

-static bool llama_grammar_validate(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) {
-    const auto cpts = unicode_cpts_from_utf8(input_str);
+static bool llama_sample_grammar_string(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) {
+    auto decoded = decode_utf8(input_str, {});
+    const auto & code_points = decoded.first;

    const llama_grammar_rules  & rules      = llama_grammar_get_rules (grammar);
-          llama_grammar_stacks & stacks_cur = llama_grammar_get_stacks(grammar);
+          llama_grammar_stacks & cur_stacks = llama_grammar_get_stacks(grammar);

    size_t pos = 0;
-    for (const auto & cpt : cpts) {
-        const llama_grammar_stacks stacks_prev = llama_grammar_get_stacks(grammar); // copy
+    for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
+        const llama_grammar_stacks prev_stacks = llama_grammar_get_stacks(grammar); // copy

-        llama_grammar_accept(rules, stacks_prev, cpt, stacks_cur);
+        llama_grammar_accept(rules, prev_stacks, *it, cur_stacks);

-        if (stacks_cur.empty()) {
+        if (cur_stacks.empty()) {
            error_pos = pos;
-            error_msg = "Unexpected character '" + unicode_cpt_to_utf8(cpt) + "'";
-            stacks_cur = stacks_prev;
+            error_msg = "Unexpected character '" + unicode_cpt_to_utf8(*it) + "'";
+            cur_stacks = prev_stacks;
            return false;
        }
        ++pos;
    }

-    for (const auto & stack : stacks_cur) {
+    for (const auto & stack : cur_stacks) {
        if (stack.empty()) {
            return true;
        }
@@ -80,7 +85,27 @@ int main(int argc, char** argv) {
        grammar_str = buffer.str();
    }

-    llama_grammar * grammar = llama_grammar_init_impl(nullptr, grammar_str.c_str(), "root");
+    // Parse the GBNF grammar
+    auto parsed_grammar = grammar_parser::parse(grammar_str.c_str());
+
+    // parse() yields an empty rule set on parse errors; report on stderr so stdout carries only validation results
+    if (parsed_grammar.rules.empty()) {
+        fprintf(stderr, "%s: failed to parse grammar\n", __func__);
+        return 1;
+    }
+
+    // The grammar is initialised from the "root" symbol below, so it must exist.
+    if (parsed_grammar.symbol_ids.find("root") == parsed_grammar.symbol_ids.end()) {
+        fprintf(stderr, "%s: grammar does not contain a 'root' symbol\n", __func__);
+        return 1;
+    }
+
+    std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
+
+    // Create the LLAMA grammar
+    auto grammar = llama_grammar_init(
+            grammar_rules.data(),
+            grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
    if (grammar == nullptr) {
        throw std::runtime_error("Failed to initialize llama_grammar");
    }
@@ -97,7 +122,7 @@ int main(int argc, char** argv) {
    // Validate the input string against the grammar
    size_t error_pos;
    std::string error_msg;
-    bool is_valid = llama_grammar_validate(grammar, input_str, error_pos, error_msg);
+    bool is_valid = llama_sample_grammar_string(grammar, input_str, error_pos, error_msg);

    if (is_valid) {
        fprintf(stdout, "Input string is valid according to the grammar.\n");
@@ -106,7 +131,7 @@ int main(int argc, char** argv) {
    }

    // Clean up
-    llama_grammar_free_impl(grammar);
+    llama_grammar_free(grammar);

    return 0;
 }
--- a/examples/gen-docs/CMakeLists.txt
+++ b/examples/gen-docs/CMakeLists.txt
@@ -1,5 +0,0 @@
-set(TARGET llama-gen-docs)
-add_executable(${TARGET} gen-docs.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/gen-docs/gen-docs.cpp
+++ b/examples/gen-docs/gen-docs.cpp
@@ -1,52 +0,0 @@
-#include "arg.h"
-#include "common.h"
-
-#include <fstream>
-#include <string>
-
-// Export usage message (-h) to markdown format
-
-static void export_md(std::string fname, llama_example ex) {
-    std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc);
-
-    gpt_params params;
-    auto ctx_arg = gpt_params_parser_init(params, ex);
-
-    file << "| Argument | Explanation |\n";
-    file << "| -------- | ----------- |\n";
-    for (auto & opt : ctx_arg.options) {
-        file << "| `";
-        // args
-        for (const auto & arg : opt.args) {
-        if (arg == opt.args.front()) {
-                file << arg;
-                if (opt.args.size() > 1) file << ", ";
-            } else {
-                file << arg << (arg != opt.args.back() ? ", " : "");
-            }
-        }
-        // value hint
-        if (opt.value_hint) {
-            std::string md_value_hint(opt.value_hint);
-            string_replace_all(md_value_hint, "|", "\\|");
-            file << " " << md_value_hint;
-        }
-        if (opt.value_hint_2) {
-            std::string md_value_hint_2(opt.value_hint_2);
-            string_replace_all(md_value_hint_2, "|", "\\|");
-            file << " " << md_value_hint_2;
-        }
-        // help text
-        std::string md_help(opt.help);
-        string_replace_all(md_help, "\n", "<br/>");
-        string_replace_all(md_help, "|", "\\|");
-        file << "` | " << md_help << " |\n";
-    }
-}
-
-int main(int, char **) {
-    export_md("autogen-main.md", LLAMA_EXAMPLE_MAIN);
-    export_md("autogen-server.md", LLAMA_EXAMPLE_SERVER);
-
-    return 0;
-}
--- a/examples/gritlm/gritlm.cpp
+++ b/examples/gritlm/gritlm.cpp
@@ -1,4 +1,3 @@
-#include "arg.h"
 #include "common.h"
 #include "llama.h"

@@ -10,7 +9,7 @@
 static std::vector<std::vector<float>> encode(llama_context * ctx, const std::vector<std::string> & sentences, const std::string & instruction) {
    std::vector<std::vector<float>> result;

-    const llama_model * model = llama_get_model(ctx);
+    const llama_model * mdl = llama_get_model(ctx);

    llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1);

@@ -19,16 +18,16 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve

        const std::string input_string = instruction + sentences[i];

-        std::vector<llama_token> inputs = llama_tokenize(model, input_string, true, false);
+        std::vector<llama_token> inputs = llama_tokenize(mdl, input_string, true, false);

        const int32_t n_toks = inputs.size();

        // GritLM seems to have EOS = ""
        // https://github.com/ContextualAI/gritlm/blob/92025b16534712b31b3c4aaaf069350e222bd5f8/gritlm/gritlm.py#L18
-        // inputs.push_back(llama_token_eos(model));
+        // inputs.push_back(llama_token_eos(mdl));

        // we want to ignore instruction tokens for mean pooling
-        const int32_t n_inst = llama_tokenize(model, instruction, true, false).size();
+        const int32_t n_inst = llama_tokenize(mdl, instruction, true, false).size();

 #ifdef GRIT_DEBUG
        // debug tokens - should be matching as referenced in the GritLM sample
@@ -52,7 +51,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
        llama_decode(ctx, batch);

        // get embedding dimensions
-        uint64_t n_embd = llama_n_embd(model);
+        uint64_t n_embd = llama_n_embd(mdl);

        // allocate embedding output
        std::vector<float> emb_unorm(n_embd, 0.0f);
@@ -93,11 +92,11 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
    return result;
 }

-static std::string generate(llama_context * ctx, llama_sampler * smpl, const std::string & prompt, bool stream) {
+static std::string generate(llama_context * ctx, const std::string & prompt, bool stream) {
    std::string result;

-    const llama_model * model = llama_get_model(ctx);
-    llama_token eos_token = llama_token_eos(model);
+    const llama_model * mdl = llama_get_model(ctx);
+    llama_token eos_token = llama_token_eos(mdl);

    llama_kv_cache_clear(ctx);
    llama_set_embeddings(ctx, false);
@@ -105,24 +104,28 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std

    llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);

-    std::vector<llama_token> inputs = llama_tokenize(model, prompt, false, true);
+    std::vector<llama_token> inputs = llama_tokenize(mdl, prompt, false, true);
    int32_t i_current_token = 0;

    while (true) {
        llama_batch_clear(bat);
-        {
-            const int32_t n_inputs = inputs.size();
-
-            for (int32_t i = 0; i < n_inputs; i++) {
-                llama_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1);
-            }
+        auto n_inputs = (int32_t)inputs.size();
+        for (int32_t i = 0; i < n_inputs; i++) {
+            llama_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1);
        }
        inputs.clear();

        llama_decode(ctx, bat);
+        auto logits = llama_get_logits_ith(ctx, bat.n_tokens - 1);

-        llama_token token = llama_sampler_sample(smpl, ctx, bat.n_tokens - 1);
+        auto candidates = std::vector<llama_token_data>(llama_n_vocab(mdl));
+        auto n_candidates = (int32_t)candidates.size();
+        for (int32_t token = 0; token < n_candidates; token++) {
+            candidates[token] = llama_token_data{ token, logits[token], 0.0f };
+        }
+        auto candidates_p = llama_token_data_array{ candidates.data(), candidates.size(), false };

+        llama_token token = llama_sample_token_greedy(ctx, &candidates_p);
        if (token == eos_token) {
            break;
        }
@@ -154,7 +157,8 @@ static std::string gritlm_instruction(const std::string & instruction) {
 int main(int argc, char * argv[]) {
    gpt_params params;

-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
+    if (!gpt_params_parse(argc, argv, params)) {
+        gpt_params_print_usage(argc, argv, params);
        return 1;
    }

@@ -163,18 +167,10 @@ int main(int argc, char * argv[]) {

    llama_backend_init();

-    llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
+    llama_model * mdl = llama_load_model_from_file(params.model.c_str(), mparams);

    // create generation context
-    llama_context * ctx = llama_new_context_with_model(model, cparams);
-
-    auto sparams = llama_sampler_chain_default_params();
-
-    sparams.no_perf = false;
-
-    llama_sampler * smpl = llama_sampler_chain_init(sparams);
-
-    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
+    llama_context * ctx = llama_new_context_with_model(mdl, cparams);

    // ### Embedding/Representation ###
    // samples taken from: https://github.com/ContextualAI/gritlm#basic
@@ -195,7 +191,7 @@ int main(int argc, char * argv[]) {
        const std::vector<std::vector<float>> d_rep = encode(ctx, documents, gritlm_instruction(""));
        const std::vector<std::vector<float>> q_rep = encode(ctx, queries,   gritlm_instruction(instruction));

-        const int n_embd = llama_n_embd(model);
+        const int n_embd = llama_n_embd(mdl);

        const float cosine_sim_q0_d0 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd);
        const float cosine_sim_q0_d1 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd);
@@ -212,12 +208,11 @@ int main(int argc, char * argv[]) {
    // GritLM models are not finetuned with system prompts, as you can just include system-like instructions together with your user instruction
    {
        const std::string prompt = "<|user|>\nPlease write me a poem about my recent hike of Mt. Fuji at midnight in the style of Shakespeare.\n<|assistant|>\n";
-        std::string response = generate(ctx, smpl, prompt, true);
+        std::string response = generate(ctx, prompt, true);
    }

-    llama_sampler_free(smpl);
    llama_free(ctx);
-    llama_free_model(model);
+    llama_free_model(mdl);
    llama_backend_free();

    return 0;
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -1,4 +1,3 @@
-#include "arg.h"
 #include "common.h"
 #include "llama.h"

@@ -18,7 +17,9 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

-static void print_usage(int, char ** argv) {
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+    gpt_params_print_usage(argc, argv, params);
+
    LOG_TEE("\nexample usage:\n");
    LOG_TEE("\n    %s \\\n"
            "       -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n"
@@ -578,7 +579,8 @@ int main(int argc, char ** argv) {
    params.logits_all = true;
    params.verbosity = 1;

-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) {
+    if (!gpt_params_parse(argc, argv, params)) {
+        print_usage(argc, argv, params);
        return 1;
    }

@@ -636,8 +638,7 @@ int main(int argc, char ** argv) {

    g_collector.save_imatrix();

-    LOG_TEE("\n");
-    llama_perf_context_print(ctx);
+    llama_print_timings(ctx);

    llama_free(ctx);
    llama_free_model(model);
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -1,8 +1,8 @@
-#include "arg.h"
 #include "common.h"
+
 #include "console.h"
-#include "sampling.h"
 #include "llama.h"
+#include "grammar-parser.h"

 #include <cassert>
 #include <cinttypes>
@@ -34,7 +34,6 @@

 static llama_context           ** g_ctx;
 static llama_model             ** g_model;
-static gpt_sampler             ** g_smpl;
 static gpt_params               * g_params;
 static std::vector<llama_token> * g_input_tokens;
 static std::ostringstream       * g_output_ss;
@@ -82,7 +81,7 @@ static void write_logfile(
    yaml_dump_string_multiline(logfile, "output", output.c_str());
    yaml_dump_vector_int(logfile, "output_tokens", output_tokens);

-    llama_perf_dump_yaml(logfile, ctx);
+    llama_dump_timing_info_yaml(logfile, ctx);
    fclose(logfile);
 }

@@ -94,7 +93,7 @@ static void sigint_handler(int signo) {
        } else {
            console::cleanup();
            printf("\n");
-            gpt_perf_print(*g_ctx, *g_smpl);
+            llama_print_timings(*g_ctx);
            write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
            _exit(130);
        }
@@ -104,14 +103,14 @@ static void sigint_handler(int signo) {

 int main(int argc, char ** argv) {
    gpt_params params;
+    llama_sampling_params & sparams = params.sparams;
    g_params = &params;

-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) {
+    if (!gpt_params_parse(argc, argv, params)) {
+        gpt_params_print_usage(argc, argv, params);
        return 1;
    }

-    auto & sparams = params.sparams;
-
 #ifndef LOG_DISABLE_LOGS
    log_set_target(log_filename_generator("infill", "log"));
    LOG_TEE("Log start\n");
@@ -157,19 +156,26 @@ int main(int argc, char ** argv) {
        LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
    }

-    print_build_info();
+    LOG_TEE("%s: build = %d (%s)\n",      __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
+    LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
+
+    if (params.seed == LLAMA_DEFAULT_SEED) {
+        params.seed = time(NULL);
+    }
+
+    LOG_TEE("%s: seed  = %u\n", __func__, params.seed);
+
+    std::mt19937 rng(params.seed);

    LOG("%s: llama backend init\n", __func__);
    llama_backend_init();
    llama_numa_init(params.numa);

-    llama_model * model = nullptr;
-    llama_context * ctx = nullptr;
-    gpt_sampler  * smpl = nullptr;
+    llama_model * model = nullptr; // address is published via g_model for sigint_handler; never leave it indeterminate
+    llama_context * ctx = nullptr; // address is published via g_ctx for sigint_handler; never leave it indeterminate

    g_model = &model;
    g_ctx = &ctx;
-    g_smpl = &smpl;

    // load the model and apply lora adapter, if any
    LOG("%s: load the model and apply lora adapter, if any\n", __func__);
@@ -299,14 +305,16 @@ int main(int argc, char ** argv) {
            LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
        }
    }
-    smpl = gpt_sampler_init(model, sparams);
-
-    LOG_TEE("sampling seed: %u\n", gpt_sampler_get_seed(smpl));
-    LOG_TEE("sampling: \n%s\n", sparams.print().c_str());
+    LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
    LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
    LOG_TEE("\n\n");

    LOG_TEE("\n#####  Infill mode  #####\n\n");
+    if (params.infill) {
+        printf("\n************\n");
+        printf("no need to specify '--infill', always running infill\n");
+        printf("************\n\n");
+    }
    if (params.interactive) {
        const char *control_message;
        if (params.multiline_input) {
@@ -341,6 +349,8 @@ int main(int argc, char ** argv) {

    std::vector<llama_token> embd;

+    struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
+
    while (n_remain != 0 || params.interactive) {
        // predict
        if (!embd.empty()) {
@@ -411,11 +421,11 @@ int main(int argc, char ** argv) {
        embd.clear();

        if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
-            const llama_token id = gpt_sampler_sample(smpl, ctx, -1);
+            const llama_token id = llama_sampling_sample(ctx_sampling, ctx, nullptr);

-            gpt_sampler_accept(smpl, id, true);
+            llama_sampling_accept(ctx_sampling, ctx, id, true);

-            // LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, smpl->prev.to_vector()).c_str());
+            LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());

            embd.push_back(id);

@@ -434,7 +444,7 @@ int main(int argc, char ** argv) {

                // push the prompt in the sampling context in order to apply repetition penalties later
                // for the prompt, we don't apply grammar rules
-                gpt_sampler_accept(smpl, embd_inp[n_consumed], false);
+                llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false);

                ++n_consumed;
                if ((int) embd.size() >= params.n_batch) {
@@ -466,7 +476,7 @@ int main(int argc, char ** argv) {
        // if not currently processing queued inputs;
        if ((int) embd_inp.size() <= n_consumed) {
            // deal with eot token in infill mode
-            if ((gpt_sampler_last(smpl) == llama_token_eot(model) || is_interacting) && params.interactive){
+            if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){
                if (is_interacting && !params.interactive_first) {
                    // print an eot token
                    printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
@@ -532,7 +542,7 @@ int main(int argc, char ** argv) {
                is_interacting = false;
            }
            // deal with end of generation tokens in interactive mode
-            else if (llama_token_is_eog(model, gpt_sampler_last(smpl))) {
+            else if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) {
                LOG("found EOS token\n");

                if (params.interactive) {
@@ -605,7 +615,7 @@ int main(int argc, char ** argv) {

            if (n_past > 0) {
                if (is_interacting) {
-                    gpt_sampler_reset(smpl);
+                    llama_sampling_reset(ctx_sampling);
                }
                is_interacting = false;
            }
@@ -628,14 +638,13 @@ int main(int argc, char ** argv) {
        fflush(stdout);
    }

-    LOG_TEE("\n");
-    gpt_perf_print(ctx, smpl);
+    llama_print_timings(ctx);
    write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);

    llama_free(ctx);
    llama_free_model(model);

-    gpt_sampler_free(smpl);
+    llama_sampling_free(ctx_sampling);
    llama_backend_free();

 #ifndef LOG_DISABLE_LOGS
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -249,7 +249,6 @@ struct cmd_params {
    ggml_sched_priority prio;
    int delay;
    bool verbose;
-    bool progress;
    output_formats output_format;
    output_formats output_format_stderr;
 };
@@ -281,7 +280,6 @@ static const cmd_params cmd_params_defaults = {
    /* prio                 */ GGML_SCHED_PRIO_NORMAL,
    /* delay                */ 0,
    /* verbose              */ false,
-    /* progress             */ false,
    /* output_format        */ MARKDOWN,
    /* output_format_stderr */ NONE,
 };
@@ -321,7 +319,6 @@ static void print_usage(int /* argc */, char ** argv) {
    printf("  -o, --output <csv|json|jsonl|md|sql>      (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
    printf("  -oe, --output-err <csv|json|jsonl|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
    printf("  -v, --verbose                             (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
-    printf("  --progress                                (default: %s)\n", cmd_params_defaults.progress ? "1" : "0");
    printf("\n");
    printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
 }
@@ -367,7 +364,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
    params.numa = cmd_params_defaults.numa;
    params.prio = cmd_params_defaults.prio;
    params.delay = cmd_params_defaults.delay;
-    params.progress = cmd_params_defaults.progress;

    for (int i = 1; i < argc; i++) {
        arg = argv[i];
@@ -620,8 +616,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
            invalid_param = !output_format_from_str(argv[i], params.output_format_stderr);
        } else if (arg == "-v" || arg == "--verbose") {
            params.verbose = true;
-        } else if (arg == "--progress") {
-            params.progress = true;
        } else {
            invalid_param = true;
            break;
@@ -1529,13 +1523,7 @@ int main(int argc, char ** argv) {
    llama_model * lmodel = nullptr;
    const cmd_params_instance * prev_inst = nullptr;

-    int params_idx = 0;
-    auto params_count = params_instances.size();
    for (const auto & inst : params_instances) {
-        params_idx ++;
-        if (params.progress) {
-            fprintf(stderr, "llama-bench: benchmark %d/%ld: starting\n", params_idx, params_count);
-        }
        // keep the same model between tests when possible
        if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
            if (lmodel) {
@@ -1568,7 +1556,7 @@ int main(int argc, char ** argv) {

        struct ggml_threadpool_params tpp = ggml_threadpool_params_default(t.n_threads);
        if (!parse_cpu_mask(t.cpu_mask, tpp.cpumask)) {
-            fprintf(stderr, "%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str());
+            LOG_TEE("%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str());
            exit(1);
        }
        tpp.strict_cpu = t.cpu_strict;
@@ -1577,7 +1565,7 @@ int main(int argc, char ** argv) {

        struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
        if (!threadpool) {
-            fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
+            LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
            exit(1);
        }

@@ -1585,16 +1573,10 @@ int main(int argc, char ** argv) {

        // warmup run
        if (t.n_prompt > 0) {
-            if (params.progress) {
-                fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup prompt run\n", params_idx, params_count);
-            }
            //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
            test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
        }
        if (t.n_gen > 0) {
-            if (params.progress) {
-                fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup generation run\n", params_idx, params_count);
-            }
            test_gen(ctx, 1, 0, t.n_threads);
        }

@@ -1604,15 +1586,9 @@ int main(int argc, char ** argv) {
            uint64_t t_start = get_time_ns();

            if (t.n_prompt > 0) {
-                if (params.progress) {
-                    fprintf(stderr, "llama-bench: benchmark %d/%ld: prompt run %d/%d\n", params_idx, params_count, i + 1, params.reps);
-                }
                test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
            }
            if (t.n_gen > 0) {
-                if (params.progress) {
-                    fprintf(stderr, "llama-bench: benchmark %d/%ld: generation run %d/%d\n", params_idx, params_count, i + 1, params.reps);
-                }
                test_gen(ctx, t.n_gen, t.n_prompt, t.n_threads);
            }

@@ -1630,7 +1606,7 @@ int main(int argc, char ** argv) {
            fflush(p_err->fout);
        }

-        llama_perf_context_print(ctx);
+        llama_print_timings(ctx);

        llama_free(ctx);

--- a/examples/llama.android/llama/src/main/cpp/llama-android.cpp
+++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp
@@ -120,8 +120,8 @@ Java_android_llama_cpp_LLamaAndroid_new_1context(JNIEnv *env, jobject, jlong jmo
    LOGi("Using %d threads", n_threads);

    llama_context_params ctx_params = llama_context_default_params();
-
-    ctx_params.n_ctx           = 2048;
+    ctx_params.seed  = 1234;
+    ctx_params.n_ctx = 2048;
    ctx_params.n_threads       = n_threads;
    ctx_params.n_threads_batch = n_threads;

@@ -269,6 +269,12 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
    return env->NewStringUTF(result.str().c_str());
 }

+extern "C"
+JNIEXPORT void JNICALL
+Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
+    llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer)); // handle is a llama_batch* cast to jlong; NOTE(review): only llama_batch_free runs here - confirm the llama_batch object itself is released
+}
+
 extern "C"
 JNIEXPORT jlong JNICALL
 Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
@@ -305,29 +311,6 @@ Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens,
    return reinterpret_cast<jlong>(batch);
 }

-extern "C"
-JNIEXPORT void JNICALL
-Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
-    llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
-}
-
-extern "C"
-JNIEXPORT jlong JNICALL
-Java_android_llama_cpp_LLamaAndroid_new_1sampler(JNIEnv *, jobject) {
-    auto sparams = llama_sampler_chain_default_params();
-    sparams.no_perf = true;
-    llama_sampler * smpl = llama_sampler_chain_init(sparams);
-    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
-
-    return reinterpret_cast<jlong>(smpl);
-}
-
-extern "C"
-JNIEXPORT void JNICALL
-Java_android_llama_cpp_LLamaAndroid_free_1sampler(JNIEnv *, jobject, jlong sampler_pointer) {
-    llama_sampler_free(reinterpret_cast<llama_sampler *>(sampler_pointer));
-}
-
 extern "C"
 JNIEXPORT void JNICALL
 Java_android_llama_cpp_LLamaAndroid_backend_1init(JNIEnv *, jobject) {
@@ -398,21 +381,31 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
        jobject,
        jlong context_pointer,
        jlong batch_pointer,
-        jlong sampler_pointer,
        jint n_len,
        jobject intvar_ncur
 ) {
    const auto context = reinterpret_cast<llama_context *>(context_pointer);
-    const auto batch   = reinterpret_cast<llama_batch   *>(batch_pointer);
-    const auto sampler = reinterpret_cast<llama_sampler *>(sampler_pointer);
+    const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
    const auto model = llama_get_model(context);

    if (!la_int_var) la_int_var = env->GetObjectClass(intvar_ncur);
    if (!la_int_var_value) la_int_var_value = env->GetMethodID(la_int_var, "getValue", "()I");
    if (!la_int_var_inc) la_int_var_inc = env->GetMethodID(la_int_var, "inc", "()V");

+    auto n_vocab = llama_n_vocab(model);
+    auto logits = llama_get_logits_ith(context, batch->n_tokens - 1);
+
+    std::vector<llama_token_data> candidates;
+    candidates.reserve(n_vocab);
+
+    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+        candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
+    }
+
+    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+
    // sample the most likely token
-    const auto new_token_id = llama_sampler_sample(sampler, context, -1);
+    const auto new_token_id = llama_sample_token_greedy(context, &candidates_p);

    const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
    if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
--- a/examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt
+++ b/examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt
@@ -45,10 +45,8 @@ class LLamaAndroid {
    private external fun free_context(context: Long)
    private external fun backend_init(numa: Boolean)
    private external fun backend_free()
-    private external fun new_batch(nTokens: Int, embd: Int, nSeqMax: Int): Long
    private external fun free_batch(batch: Long)
-    private external fun new_sampler(): Long
-    private external fun free_sampler(sampler: Long)
+    private external fun new_batch(nTokens: Int, embd: Int, nSeqMax: Int): Long
    private external fun bench_model(
        context: Long,
        model: Long,
@@ -71,7 +69,6 @@ class LLamaAndroid {
    private external fun completion_loop(
        context: Long,
        batch: Long,
-        sampler: Long,
        nLen: Int,
        ncur: IntVar
    ): String?
@@ -104,11 +101,8 @@ class LLamaAndroid {
                    val batch = new_batch(512, 0, 1)
                    if (batch == 0L) throw IllegalStateException("new_batch() failed")

-                    val sampler = new_sampler()
-                    if (sampler == 0L) throw IllegalStateException("new_sampler() failed")
-
                    Log.i(tag, "Loaded model $pathToModel")
-                    threadLocalState.set(State.Loaded(model, context, batch, sampler))
+                    threadLocalState.set(State.Loaded(model, context, batch))
                }
                else -> throw IllegalStateException("Model already loaded")
            }
@@ -120,7 +114,7 @@ class LLamaAndroid {
            is State.Loaded -> {
                val ncur = IntVar(completion_init(state.context, state.batch, message, nlen))
                while (ncur.value <= nlen) {
-                    val str = completion_loop(state.context, state.batch, state.sampler, nlen, ncur)
+                    val str = completion_loop(state.context, state.batch, nlen, ncur)
                    if (str == null) {
                        break
                    }
@@ -144,7 +138,6 @@ class LLamaAndroid {
                    free_context(state.context)
                    free_model(state.model)
                    free_batch(state.batch)
-                    free_sampler(state.sampler);

                    threadLocalState.set(State.Idle)
                }
@@ -168,7 +161,7 @@ class LLamaAndroid {

        private sealed interface State {
            data object Idle: State
-            data class Loaded(val model: Long, val context: Long, val batch: Long, val sampler: Long): State
+            data class Loaded(val model: Long, val context: Long, val batch: Long): State
        }

        // Enforce only one instance of Llm.
--- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@@ -24,7 +24,6 @@ func llama_batch_add(_ batch: inout llama_batch, _ id: llama_token, _ pos: llama
 actor LlamaContext {
    private var model: OpaquePointer
    private var context: OpaquePointer
-    private var sampling: UnsafeMutablePointer<llama_sampler>
    private var batch: llama_batch
    private var tokens_list: [llama_token]
    var is_done: Bool = false
@@ -43,15 +42,9 @@ actor LlamaContext {
        self.tokens_list = []
        self.batch = llama_batch_init(512, 0, 1)
        self.temporary_invalid_cchars = []
-        let sparams = llama_sampler_chain_default_params()
-        self.sampling = llama_sampler_chain_init(sparams)
-        llama_sampler_chain_add(self.sampling, llama_sampler_init_temp(0.4))
-        llama_sampler_chain_add(self.sampling, llama_sampler_init_softmax())
-        llama_sampler_chain_add(self.sampling, llama_sampler_init_dist(1234))
    }

    deinit {
-        llama_sampler_free(sampling)
        llama_batch_free(batch)
        llama_free(context)
        llama_free_model(model)
@@ -76,6 +69,7 @@ actor LlamaContext {
        print("Using \(n_threads) threads")

        var ctx_params = llama_context_default_params()
+        ctx_params.seed  = 1234
        ctx_params.n_ctx = 2048
        ctx_params.n_threads       = Int32(n_threads)
        ctx_params.n_threads_batch = Int32(n_threads)
@@ -150,7 +144,20 @@ actor LlamaContext {
    func completion_loop() -> String {
        var new_token_id: llama_token = 0

-        new_token_id = llama_sampler_sample(sampling, context, batch.n_tokens - 1)
+        let n_vocab = llama_n_vocab(model)
+        let logits = llama_get_logits_ith(context, batch.n_tokens - 1)
+
+        var candidates = Array<llama_token_data>()
+        candidates.reserveCapacity(Int(n_vocab))
+
+        for token_id in 0..<n_vocab {
+            candidates.append(llama_token_data(id: token_id, logit: logits![Int(token_id)], p: 0.0))
+        }
+        candidates.withUnsafeMutableBufferPointer() { buffer in
+            var candidates_p = llama_token_data_array(data: buffer.baseAddress, size: buffer.count, sorted: false)
+
+            new_token_id = llama_sample_token_greedy(context, &candidates_p)
+        }

        if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
            print("\n")
--- a/examples/llava/MobileVLM-README.md
+++ b/examples/llava/MobileVLM-README.md
@@ -39,7 +39,7 @@ python ./examples/llava/llava_surgery.py -m path/to/MobileVLM-1.7B
 3. Use `convert_image_encoder_to_gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:

 ```sh
-python ./examples/llava/convert_image_encoder_to_gguf.py \
+python ./examples/llava/convert_image_encoder_to_gguf \
    -m path/to/clip-vit-large-patch14-336 \
    --llava-projector path/to/MobileVLM-1.7B/llava.projector \
    --output-dir path/to/MobileVLM-1.7B \
@@ -47,7 +47,7 @@ python ./examples/llava/convert_image_encoder_to_gguf.py \
 ```

 ```sh
-python ./examples/llava/convert_image_encoder_to_gguf.py \
+python ./examples/llava/convert_image_encoder_to_gguf \
    -m path/to/clip-vit-large-patch14-336 \
    --llava-projector path/to/MobileVLM-1.7B_V2/llava.projector \
    --output-dir path/to/MobileVLM-1.7B_V2 \
@@ -57,12 +57,12 @@ python ./examples/llava/convert_image_encoder_to_gguf.py \
 4. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF:

 ```sh
-python ./examples/convert_legacy_llama.py path/to/MobileVLM-1.7B --skip-unknown
+python ./examples/convert_legacy_llama.py path/to/MobileVLM-1.7B
 ```

-5. Use `quantize` to convert LLaMA part's DataType from `fp32` to `q4_k`
+5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k`
 ```sh
-./llama-quantize path/to/MobileVLM-1.7B/ggml-model-F32.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
+./llama-quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
 ```

 Now both the LLaMA part and the image encoder is in the `MobileVLM-1.7B` directory.
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -2449,7 +2449,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
    ggml_backend_graph_compute(ctx->backend, gf);

    // the last node is the embedding tensor
-    struct ggml_tensor * embeddings = ggml_graph_node(gf, -1);
+    struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1];

    // copy the embeddings to the location passed by the user
    ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -1,12 +1,11 @@
-#include "arg.h"
-#include "base64.hpp"
+#include "ggml.h"
 #include "log.h"
 #include "common.h"
-#include "sampling.h"
 #include "clip.h"
 #include "llava.h"
 #include "llama.h"
-#include "ggml.h"
+
+#include "base64.hpp"

 #include <cstdio>
 #include <cstdlib>
@@ -41,11 +40,11 @@ static bool eval_string(struct llama_context * ctx_llama, const char* str, int n
    return true;
 }

-static const char * sample(struct gpt_sampler * smpl,
+static const char * sample(struct llama_sampling_context * ctx_sampling,
                           struct llama_context * ctx_llama,
                           int * n_past) {
-    const llama_token id = gpt_sampler_sample(smpl, ctx_llama, -1);
-    gpt_sampler_accept(smpl, id, true);
+    const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL);
+    llama_sampling_accept(ctx_sampling, ctx_llama, id, true);
    static std::string ret;
    if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
        ret = "</s>";
@@ -113,7 +112,9 @@ struct llava_context {
    struct llama_model * model = NULL;
 };

-static void print_usage(int, char ** argv) {
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+    gpt_params_print_usage(argc, argv, params);
+
    LOG_TEE("\n example usage:\n");
    LOG_TEE("\n     %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
    LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
@@ -190,15 +191,15 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_

    LOG_TEE("\n");

-    struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams);
-    if (!smpl) {
+    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
+    if (!ctx_sampling) {
        fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
        exit(1);
    }

    std::string response = "";
    for (int i = 0; i < max_tgt_len; i++) {
-        const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past);
+        const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
        response += tmp;
        if (strcmp(tmp, "</s>") == 0) break;
        if (strstr(tmp, "###")) break; // Yi-VL behavior
@@ -210,7 +211,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
        fflush(stdout);
    }

-    gpt_sampler_free(smpl);
+    llama_sampling_free(ctx_sampling);
    printf("\n");
 }

@@ -279,7 +280,8 @@ int main(int argc, char ** argv) {

    gpt_params params;

-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) {
+    if (!gpt_params_parse(argc, argv, params)) {
+        print_usage(argc, argv, params);
        return 1;
    }

@@ -291,7 +293,7 @@ int main(int argc, char ** argv) {
 #endif // LOG_DISABLE_LOGS

    if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
-        print_usage(argc, argv);
+        print_usage(argc, argv, {});
        return 1;
    }
    auto model = llava_init(&params);
@@ -308,7 +310,7 @@ int main(int argc, char ** argv) {
        // process the prompt
        process_prompt(ctx_llava, image_embed, &params, params.prompt);

-        llama_perf_context_print(ctx_llava->ctx_llama);
+        llama_print_timings(ctx_llava->ctx_llama);
        llava_image_embed_free(image_embed);
        ctx_llava->model = NULL;
        llava_free(ctx_llava);
@@ -325,7 +327,7 @@ int main(int argc, char ** argv) {
            // process the prompt
            process_prompt(ctx_llava, image_embed, &params, params.prompt);

-            llama_perf_context_print(ctx_llava->ctx_llama);
+            llama_print_timings(ctx_llava->ctx_llama);
            llava_image_embed_free(image_embed);
            ctx_llava->model = NULL;
            llava_free(ctx_llava);
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -184,7 +184,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
    // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false);
    ggml_build_forward_expand(gf, flatten);
    ggml_graph_compute_with_ctx(model.ctx, gf, 1);
-    struct ggml_tensor* result = ggml_graph_node(gf, -1);
+    struct ggml_tensor* result = gf->nodes[gf->n_nodes - 1];

    memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
    // append without newline tokens (default behavior in llava_arch when not using unpad ):
--- a/examples/llava/minicpmv-cli.cpp
+++ b/examples/llava/minicpmv-cli.cpp
@@ -1,11 +1,9 @@
-#include "arg.h"
+#include "ggml.h"
 #include "log.h"
 #include "common.h"
-#include "sampling.h"
 #include "clip.h"
 #include "llava.h"
 #include "llama.h"
-#include "ggml.h"

 #include <cstdio>
 #include <cstdlib>
@@ -18,8 +16,8 @@ struct llava_context {
 };

 static void show_additional_info(int /*argc*/, char ** argv) {
-    LOG_TEE("\nexample usage:\n\n%s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
-    LOG_TEE("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n");
+    LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
+    LOG_TEE("  note: a lower temperature value like 0.1 is recommended for better quality.\n");
 }

 static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
@@ -165,11 +163,11 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e
    LOG_TEE("%s: image token past: %d\n", __func__, n_past);
 }

-static const char * sample(struct gpt_sampler * smpl,
+static const char * sample(struct llama_sampling_context * ctx_sampling,
                           struct llama_context * ctx_llama,
                           int * n_past) {
-    const llama_token id = gpt_sampler_sample(smpl, ctx_llama, -1);
-    gpt_sampler_accept(smpl, id, true);
+    const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL);
+    llama_sampling_accept(ctx_sampling, ctx_llama, id, true);
    static std::string ret;
    if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
        ret = "</s>";
@@ -216,7 +214,7 @@ static struct llava_context * minicpmv_init(gpt_params * params, const std::stri
    return ctx_llava;
 }

-static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){
+static struct llama_sampling_context * llama_init(struct llava_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){
    std::string user_prompt = prompt;
    int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
    if (!is_first) {
@@ -240,13 +238,13 @@ static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_par

    LOG_TEE("\n");

-    struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams);
-    return smpl;
+    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
+    return ctx_sampling;
 }

-static const char * llama_loop(struct llava_context * ctx_llava,struct gpt_sampler * smpl, int &n_past){
+static const char * llama_loop(struct llava_context * ctx_llava,struct llama_sampling_context * ctx_sampling, int &n_past){

-    const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past);
+    const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
    return tmp;
 }

@@ -255,7 +253,8 @@ int main(int argc, char ** argv) {

    gpt_params params;

-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) {
+    if (!gpt_params_parse(argc, argv, params)) {
+        show_additional_info(argc, argv);
        return 1;
    }

@@ -267,6 +266,7 @@ int main(int argc, char ** argv) {
 #endif // LOG_DISABLE_LOGS

    if (params.mmproj.empty() || (params.image.empty())) {
+        gpt_params_print_usage(argc, argv, params);
        show_additional_info(argc, argv);
        return 1;
    }
@@ -278,12 +278,12 @@ int main(int argc, char ** argv) {
        if (!params.prompt.empty()) {
            LOG_TEE("<user>%s\n", params.prompt.c_str());
            LOG_TEE("<assistant>");
-            auto smpl = llama_init(ctx_llava, &params, params.prompt.c_str(), n_past, true);
+            auto ctx_sampling = llama_init(ctx_llava, &params, params.prompt.c_str(), n_past, true);
            const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
            std::string response = "";
            bool have_tmp = false;
            for (int i = 0; i < max_tgt_len; i++) {
-                auto tmp = llama_loop(ctx_llava, smpl, n_past);
+                auto tmp = llama_loop(ctx_llava, ctx_sampling, n_past);
                response += tmp;
                if (strcmp(tmp, "</s>") == 0){
                    if(!have_tmp)continue;
@@ -296,18 +296,18 @@ int main(int argc, char ** argv) {

                fflush(stdout);
            }
-            gpt_sampler_free(smpl);
+            llama_sampling_free(ctx_sampling);
        }else {
            while (true) {
                LOG_TEE("<user>");
                std::string prompt;
                std::getline(std::cin, prompt);
                LOG_TEE("<assistant>");
-                auto smpl = llama_init(ctx_llava, &params, prompt, n_past, true);
+                auto ctx_sampling = llama_init(ctx_llava, &params, prompt, n_past, true);
                const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
                std::string response = "";
                for (int i = 0; i < max_tgt_len; i++) {
-                    auto tmp = llama_loop(ctx_llava, smpl, n_past);
+                    auto tmp = llama_loop(ctx_llava, ctx_sampling, n_past);
                    response += tmp;
                    if (strcmp(tmp, "</s>") == 0) break;
                    if (strstr(tmp, "###")) break; // Yi-VL behavior
@@ -315,11 +315,11 @@ int main(int argc, char ** argv) {
                    if (strstr(response.c_str(), "<user>")) break; // minicpm-v
                    fflush(stdout);
                }
-                gpt_sampler_free(smpl);
+                llama_sampling_free(ctx_sampling);
            }
        }
        printf("\n");
-        llama_perf_context_print(ctx_llava->ctx_llama);
+        llama_print_timings(ctx_llava->ctx_llama);

        ctx_llava->model = NULL;
        llava_free(ctx_llava);
--- a/examples/lookahead/lookahead.cpp
+++ b/examples/lookahead/lookahead.cpp
@@ -1,8 +1,7 @@
-#include "arg.h"
 #include "common.h"
-#include "sampling.h"
 #include "llama.h"

+#include <cmath>
 #include <cstdio>
 #include <string>
 #include <vector>
@@ -38,7 +37,8 @@ struct ngram_container {
 int main(int argc, char ** argv) {
    gpt_params params;

-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
+    if (!gpt_params_parse(argc, argv, params)) {
+        gpt_params_print_usage(argc, argv, params);
        return 1;
    }

@@ -118,7 +118,7 @@ int main(int argc, char ** argv) {
    llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1);

    // target model sampling context
-    struct gpt_sampler * smpl = gpt_sampler_init(model, params.sparams);
+    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);

    // verification n-grams
    std::vector<ngram_data> ngrams_cur(G);
@@ -159,9 +159,9 @@ int main(int argc, char ** argv) {

    // sample first token
    {
-        id = gpt_sampler_sample(smpl, ctx, 0);
+        id = llama_sampling_sample(ctx_sampling, ctx, NULL, 0);

-        gpt_sampler_accept(smpl, id, true);
+        llama_sampling_accept(ctx_sampling, ctx, id, true);

        {
            const std::string token_str = llama_token_to_piece(ctx, id);
@@ -284,9 +284,9 @@ int main(int argc, char ** argv) {
            }

            // sample the next token
-            id = gpt_sampler_sample(smpl, ctx, i_batch);
+            id = llama_sampling_sample(ctx_sampling, ctx, NULL, i_batch);

-            gpt_sampler_accept(smpl, id, true);
+            llama_sampling_accept(ctx_sampling, ctx, id, true);

            // print
            {
@@ -361,7 +361,7 @@ int main(int argc, char ** argv) {
                if (v == 0) {
                    // sample from the last level
                    for (int i = 0; i < W; i++) {
-                        tokens_j[N - 2][i] = gpt_sampler_sample(smpl, ctx, ngrams_cur.size()*(N-1) + W*(N - 2) + i);
+                        tokens_j[N - 2][i] = llama_sampling_sample(ctx_sampling, ctx, NULL, ngrams_cur.size()*(N-1) + W*(N - 2) + i);
                    }
                } else {
                    for (int i = 0; i < W; i++) {
@@ -468,12 +468,10 @@ int main(int argc, char ** argv) {
    LOG_TEE("n_predict = %d\n", n_predict);
    LOG_TEE("n_accept  = %d\n", n_accept);

-    LOG_TEE("\n");
-    gpt_perf_print(ctx, smpl);
-
-    gpt_sampler_free(smpl);
+    llama_print_timings(ctx);

    llama_kv_cache_view_free(&kvc_view);
+    llama_sampling_free(ctx_sampling);

    llama_batch_free(batch);

--- a/examples/lookup/lookup-create.cpp
+++ b/examples/lookup/lookup-create.cpp
@@ -1,8 +1,7 @@
-#include "arg.h"
-#include "common.h"
-#include "ngram-cache.h"
 #include "ggml.h"
 #include "llama.h"
+#include "common.h"
+#include "ngram-cache.h"

 #include <cstdint>
 #include <fstream>
@@ -14,7 +13,8 @@
 int main(int argc, char ** argv){
    gpt_params params;

-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
+    if (!gpt_params_parse(argc, argv, params)) {
+        gpt_params_print_usage(argc, argv, params);
        return 1;
    }

@@ -40,6 +40,4 @@ int main(int argc, char ** argv){
    fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str());

    llama_ngram_cache_save(ngram_cache, params.lookup_cache_static);
-
-    return 0;
 }
--- a/examples/lookup/lookup-stats.cpp
+++ b/examples/lookup/lookup-stats.cpp
@@ -1,9 +1,8 @@
-#include "arg.h"
+#include "ggml.h"
 #include "common.h"
+#include "llama.h"
 #include "log.h"
 #include "ngram-cache.h"
-#include "llama.h"
-#include "ggml.h"

 #include <cmath>
 #include <cstdint>
@@ -16,7 +15,8 @@
 int main(int argc, char ** argv){
    gpt_params params;

-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
+    if (!gpt_params_parse(argc, argv, params)) {
+        gpt_params_print_usage(argc, argv, params);
        return 1;
    }

--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@@ -1,20 +1,21 @@
-#include "arg.h"
 #include "ggml.h"
+#include "llama.h"
 #include "common.h"
 #include "ngram-cache.h"
-#include "sampling.h"
-#include "llama.h"

+#include <cmath>
 #include <cstdint>
 #include <cstdio>
 #include <fstream>
 #include <string>
 #include <vector>
+#include <unordered_map>

 int main(int argc, char ** argv){
    gpt_params params;

-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
+    if (!gpt_params_parse(argc, argv, params)) {
+        gpt_params_print_usage(argc, argv, params);
        return 1;
    }

@@ -105,7 +106,7 @@ int main(int argc, char ** argv){

    bool has_eos = false;

-    struct gpt_sampler * smpl = gpt_sampler_init(model, params.sparams);
+    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);

    std::vector<llama_token> draft;

@@ -129,9 +130,9 @@ int main(int argc, char ** argv){
        int i_dft = 0;
        while (true) {
            // sample from the target model
-            llama_token id = gpt_sampler_sample(smpl, ctx, i_dft);
+            llama_token id = llama_sampling_sample(ctx_sampling, ctx, NULL, i_dft);

-            gpt_sampler_accept(smpl, id, true);
+            llama_sampling_accept(ctx_sampling, ctx, id, true);

            const std::string token_str = llama_token_to_piece(ctx, id);

@@ -239,11 +240,10 @@ int main(int argc, char ** argv){
    LOG_TEE("n_accept     = %d\n", n_accept);
    LOG_TEE("accept       = %.3f%%\n", 100.0f * n_accept / n_drafted);

-    LOG_TEE("\ntarget:\n\n");
-    gpt_perf_print(ctx, smpl);
-
-    gpt_sampler_free(smpl);
+    LOG_TEE("\ntarget:\n");
+    llama_print_timings(ctx);

+    llama_sampling_free(ctx_sampling);
    llama_batch_free(batch_tgt);

    llama_free(ctx);
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -1,7 +1,6 @@
-#include "arg.h"
 #include "common.h"
+
 #include "console.h"
-#include "sampling.h"
 #include "llama.h"

 #include <cassert>
@@ -34,7 +33,6 @@

 static llama_context           ** g_ctx;
 static llama_model             ** g_model;
-static gpt_sampler             ** g_smpl;
 static gpt_params               * g_params;
 static std::vector<llama_token> * g_input_tokens;
 static std::ostringstream       * g_output_ss;
@@ -42,13 +40,6 @@ static std::vector<llama_token> * g_output_tokens;
 static bool is_interacting  = false;
 static bool need_insert_eot = false;

-static void print_usage(int, char ** argv) {
-    printf("\nexample usage:\n");
-    printf("\n  text generation:     %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]);
-    printf("\n  chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]);
-    printf("\n");
-}
-
 static bool file_exists(const std::string & path) {
    std::ifstream f(path.c_str());
    return f.good();
@@ -101,7 +92,7 @@ static void write_logfile(
    yaml_dump_string_multiline(logfile, "output", output.c_str());
    yaml_dump_vector_int(logfile, "output_tokens", output_tokens);

-    llama_perf_dump_yaml(logfile, ctx);
+    llama_dump_timing_info_yaml(logfile, ctx);
    fclose(logfile);
 }

@@ -114,7 +105,7 @@ static void sigint_handler(int signo) {
        } else {
            console::cleanup();
            printf("\n");
-            gpt_perf_print(*g_ctx, *g_smpl);
+            llama_print_timings(*g_ctx);
            write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
            _exit(130);
        }
@@ -130,7 +121,8 @@ static void llama_log_callback_logTee(ggml_log_level level, const char * text, v

 static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, std::string role, std::string content) {
    llama_chat_msg new_msg{role, content};
-    auto formatted = llama_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, role == "user");
+    auto formatted = llama_chat_format_single(
+        model, g_params->chat_template, chat_msgs, new_msg, role == "user");
    chat_msgs.push_back({role, content});
    LOG("formatted: %s\n", formatted.c_str());
    return formatted;
@@ -139,11 +131,13 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector<l
 int main(int argc, char ** argv) {
    gpt_params params;
    g_params = &params;
-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
+
+    if (!gpt_params_parse(argc, argv, params)) {
+        gpt_params_print_usage(argc, argv, params);
        return 1;
    }

-    auto & sparams = params.sparams;
+    llama_sampling_params & sparams = params.sparams;

 #ifndef LOG_DISABLE_LOGS
    log_set_target(log_filename_generator("main", "log"));
@@ -189,21 +183,27 @@ int main(int argc, char ** argv) {
        LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
    }

-    print_build_info();
+    LOG_TEE("%s: build = %d (%s)\n",      __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
+    LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
+
+    if (params.seed == LLAMA_DEFAULT_SEED) {
+        params.seed = time(NULL);
+    }
+
+    LOG_TEE("%s: seed  = %u\n", __func__, params.seed);
+
+    std::mt19937 rng(params.seed);

    LOG("%s: llama backend init\n", __func__);
    llama_backend_init();
    llama_numa_init(params.numa);

-    llama_model * model = nullptr;
-    llama_context * ctx = nullptr;
-    gpt_sampler * smpl = nullptr;
-
+    llama_model * model;
+    llama_context * ctx;
+    llama_context * ctx_guidance = NULL;
    std::vector<llama_chat_msg> chat_msgs;
-
    g_model = &model;
    g_ctx = &ctx;
-    g_smpl = &smpl;

    // load the model and apply lora adapter, if any
    LOG("%s: load the model and apply lora adapter, if any\n", __func__);
@@ -211,6 +211,10 @@ int main(int argc, char ** argv) {

    model = llama_init.model;
    ctx = llama_init.context;
+    if (sparams.cfg_scale > 1.f) {
+        struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
+        ctx_guidance = llama_new_context_with_model(model, lparams);
+    }

    if (model == NULL) {
        LOG_TEE("%s: error: unable to load model\n", __func__);
@@ -247,6 +251,9 @@ int main(int argc, char ** argv) {
    }

    llama_attach_threadpool(ctx, threadpool, threadpool_batch);
+    if (ctx_guidance) {
+        llama_attach_threadpool(ctx_guidance, threadpool, threadpool_batch);
+    }

    const int n_ctx_train = llama_n_ctx_train(model);
    const int n_ctx = llama_n_ctx(ctx);
@@ -330,6 +337,24 @@ int main(int argc, char ** argv) {
    }

    // Tokenize negative prompt
+    std::vector<llama_token> guidance_inp;
+    int guidance_offset = 0;
+    int original_prompt_len = 0;
+    if (ctx_guidance) {
+        LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
+
+        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true, true);
+        LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
+
+        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true, true);
+        LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
+
+        original_prompt_len = original_inp.size();
+        guidance_offset = (int)guidance_inp.size() - original_prompt_len;
+        LOG("original_prompt_len: %s", log_tostr(original_prompt_len));
+        LOG("guidance_offset:     %s", log_tostr(guidance_offset));
+    }
+
    if ((int) embd_inp.size() > n_ctx - 4) {
        LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
        return 1;
@@ -396,6 +421,15 @@ int main(int argc, char ** argv) {
            LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
        }

+        if (ctx_guidance) {
+            LOG_TEE("\n");
+            LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
+            LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
+            for (int i = 0; i < (int) guidance_inp.size(); i++) {
+                LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
+            }
+        }
+
        if (params.n_keep > add_bos) {
            LOG_TEE("%s: static prompt based on n_keep: '", __func__);
            for (int i = 0; i < params.n_keep; i++) {
@@ -461,17 +495,8 @@ int main(int argc, char ** argv) {
            }
        }
    }
-
-    smpl = gpt_sampler_init(model, sparams);
-    if (!smpl) {
-        fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
-        exit(1);
-    }
-
-    LOG_TEE("sampling seed: %u\n", gpt_sampler_get_seed(smpl));
-    LOG_TEE("sampling params: \n%s\n", sparams.print().c_str());
-    LOG_TEE("sampler constr: \n%s\n", gpt_sampler_print(smpl).c_str());
-
+    LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
+    LOG_TEE("sampling order: \n%s\n", llama_sampling_order_print(sparams).c_str());
    LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);

    // group-attention state
@@ -518,6 +543,7 @@ int main(int argc, char ** argv) {
    int n_remain           = params.n_predict;
    int n_consumed         = 0;
    int n_session_consumed = 0;
+    int n_past_guidance    = 0;

    std::vector<int>   input_tokens;  g_input_tokens  = &input_tokens;
    std::vector<int>   output_tokens; g_output_tokens = &output_tokens;
@@ -529,6 +555,7 @@ int main(int argc, char ** argv) {
    display = params.display_prompt;

    std::vector<llama_token> embd;
+    std::vector<llama_token> embd_guidance;

    // tokenized antiprompts
    std::vector<std::vector<llama_token>> antiprompt_ids;
@@ -538,6 +565,12 @@ int main(int argc, char ** argv) {
        antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true));
    }

+    struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
+    if (!ctx_sampling) {
+        fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
+        exit(1);
+    }
+
    if (llama_model_has_encoder(model)) {
        int enc_input_size = embd_inp.size();
        llama_token * enc_input_buf = embd_inp.data();
@@ -579,7 +612,7 @@ int main(int argc, char ** argv) {
                // if we run out of context:
                // - take the n_keep first tokens from the original prompt (via n_past)
                // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
-                if (n_past + (int) embd.size() >= n_ctx) {
+                if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) >= n_ctx) {
                    if (params.n_predict == -2) {
                        LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
                        break;
@@ -596,7 +629,11 @@ int main(int argc, char ** argv) {

                    n_past -= n_discard;

-                    LOG("after swap: n_past = %d\n", n_past);
+                    if (ctx_guidance) {
+                        n_past_guidance -= n_discard;
+                    }
+
+                    LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);

                    LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());

@@ -649,6 +686,46 @@ int main(int argc, char ** argv) {
                }
            }

+            // evaluate tokens in batches
+            // embd is typically prepared beforehand to fit within a batch, but not always
+            if (ctx_guidance) {
+                int input_size = 0;
+                llama_token * input_buf = NULL;
+
+                if (n_past_guidance < (int) guidance_inp.size()) {
+                    // Guidance context should have the same data with these modifications:
+                    //
+                    // * Replace the initial prompt
+                    // * Shift everything by guidance_offset
+                    embd_guidance = guidance_inp;
+                    if (embd.begin() + original_prompt_len < embd.end()) {
+                        embd_guidance.insert(
+                            embd_guidance.end(),
+                            embd.begin() + original_prompt_len,
+                            embd.end()
+                        );
+                    }
+
+                    input_buf  = embd_guidance.data();
+                    input_size = embd_guidance.size();
+
+                    LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str());
+                } else {
+                    input_buf  = embd.data();
+                    input_size = embd.size();
+                }
+
+                for (int i = 0; i < input_size; i += params.n_batch) {
+                    int n_eval = std::min(input_size - i, params.n_batch);
+                    if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0))) {
+                        LOG_TEE("%s : failed to eval\n", __func__);
+                        return 1;
+                    }
+
+                    n_past_guidance += n_eval;
+                }
+            }
+
            for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
                int n_eval = (int) embd.size() - i;
                if (n_eval > params.n_batch) {
@@ -678,6 +755,7 @@ int main(int argc, char ** argv) {
        }

        embd.clear();
+        embd_guidance.clear();

        if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
            // optionally save the session on first sample (for faster prompt loading next time)
@@ -688,11 +766,11 @@ int main(int argc, char ** argv) {
                LOG("saved session to %s\n", path_session.c_str());
            }

-            const llama_token id = gpt_sampler_sample(smpl, ctx, -1);
+            const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);

-            gpt_sampler_accept(smpl, id, /* apply_grammar= */ true);
+            llama_sampling_accept(ctx_sampling, ctx, id, /* apply_grammar= */ true);

-            // LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, smpl->prev.to_vector()).c_str());
+            LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());

            embd.push_back(id);

@@ -711,7 +789,7 @@ int main(int argc, char ** argv) {

                // push the prompt in the sampling context in order to apply repetition penalties later
                // for the prompt, we don't apply grammar rules
-                gpt_sampler_accept(smpl, embd_inp[n_consumed], /* apply_grammar= */ false);
+                llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], /* apply_grammar= */ false);

                ++n_consumed;
                if ((int) embd.size() >= params.n_batch) {
@@ -754,7 +832,7 @@ int main(int argc, char ** argv) {
            // check for reverse prompt in the last n_prev tokens
            if (!params.antiprompt.empty()) {
                const int n_prev = 32;
-                const std::string last_output = gpt_sampler_prev_str(smpl, ctx, n_prev);
+                const std::string last_output = llama_sampling_prev_str(ctx_sampling, ctx, n_prev);

                is_antiprompt = false;
                // Check if each of the reverse prompts appears at the end of the output.
@@ -776,7 +854,7 @@ int main(int argc, char ** argv) {
                }

                // check for reverse prompt using special tokens
-                llama_token last_token = gpt_sampler_last(smpl);
+                llama_token last_token = llama_sampling_last(ctx_sampling);
                for (std::vector<llama_token> ids : antiprompt_ids) {
                    if (ids.size() == 1 && last_token == ids[0]) {
                        if (params.interactive) {
@@ -793,7 +871,7 @@ int main(int argc, char ** argv) {
            }

            // deal with end of generation tokens in interactive mode
-            if (llama_token_is_eog(model, gpt_sampler_last(smpl))) {
+            if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) {
                LOG("found an EOG token\n");

                if (params.interactive) {
@@ -814,7 +892,7 @@ int main(int argc, char ** argv) {

            // if current token is not EOG, we add it to current assistant message
            if (params.conversation) {
-                const auto id = gpt_sampler_last(smpl);
+                auto id = llama_sampling_last(ctx_sampling);
                assistant_ss << llama_token_to_piece(ctx, id, false);
            }

@@ -910,7 +988,7 @@ int main(int argc, char ** argv) {

            if (n_past > 0) {
                if (is_interacting) {
-                    gpt_sampler_reset(smpl);
+                    llama_sampling_reset(ctx_sampling);
                }
                is_interacting = false;
            }
@@ -935,15 +1013,14 @@ int main(int argc, char ** argv) {
        llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
    }

-    LOG_TEE("\n");
-    gpt_perf_print(ctx, smpl);
+    llama_print_timings(ctx);
    write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);

-    gpt_sampler_free(smpl);
-
+    if (ctx_guidance) { llama_free(ctx_guidance); }
    llama_free(ctx);
    llama_free_model(model);

+    llama_sampling_free(ctx_sampling);
    llama_backend_free();

    ggml_threadpool_free(threadpool);
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -1,9 +1,7 @@
 // A basic application simulating a server with multiple clients.
 // The clients submit requests to the server and they are processed in parallel.

-#include "arg.h"
 #include "common.h"
-#include "sampling.h"
 #include "llama.h"

 #include <cmath>
@@ -52,8 +50,8 @@ static std::vector<std::string> k_prompts = {

 struct client {
    ~client() {
-        if (smpl) {
-            gpt_sampler_free(smpl);
+        if (ctx_sampling) {
+            llama_sampling_free(ctx_sampling);
        }
    }

@@ -74,7 +72,7 @@ struct client {
    std::string prompt;
    std::string response;

-    struct gpt_sampler * smpl = nullptr;
+    struct llama_sampling_context * ctx_sampling = nullptr;
 };

 static void print_date_time() {
@@ -102,7 +100,8 @@ int main(int argc, char ** argv) {

    gpt_params params;

-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
+    if (!gpt_params_parse(argc, argv, params)) {
+        gpt_params_print_usage(argc, argv, params);
        return 1;
    }

@@ -162,7 +161,7 @@ int main(int argc, char ** argv) {
    for (size_t i = 0; i < clients.size(); ++i) {
        auto & client = clients[i];
        client.id = i;
-        client.smpl = gpt_sampler_init(model, params.sparams);
+        client.ctx_sampling = llama_sampling_init(params.sparams);
    }

    std::vector<llama_token> tokens_system;
@@ -254,7 +253,7 @@ int main(int argc, char ** argv) {
                    client.prompt   = client.input + "\nAssistant:";
                    client.response = "";

-                    gpt_sampler_reset(client.smpl);
+                    llama_sampling_reset(client.ctx_sampling);

                    // do not prepend BOS because we have a system prompt!
                    std::vector<llama_token> tokens_prompt;
@@ -342,9 +341,9 @@ int main(int argc, char ** argv) {
                //printf("client %d, seq %d, token %d, pos %d, batch %d\n",
                //        client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch);

-                const llama_token id = gpt_sampler_sample(client.smpl, ctx, client.i_batch - i);
+                const llama_token id = llama_sampling_sample(client.ctx_sampling, ctx, NULL, client.i_batch - i);

-                gpt_sampler_accept(client.smpl, id, true);
+                llama_sampling_accept(client.ctx_sampling, ctx, id, true);

                if (client.n_decoded == 1) {
                    // start measuring generation time after the first token to make sure all concurrent clients
@@ -372,7 +371,7 @@ int main(int argc, char ** argv) {
                    }

                    // delete only the generated part of the sequence, i.e. keep the system prompt in the cache
-                    llama_kv_cache_seq_rm(ctx,    client.id + 1, -1, -1);
+                    llama_kv_cache_seq_rm(ctx, client.id + 1, -1, -1);
                    llama_kv_cache_seq_cp(ctx, 0, client.id + 1, -1, -1);

                    const auto t_main_end = ggml_time_us();
@@ -414,8 +413,7 @@ int main(int argc, char ** argv) {

    LOG_TEE("\n");

-    // TODO: print sampling/grammar timings for all clients
-    llama_perf_context_print(ctx);
+    llama_print_timings(ctx);

    llama_batch_free(batch);

--- a/examples/passkey/passkey.cpp
+++ b/examples/passkey/passkey.cpp
@@ -1,4 +1,3 @@
-#include "arg.h"
 #include "common.h"
 #include "llama.h"

@@ -7,7 +6,9 @@
 #include <string>
 #include <vector>

-static void print_usage(int, char ** argv) {
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+    gpt_params_print_usage(argc, argv, params);
+
    LOG_TEE("\nexample usage:\n");
    LOG_TEE("\n    %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]);
    LOG_TEE("\n");
@@ -20,10 +21,13 @@ int main(int argc, char ** argv) {
    params.n_keep = 32;
    params.i_pos  = -1;

-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PASSKEY, print_usage)) {
+    if (!gpt_params_parse(argc, argv, params)) {
+        print_usage(argc, argv, params);
        return 1;
    }

+    srand(params.seed == LLAMA_DEFAULT_SEED ? time(NULL) : params.seed);
+
    int n_junk = params.n_junk;
    int n_keep = params.n_keep;
    int n_grp  = params.grp_attn_n;
@@ -76,17 +80,12 @@ int main(int argc, char ** argv) {
    GGML_ASSERT(ctx_params.n_batch % n_grp == 0 && "n_batch must be divisible by n_grp");

    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
+
    if (ctx == NULL) {
        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
        return 1;
    }

-    auto sparams = llama_sampler_chain_default_params();
-
-    llama_sampler * smpl = llama_sampler_chain_init(sparams);
-
-    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
-
    // tokenize the prompt
    std::vector<llama_token> tokens_list;
    tokens_list = ::llama_tokenize(ctx, params.prompt, true);
@@ -218,7 +217,20 @@ int main(int argc, char ** argv) {
    while (n_cur <= n_len) {
        // sample the next token
        {
-            const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1);
+            auto   n_vocab = llama_n_vocab(model);
+            auto * logits  = llama_get_logits_ith(ctx, batch.n_tokens - 1);
+
+            std::vector<llama_token_data> candidates;
+            candidates.reserve(n_vocab);
+
+            for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+                candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
+            }
+
+            llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+
+            // sample the most likely token
+            const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);

            // is it an end of generation?
            if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
@@ -255,13 +267,10 @@ int main(int argc, char ** argv) {
    LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
            __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));

-    LOG_TEE("\n");
-    llama_perf_context_print(ctx);
+    llama_print_timings(ctx);

    fprintf(stderr, "\n");

-    llama_sampler_free(smpl);
-
    llama_batch_free(batch);

    llama_free(ctx);
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -1,19 +1,18 @@
-#include "arg.h"
 #include "common.h"
 #include "llama.h"

-#include <array>
-#include <atomic>
 #include <cmath>
 #include <cstdio>
 #include <cstring>
 #include <ctime>
-#include <fstream>
-#include <mutex>
-#include <random>
 #include <sstream>
 #include <thread>
+#include <mutex>
+#include <atomic>
 #include <vector>
+#include <array>
+#include <fstream>
+#include <sstream>

 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -77,7 +76,7 @@ static void write_logfile(
    fprintf(logfile, "ppl_value: %f\n", results.ppl_value);
    yaml_dump_vector_float(logfile, "probs", results.probs);

-    llama_perf_dump_yaml(logfile, ctx);
+    llama_dump_timing_info_yaml(logfile, ctx);
    fclose(logfile);
 }

@@ -1968,7 +1967,8 @@ int main(int argc, char ** argv) {
    params.n_ctx = 512;
    params.logits_all = true;

-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {
+    if (!gpt_params_parse(argc, argv, params)) {
+        gpt_params_print_usage(argc, argv, params);
        return 1;
    }

@@ -2007,6 +2007,14 @@ int main(int argc, char ** argv) {

    print_build_info();

+    if (params.seed == LLAMA_DEFAULT_SEED) {
+        params.seed = time(NULL);
+    }
+
+    fprintf(stderr, "%s: seed  = %u\n", __func__, params.seed);
+
+    std::mt19937 rng(params.seed);
+
    llama_backend_init();
    llama_numa_init(params.numa);

@@ -2046,8 +2054,7 @@ int main(int argc, char ** argv) {
        results = perplexity(ctx, params, n_ctx);
    }

-    LOG_TEE("\n");
-    llama_perf_context_print(ctx);
+    llama_print_timings(ctx);
    write_logfile(ctx, params, model, results);

    llama_free(ctx);
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -1,7 +1,7 @@
+#define LLAMA_API_INTERNAL
 #include "common.h"
 #include "ggml.h"
 #include "llama.h"
-#include "llama-impl.h"

 #include <algorithm>
 #include <cassert>
@@ -319,7 +319,8 @@ int main(int argc, char ** argv) {
        }

        auto cparams = llama_context_default_params();
-        cparams.n_ctx = 256;
+        cparams.n_ctx      = 256;
+        cparams.seed       = 1;

        ctx = llama_new_context_with_model(model, cparams);

--- a/examples/quantize/CMakeLists.txt
+++ b/examples/quantize/CMakeLists.txt
@@ -1,6 +1,6 @@
 set(TARGET llama-quantize)
 add_executable(${TARGET} quantize.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
 target_include_directories(${TARGET} PRIVATE ../../common)
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/quantize/README.md
+++ b/examples/quantize/README.md
@@ -54,8 +54,6 @@ As the models are currently fully loaded into memory, you will need adequate dis

 Several quantization methods are supported. They differ in the resulting model disk size and inference speed.

-The quantization formats `Q4_0_4_4`, `Q4_0_4_8` and `Q4_0_8_8` are block interleaved variants of the `Q4_0` format, providing a data layout that is better suited for specific implementations of optimized mulmat kernels. Since these formats differ only in data layout, they have the same quantized size as the `Q4_0` format.
-
 *(outdated)*

 | Model | Measure      |    F16 |   Q4_0 |   Q4_1 |   Q5_0 |   Q5_1 |   Q8_0 |
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -26,8 +26,6 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
    { "IQ2_M",    LLAMA_FTYPE_MOSTLY_IQ2_M,    " 2.7  bpw quantization",            },
    { "IQ1_S",    LLAMA_FTYPE_MOSTLY_IQ1_S,    " 1.56 bpw quantization",            },
    { "IQ1_M",    LLAMA_FTYPE_MOSTLY_IQ1_M,    " 1.75 bpw quantization",            },
-    { "TQ1_0",    LLAMA_FTYPE_MOSTLY_TQ1_0,    " 1.69 bpw ternarization",           },
-    { "TQ2_0",    LLAMA_FTYPE_MOSTLY_TQ2_0,    " 2.06 bpw ternarization",           },
    { "Q2_K",     LLAMA_FTYPE_MOSTLY_Q2_K,     " 2.96G, +3.5199 ppl @ Llama-3-8B",  },
    { "Q2_K_S",   LLAMA_FTYPE_MOSTLY_Q2_K_S,   " 2.96G, +3.1836 ppl @ Llama-3-8B",  },
    { "IQ3_XXS",  LLAMA_FTYPE_MOSTLY_IQ3_XXS,  " 3.06 bpw quantization",            },
--- a/examples/retrieval/retrieval.cpp
+++ b/examples/retrieval/retrieval.cpp
@@ -1,11 +1,12 @@
-#include "arg.h"
 #include "common.h"
 #include "llama.h"

 #include <algorithm>
 #include <fstream>

-static void print_usage(int, char ** argv) {
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+    gpt_params_print_usage(argc, argv, params);
+
    LOG_TEE("\nexample usage:\n");
    LOG_TEE("\n    %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]);
    LOG_TEE("\n");
@@ -112,7 +113,8 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
 int main(int argc, char ** argv) {
    gpt_params params;

-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_RETRIEVAL, print_usage)) {
+    if (!gpt_params_parse(argc, argv, params)) {
+        print_usage(argc, argv, params);
        return 1;
    }

@@ -291,11 +293,9 @@ int main(int argc, char ** argv) {
        }
    }

-    LOG_TEE("\n");
-    llama_perf_context_print(ctx);
-
    // clean up
    llama_batch_free(query_batch);
+    llama_print_timings(ctx);
    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
--- a/examples/rpc/README.md
+++ b/examples/rpc/README.md
@@ -10,21 +10,20 @@ This can be used for distributed LLM inference with `llama.cpp` in the following

 ```mermaid
 flowchart TD
-    rpcb<-->|TCP|srva
-    rpcb<-->|TCP|srvb
-    rpcb<-.->|TCP|srvn
+    rpcb---|TCP|srva
+    rpcb---|TCP|srvb
+    rpcb-.-|TCP|srvn
    subgraph hostn[Host N]
-    srvn[rpc-server]<-.->backend3["Backend (CUDA,Metal,etc.)"]
+    srvn[rpc-server]-.-backend3["Backend (CUDA,Metal,etc.)"]
    end
    subgraph hostb[Host B]
-    srvb[rpc-server]<-->backend2["Backend (CUDA,Metal,etc.)"]
+    srvb[rpc-server]---backend2["Backend (CUDA,Metal,etc.)"]
    end
    subgraph hosta[Host A]
-    srva[rpc-server]<-->backend["Backend (CUDA,Metal,etc.)"]
+    srva[rpc-server]---backend["Backend (CUDA,Metal,etc.)"]
    end
    subgraph host[Main Host]
-    local["Backend (CUDA,Metal,etc.)"]<-->ggml[llama-cli]
-    ggml[llama-cli]<-->rpcb[RPC backend]
+    ggml[llama.cpp]---rpcb[RPC backend]
    end
    style hostn stroke:#66,stroke-width:2px,stroke-dasharray: 5 5
 ```
@@ -63,12 +62,17 @@ $ CUDA_VISIBLE_DEVICES=0 bin/rpc-server -p 50052
 This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device.


-On the main host build `llama.cpp` for the local backend and add `-DGGML_RPC=ON` to the build options.
-Finally, when running `llama-cli`, use the `--rpc` option to specify the host and port of each `rpc-server`:
+On the main host, build `llama.cpp` with only the `-DGGML_RPC=ON` option:
+
+```bash
+mkdir build-rpc
+cd build-rpc
+cmake .. -DGGML_RPC=ON
+cmake --build . --config Release
+```
+
+Finally, use the `--rpc` option to specify the host and port of each `rpc-server`:

 ```bash
 $ bin/llama-cli -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99
 ```
-
-This way you can offload model layers to both local and remote devices.
-
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -1,17 +1,17 @@
-#include "arg.h"
 #include "common.h"
 #include "llama.h"

 #include <vector>
 #include <cstdio>
+#include <chrono>

 int main(int argc, char ** argv) {
    gpt_params params;

    params.prompt = "The quick brown fox";
-    params.sparams.seed = 1234;

-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
+    if (!gpt_params_parse(argc, argv, params)) {
+        gpt_params_print_usage(argc, argv, params);
        return 1;
    }

@@ -38,13 +38,6 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    auto sparams = llama_sampler_chain_default_params();
-
-    llama_sampler * smpl = llama_sampler_chain_init(sparams);
-
-    llama_sampler_chain_add(smpl, llama_sampler_init_softmax());
-    llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sparams.seed));
-
    // tokenize prompt
    auto tokens = llama_tokenize(ctx, params.prompt, true);

@@ -71,7 +64,16 @@ int main(int argc, char ** argv) {
    printf("\nfirst run: %s", params.prompt.c_str());

    for (auto i = 0; i < params.n_predict; i++) {
-        auto next_token     = llama_sampler_sample(smpl, ctx, -1);
+        auto * logits = llama_get_logits(ctx);
+        auto n_vocab = llama_n_vocab(model);
+
+        std::vector<llama_token_data> candidates;
+        candidates.reserve(n_vocab);
+        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+            candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+        }
+        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+        auto next_token = llama_sample_token(ctx, &candidates_p);
        auto next_token_str = llama_token_to_piece(ctx, next_token);

        printf("%s", next_token_str.c_str());
@@ -94,11 +96,6 @@ int main(int argc, char ** argv) {
    // make new context
    auto * ctx2 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));

-    llama_sampler * smpl2 = llama_sampler_chain_init(sparams);
-
-    llama_sampler_chain_add(smpl2, llama_sampler_init_softmax());
-    llama_sampler_chain_add(smpl2, llama_sampler_init_dist(params.sparams.seed));
-
    printf("\nsecond run: %s", params.prompt.c_str());

    // load state (rng, logits, embedding and kv_cache) from file
@@ -127,7 +124,15 @@ int main(int argc, char ** argv) {

    // second run
    for (auto i = 0; i < params.n_predict; i++) {
-        auto next_token     = llama_sampler_sample(smpl2, ctx2, -1);
+        auto * logits = llama_get_logits(ctx2);
+        auto n_vocab = llama_n_vocab(model);
+        std::vector<llama_token_data> candidates;
+        candidates.reserve(n_vocab);
+        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+            candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+        }
+        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+        auto next_token = llama_sample_token(ctx2, &candidates_p);
        auto next_token_str = llama_token_to_piece(ctx2, next_token);

        printf("%s", next_token_str.c_str());
@@ -152,12 +157,7 @@ int main(int argc, char ** argv) {
    }

    // make new context
-    auto * ctx3 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));
-
-    llama_sampler * smpl3 = llama_sampler_chain_init(sparams);
-
-    llama_sampler_chain_add(smpl3, llama_sampler_init_softmax());
-    llama_sampler_chain_add(smpl3, llama_sampler_init_dist(params.sparams.seed));
+    auto* ctx3 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));

    printf("\nsingle seq run: %s", params.prompt.c_str());

@@ -215,7 +215,15 @@ int main(int argc, char ** argv) {

    // third run with seq 1 instead of 0
    for (auto i = 0; i < params.n_predict; i++) {
-        auto next_token     = llama_sampler_sample(smpl3, ctx3, -1);
+        auto * logits = llama_get_logits(ctx3);
+        auto n_vocab = llama_n_vocab(model);
+        std::vector<llama_token_data> candidates;
+        candidates.reserve(n_vocab);
+        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+            candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+        }
+        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+        auto next_token = llama_sample_token(ctx3, &candidates_p);
        auto next_token_str = llama_token_to_piece(ctx3, next_token);

        printf("%s", next_token_str.c_str());
@@ -232,10 +240,6 @@ int main(int argc, char ** argv) {

    printf("\n");

-    llama_sampler_free(smpl);
-    llama_sampler_free(smpl2);
-    llama_sampler_free(smpl3);
-
    llama_free(ctx3);
    llama_free_model(model);

--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@@ -30,7 +30,6 @@ set(PUBLIC_ASSETS
    system-prompts.js
    prompt-formats.js
    json-schema-to-grammar.mjs
-    loading.html
 )

 foreach(asset ${PUBLIC_ASSETS})
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -17,126 +17,262 @@ The project is under active development, and we are [looking for feedback and co

 ## Usage

-| Argument | Explanation |
-| -------- | ----------- |
-| `-h, --help, --usage` | print usage and exit |
-| `--version` | show version and build info |
-| `-v, --verbose` | print verbose information |
-| `--verbosity N` | set specific verbosity level (default: 0) |
-| `-t, --threads N` | number of threads to use during generation (default: -1)<br/>(env: LLAMA_ARG_THREADS) |
-| `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) |
-| `-C, --cpu-mask M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: "") |
-| `-Cr, --cpu-range lo-hi` | range of CPUs for affinity. Complements --cpu-mask |
-| `--cpu-strict <0\|1>` | use strict CPU placement (default: 0)<br/> |
-| `--prio N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0)<br/> |
-| `--poll <0...100>` | use polling level to wait for work (0 - no polling, default: 50)<br/> |
-| `-Cb, --cpu-mask-batch M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask) |
-| `-Crb, --cpu-range-batch lo-hi` | ranges of CPUs for affinity. Complements --cpu-mask-batch |
-| `--cpu-strict-batch <0\|1>` | use strict CPU placement (default: same as --cpu-strict) |
-| `--prio-batch N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0)<br/> |
-| `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll) |
-| `-c, --ctx-size N` | size of the prompt context (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE) |
-| `-n, --predict, --n-predict N` | number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)<br/>(env: LLAMA_ARG_N_PREDICT) |
-| `-b, --batch-size N` | logical maximum batch size (default: 2048)<br/>(env: LLAMA_ARG_BATCH) |
-| `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
-| `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) |
-| `-fa, --flash-attn` | enable Flash Attention (default: disabled)<br/>(env: LLAMA_ARG_FLASH_ATTN) |
-| `-p, --prompt PROMPT` | prompt to start generation with |
-| `-f, --file FNAME` | a file containing the prompt (default: none) |
-| `-bf, --binary-file FNAME` | binary file containing the prompt (default: none) |
-| `-e, --escape` | process escapes sequences (\n, \r, \t, \', \", \\) (default: true) |
-| `--no-escape` | do not process escape sequences |
-| `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
-| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: top_k;tfs_z;typ_p;top_p;min_p;temperature) |
-| `-s, --seed SEED` | RNG seed (default: -1, use random seed for < 0) |
-| `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: kfypmt) |
-| `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
-| `--penalize-nl` | penalize newline tokens (default: false) |
-| `--temp N` | temperature (default: 0.8) |
-| `--top-k N` | top-k sampling (default: 40, 0 = disabled) |
-| `--top-p N` | top-p sampling (default: 0.9, 1.0 = disabled) |
-| `--min-p N` | min-p sampling (default: 0.1, 0.0 = disabled) |
-| `--tfs N` | tail free sampling, parameter z (default: 1.0, 1.0 = disabled) |
-| `--typical N` | locally typical sampling, parameter p (default: 1.0, 1.0 = disabled) |
-| `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) |
-| `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled) |
-| `--presence-penalty N` | repeat alpha presence penalty (default: 0.0, 0.0 = disabled) |
-| `--frequency-penalty N` | repeat alpha frequency penalty (default: 0.0, 0.0 = disabled) |
-| `--dynatemp-range N` | dynamic temperature range (default: 0.0, 0.0 = disabled) |
-| `--dynatemp-exp N` | dynamic temperature exponent (default: 1.0) |
-| `--mirostat N` | use Mirostat sampling.<br/>Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.<br/>(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) |
-| `--mirostat-lr N` | Mirostat learning rate, parameter eta (default: 0.1) |
-| `--mirostat-ent N` | Mirostat target entropy, parameter tau (default: 5.0) |
-| `-l, --logit-bias TOKEN_ID(+/-)BIAS` | modifies the likelihood of token appearing in the completion,<br/>i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',<br/>or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' |
-| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') |
-| `--grammar-file FNAME` | file to read grammar from |
-| `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object<br/>For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead |
-| `--rope-scaling {none,linear,yarn}` | RoPE frequency scaling method, defaults to linear unless specified by the model |
-| `--rope-scale N` | RoPE context scaling factor, expands context by a factor of N |
-| `--rope-freq-base N` | RoPE base frequency, used by NTK-aware scaling (default: loaded from model) |
-| `--rope-freq-scale N` | RoPE frequency scaling factor, expands context by a factor of 1/N |
-| `--yarn-orig-ctx N` | YaRN: original context size of model (default: 0 = model training context size) |
-| `--yarn-ext-factor N` | YaRN: extrapolation mix factor (default: -1.0, 0.0 = full interpolation) |
-| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: 1.0) |
-| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: 1.0) |
-| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: 32.0) |
-| `-gan, --grp-attn-n N` | group-attention factor (default: 1) |
-| `-gaw, --grp-attn-w N` | group-attention width (default: 512.0) |
-| `-dkvc, --dump-kv-cache` | verbose print of the KV cache |
-| `-nkvo, --no-kv-offload` | disable KV offload |
-| `-ctk, --cache-type-k TYPE` | KV cache data type for K (default: f16) |
-| `-ctv, --cache-type-v TYPE` | KV cache data type for V (default: f16) |
-| `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: -1.0, < 0 - disabled)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
-| `-np, --parallel N` | number of parallel sequences to decode (default: 1) |
-| `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
-| `-nocb, --no-cont-batching` | disable continuous batching<br/>(env: LLAMA_ARG_NO_CONT_BATCHING) |
-| `--mlock` | force system to keep model in RAM rather than swapping or compressing |
-| `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock) |
-| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggerganov/llama.cpp/issues/1437 |
-| `-ngl, --gpu-layers, --n-gpu-layers N` | number of layers to store in VRAM<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
-| `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs |
-| `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1 |
-| `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0) |
-| `--check-tensors` | check model tensor data for invalid values (default: false) |
-| `--override-kv KEY=TYPE:VALUE` | advanced option to override model metadata by key. may be specified multiple times.<br/>types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false |
-| `--lora FNAME` | path to LoRA adapter (can be repeated to use multiple adapters) |
-| `--lora-scaled FNAME SCALE` | path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters) |
-| `--control-vector FNAME` | add a control vector<br/>note: this argument can be repeated to add multiple control vectors |
-| `--control-vector-scaled FNAME SCALE` | add a control vector with user defined scaling SCALE<br/>note: this argument can be repeated to add multiple scaled control vectors |
-| `--control-vector-layer-range START END` | layer range to apply the control vector(s) to, start and end inclusive |
-| `-a, --alias STRING` | set alias for model name (to be used by REST API) |
-| `-m, --model FNAME` | model path (default: `models/$filename` with filename from `--hf-file` or `--model-url` if set, otherwise models/7B/ggml-model-f16.gguf)<br/>(env: LLAMA_ARG_MODEL) |
-| `-mu, --model-url MODEL_URL` | model download url (default: unused)<br/>(env: LLAMA_ARG_MODEL_URL) |
-| `-hfr, --hf-repo REPO` | Hugging Face model repository (default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
-| `-hff, --hf-file FILE` | Hugging Face model file (default: unused)<br/>(env: LLAMA_ARG_HF_FILE) |
-| `-hft, --hf-token TOKEN` | Hugging Face access token (default: value from HF_TOKEN environment variable)<br/>(env: HF_TOKEN) |
-| `--host HOST` | ip address to listen (default: 127.0.0.1)<br/>(env: LLAMA_ARG_HOST) |
-| `--port PORT` | port to listen (default: 8080)<br/>(env: LLAMA_ARG_PORT) |
-| `--path PATH` | path to serve static files from (default: ) |
-| `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) |
-| `--api-key KEY` | API key to use for authentication (default: none)<br/>(env: LLAMA_API_KEY) |
-| `--api-key-file FNAME` | path to file containing API keys (default: none) |
-| `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key |
-| `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate |
-| `-to, --timeout N` | server read/write timeout in seconds (default: 600) |
-| `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
-| `-spf, --system-prompt-file FNAME` | set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications |
-| `--log-format {text, json}` | log output format: json or text (default: json) |
-| `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) |
-| `--no-slots` | disables slots monitoring endpoint (default: enabled)<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
-| `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
-| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted:<br/>https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
-| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)<br/> |
-| `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
-| `-ld, --logdir LOGDIR` | path under which to save YAML logs (no logging if unset) |
-| `--log-test` | Log test |
-| `--log-disable` | Log disable |
-| `--log-enable` | Log enable |
-| `--log-new` | Log new |
-| `--log-append` | Log append |
-| `--log-file FNAME` | Log file |
+```
+usage: ./llama-server [options]

-Note: If both command line argument and environment variable are both set for the same param, the argument will take precedence over env var.
+general:
+
+  -h,    --help, --usage          print usage and exit
+         --version                show version and build info
+  -v,    --verbose                print verbose information
+         --verbosity N            set specific verbosity level (default: 0)
+         --verbose-prompt         print a verbose prompt before generation (default: false)
+         --no-display-prompt      don't print prompt at generation (default: false)
+  -co,   --color                  colorise output to distinguish prompt and user input from generations (default: false)
+  -s,    --seed SEED              RNG seed (default: -1, use random seed for < 0)
+  -t,    --threads N              number of threads to use during generation (default: 8)
+  -tb,   --threads-batch N        number of threads to use during batch and prompt processing (default: same as --threads)
+  -td,   --threads-draft N        number of threads to use during generation (default: same as --threads)
+  -tbd,  --threads-batch-draft N  number of threads to use during batch and prompt processing (default: same as --threads-draft)
+         --draft N                number of tokens to draft for speculative decoding (default: 5)
+  -ps,   --p-split N              speculative decoding split probability (default: 0.1)
+  -lcs,  --lookup-cache-static FNAME
+                                  path to static lookup cache to use for lookup decoding (not updated by generation)
+  -lcd,  --lookup-cache-dynamic FNAME
+                                  path to dynamic lookup cache to use for lookup decoding (updated by generation)
+  -c,    --ctx-size N             size of the prompt context (default: 0, 0 = loaded from model)
+  -n,    --predict N              number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)
+  -b,    --batch-size N           logical maximum batch size (default: 2048)
+  -ub,   --ubatch-size N          physical maximum batch size (default: 512)
+         --keep N                 number of tokens to keep from the initial prompt (default: 0, -1 = all)
+         --chunks N               max number of chunks to process (default: -1, -1 = all)
+  -fa,   --flash-attn             enable Flash Attention (default: disabled)
+  -p,    --prompt PROMPT          prompt to start generation with
+                                  in conversation mode, this will be used as system prompt
+                                  (default: '')
+  -f,    --file FNAME             a file containing the prompt (default: none)
+         --in-file FNAME          an input file (repeat to specify multiple files)
+  -bf,   --binary-file FNAME      binary file containing the prompt (default: none)
+  -e,    --escape                 process escape sequences (\n, \r, \t, \', \", \\) (default: true)
+         --no-escape              do not process escape sequences
+  -ptc,  --print-token-count N    print token count every N tokens (default: -1)
+         --prompt-cache FNAME     file to cache prompt state for faster startup (default: none)
+         --prompt-cache-all       if specified, saves user input and generations to cache as well
+                                  not supported with --interactive or other interactive options
+         --prompt-cache-ro        if specified, uses the prompt cache but does not update it
+  -r,    --reverse-prompt PROMPT  halt generation at PROMPT, return control in interactive mode
+                                  can be specified more than once for multiple prompts
+  -sp,   --special                special tokens output enabled (default: false)
+  -cnv,  --conversation           run in conversation mode, does not print special tokens and suffix/prefix
+                                  if suffix/prefix are not specified, default chat template will be used
+                                  (default: false)
+  -i,    --interactive            run in interactive mode (default: false)
+  -if,   --interactive-first      run in interactive mode and wait for input right away (default: false)
+  -mli,  --multiline-input        allows you to write or paste multiple lines without ending each in '\'
+         --in-prefix-bos          prefix BOS to user inputs, preceding the `--in-prefix` string
+         --in-prefix STRING       string to prefix user inputs with (default: empty)
+         --in-suffix STRING       string to append after user inputs (default: empty)
+         --spm-infill             use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled)
+
+sampling:
+
+         --samplers SAMPLERS      samplers that will be used for generation in the order, separated by ';'
+                                  (default: top_k;tfs_z;typical_p;top_p;min_p;temperature)
+         --sampling-seq SEQUENCE  simplified sequence for samplers that will be used (default: kfypmt)
+         --ignore-eos             ignore end of stream token and continue generating (implies --logit-bias EOS-inf)
+         --penalize-nl            penalize newline tokens (default: false)
+         --temp N                 temperature (default: 0.8)
+         --top-k N                top-k sampling (default: 40, 0 = disabled)
+         --top-p N                top-p sampling (default: 0.9, 1.0 = disabled)
+         --min-p N                min-p sampling (default: 0.1, 0.0 = disabled)
+         --tfs N                  tail free sampling, parameter z (default: 1.0, 1.0 = disabled)
+         --typical N              locally typical sampling, parameter p (default: 1.0, 1.0 = disabled)
+         --repeat-last-n N        last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size)
+         --repeat-penalty N       penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled)
+         --presence-penalty N     repeat alpha presence penalty (default: 0.0, 0.0 = disabled)
+         --frequency-penalty N    repeat alpha frequency penalty (default: 0.0, 0.0 = disabled)
+         --dynatemp-range N       dynamic temperature range (default: 0.0, 0.0 = disabled)
+         --dynatemp-exp N         dynamic temperature exponent (default: 1.0)
+         --mirostat N             use Mirostat sampling.
+                                  Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.
+                                  (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)
+         --mirostat-lr N          Mirostat learning rate, parameter eta (default: 0.1)
+         --mirostat-ent N         Mirostat target entropy, parameter tau (default: 5.0)
+         -l TOKEN_ID(+/-)BIAS     modifies the likelihood of token appearing in the completion,
+                                  e.g. `--logit-bias 15043+1` to increase likelihood of token ' Hello',
+                                  or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'
+         --cfg-negative-prompt PROMPT
+                                  negative prompt to use for guidance (default: '')
+         --cfg-negative-prompt-file FNAME
+                                  negative prompt file to use for guidance
+         --cfg-scale N            strength of guidance (default: 1.0, 1.0 = disable)
+         --chat-template JINJA_TEMPLATE
+                                  set custom jinja chat template (default: template taken from model's metadata)
+                                  if suffix/prefix are specified, template will be disabled
+                                  only commonly used templates are accepted:
+                                  https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
+
+grammar:
+
+         --grammar GRAMMAR        BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '')
+         --grammar-file FNAME     file to read grammar from
+  -j,    --json-schema SCHEMA     JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object
+                                  For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead
+
+embedding:
+
+         --pooling {none,mean,cls,last}
+                                  pooling type for embeddings, use model default if unspecified
+         --attention {causal,non-causal}
+                                  attention type for embeddings, use model default if unspecified
+
+context hacking:
+
+         --rope-scaling {none,linear,yarn}
+                                  RoPE frequency scaling method, defaults to linear unless specified by the model
+         --rope-scale N           RoPE context scaling factor, expands context by a factor of N
+         --rope-freq-base N       RoPE base frequency, used by NTK-aware scaling (default: loaded from model)
+         --rope-freq-scale N      RoPE frequency scaling factor, expands context by a factor of 1/N
+         --yarn-orig-ctx N        YaRN: original context size of model (default: 0 = model training context size)
+         --yarn-ext-factor N      YaRN: extrapolation mix factor (default: -1.0, 0.0 = full interpolation)
+         --yarn-attn-factor N     YaRN: scale sqrt(t) or attention magnitude (default: 1.0)
+         --yarn-beta-slow N       YaRN: high correction dim or alpha (default: 1.0)
+         --yarn-beta-fast N       YaRN: low correction dim or beta (default: 32.0)
+  -gan,  --grp-attn-n N           group-attention factor (default: 1)
+  -gaw,  --grp-attn-w N           group-attention width (default: 512.0)
+  -dkvc, --dump-kv-cache          verbose print of the KV cache
+  -nkvo, --no-kv-offload          disable KV offload
+  -ctk,  --cache-type-k TYPE      KV cache data type for K (default: f16)
+  -ctv,  --cache-type-v TYPE      KV cache data type for V (default: f16)
+
+perplexity:
+
+         --all-logits             return logits for all tokens in the batch (default: false)
+         --hellaswag              compute HellaSwag score over random tasks from datafile supplied with -f
+         --hellaswag-tasks N      number of tasks to use when computing the HellaSwag score (default: 400)
+         --winogrande             compute Winogrande score over random tasks from datafile supplied with -f
+         --winogrande-tasks N     number of tasks to use when computing the Winogrande score (default: 0)
+         --multiple-choice        compute multiple choice score over random tasks from datafile supplied with -f
+         --multiple-choice-tasks N
+                                  number of tasks to use when computing the multiple choice score (default: 0)
+         --kl-divergence          computes KL-divergence to logits provided via --kl-divergence-base
+         --ppl-stride N           stride for perplexity calculation (default: 0)
+         --ppl-output-type {0,1}  output type for perplexity calculation (default: 0)
+
+parallel:
+
+  -dt,   --defrag-thold N         KV cache defragmentation threshold (default: -1.0, < 0 - disabled)
+  -np,   --parallel N             number of parallel sequences to decode (default: 1)
+  -ns,   --sequences N            number of sequences to decode (default: 1)
+  -cb,   --cont-batching          enable continuous batching (a.k.a dynamic batching) (default: enabled)
+
+multi-modality:
+
+         --mmproj FILE            path to a multimodal projector file for LLaVA. see examples/llava/README.md
+         --image FILE             path to an image file. use with multimodal models. Specify multiple times for batching
+
+backend:
+
+         --rpc SERVERS            comma separated list of RPC servers
+         --mlock                  force system to keep model in RAM rather than swapping or compressing
+         --no-mmap                do not memory-map model (slower load but may reduce pageouts if not using mlock)
+         --numa TYPE              attempt optimizations that help on some NUMA systems
+                                    - distribute: spread execution evenly over all nodes
+                                    - isolate: only spawn threads on CPUs on the node that execution started on
+                                    - numactl: use the CPU map provided by numactl
+                                  if run without this previously, it is recommended to drop the system page cache before using this
+                                  see https://github.com/ggerganov/llama.cpp/issues/1437
+
+model:
+
+         --check-tensors          check model tensor data for invalid values (default: false)
+         --override-kv KEY=TYPE:VALUE
+                                  advanced option to override model metadata by key. may be specified multiple times.
+                                  types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false
+         --lora FNAME             apply LoRA adapter (implies --no-mmap)
+         --lora-scaled FNAME S    apply LoRA adapter with user defined scaling S (implies --no-mmap)
+         --lora-base FNAME        optional model to use as a base for the layers modified by the LoRA adapter
+         --control-vector FNAME   add a control vector
+                                  note: this argument can be repeated to add multiple control vectors
+         --control-vector-scaled FNAME SCALE
+                                  add a control vector with user defined scaling SCALE
+                                  note: this argument can be repeated to add multiple scaled control vectors
+         --control-vector-layer-range START END
+                                  layer range to apply the control vector(s) to, start and end inclusive
+  -m,    --model FNAME            model path (default: models/$filename with filename from --hf-file
+                                  or --model-url if set, otherwise models/7B/ggml-model-f16.gguf)
+  -md,   --model-draft FNAME      draft model for speculative decoding (default: unused)
+  -mu,   --model-url MODEL_URL    model download url (default: unused)
+  -hfr,  --hf-repo REPO           Hugging Face model repository (default: unused)
+  -hff,  --hf-file FILE           Hugging Face model file (default: unused)
+  -hft,  --hf-token TOKEN         Hugging Face access token (default: value from HF_TOKEN environment variable)
+
+server:
+
+         --host HOST              IP address to listen on (default: 127.0.0.1)
+         --port PORT              port to listen on (default: 8080)
+         --path PATH              path to serve static files from (default: )
+         --embedding(s)           restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)
+         --api-key KEY            API key to use for authentication (default: none)
+         --api-key-file FNAME     path to file containing API keys (default: none)
+         --ssl-key-file FNAME     path to a file containing a PEM-encoded SSL private key
+         --ssl-cert-file FNAME    path to a file containing a PEM-encoded SSL certificate
+         --timeout N              server read/write timeout in seconds (default: 600)
+         --threads-http N         number of threads used to process HTTP requests (default: -1)
+         --system-prompt-file FNAME
+                                  set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications
+         --log-format {text,json}
+                                  log output format: json or text (default: json)
+         --metrics                enable prometheus compatible metrics endpoint (default: disabled)
+         --no-slots               disables slots monitoring endpoint (default: enabled)
+         --slot-save-path PATH    path to save slot kv cache (default: disabled)
+         --chat-template JINJA_TEMPLATE
+                                  set custom jinja chat template (default: template taken from model's metadata)
+                                  only commonly used templates are accepted:
+                                  https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
+  -sps,  --slot-prompt-similarity SIMILARITY
+                                  how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)
+         --lora-init-without-apply
+                                  load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled)
+
+logging:
+
+         --simple-io              use basic IO for better compatibility in subprocesses and limited consoles
+  -ld,   --logdir LOGDIR          path under which to save YAML logs (no logging if unset)
+         --log-test               Run simple logging test
+         --log-disable            Disable trace logs
+         --log-enable             Enable trace logs
+         --log-file FNAME         Specify a log filename (without extension)
+         --log-new                Create a separate new log file on start. Each log file will have unique name: "<name>.<ID>.log"
+         --log-append             Don't truncate the old log file.
+```
+
+Available environment variables (if specified, these variables will override parameters specified in arguments):
+
+- `LLAMA_CACHE`: cache directory, used by `--hf-repo`
+- `HF_TOKEN`: Hugging Face access token, used when accessing a gated model with `--hf-repo`
+- `LLAMA_ARG_MODEL`: equivalent to `-m`
+- `LLAMA_ARG_MODEL_URL`: equivalent to `-mu`
+- `LLAMA_ARG_MODEL_ALIAS`: equivalent to `-a`
+- `LLAMA_ARG_HF_REPO`: equivalent to `--hf-repo`
+- `LLAMA_ARG_HF_FILE`: equivalent to `--hf-file`
+- `LLAMA_ARG_THREADS`: equivalent to `-t`
+- `LLAMA_ARG_CTX_SIZE`: equivalent to `-c`
+- `LLAMA_ARG_N_PARALLEL`: equivalent to `-np`
+- `LLAMA_ARG_BATCH`: equivalent to `-b`
+- `LLAMA_ARG_UBATCH`: equivalent to `-ub`
+- `LLAMA_ARG_N_GPU_LAYERS`: equivalent to `-ngl`
+- `LLAMA_ARG_THREADS_HTTP`: equivalent to `--threads-http`
+- `LLAMA_ARG_CHAT_TEMPLATE`: equivalent to `--chat-template`
+- `LLAMA_ARG_N_PREDICT`: equivalent to `-n`
+- `LLAMA_ARG_ENDPOINT_METRICS`: if set to `1`, it will enable metrics endpoint (equivalent to `--metrics`)
+- `LLAMA_ARG_ENDPOINT_SLOTS`: if set to `0`, it will **disable** slots endpoint (equivalent to `--no-slots`). This feature is enabled by default.
+- `LLAMA_ARG_EMBEDDINGS`: if set to `1`, it will enable embeddings endpoint (equivalent to `--embeddings`)
+- `LLAMA_ARG_FLASH_ATTN`: if set to `1`, it will enable flash attention (equivalent to `-fa`)
+- `LLAMA_ARG_CONT_BATCHING`: if set to `0`, it will **disable** continuous batching (equivalent to `--no-cont-batching`). This feature is enabled by default.
+- `LLAMA_ARG_DEFRAG_THOLD`: equivalent to `-dt`
+- `LLAMA_ARG_HOST`: equivalent to `--host`
+- `LLAMA_ARG_PORT`: equivalent to `--port`

 Example usage of docker compose with environment variables:

@@ -153,7 +289,7 @@ services:
      LLAMA_ARG_MODEL: /models/my_model.gguf
      LLAMA_ARG_CTX_SIZE: 4096
      LLAMA_ARG_N_PARALLEL: 2
-      LLAMA_ARG_ENDPOINT_METRICS: 1
+      LLAMA_ARG_ENDPOINT_METRICS: 1  # to disable, either remove or set to 0
      LLAMA_ARG_PORT: 8080
 ```

@@ -334,6 +470,8 @@ node index.js

    `frequency_penalty`: Repeat alpha frequency penalty. Default: `0.0`, which is disabled.

+    `penalty_prompt`: This will replace the `prompt` for the purpose of the penalty evaluation. Can be either `null`, a string or an array of numbers representing tokens. Default: `null`, which is to use the original `prompt`.
+
    `mirostat`: Enable Mirostat sampling, controlling perplexity during text generation. Default: `0`, where `0` is disabled, `1` is Mirostat, and `2` is Mirostat 2.0.

    `mirostat_tau`: Set the Mirostat target entropy, parameter tau. Default: `5.0`
@@ -407,44 +545,9 @@ Notice that each `probs` is an array of length `n_probs`.

    *Options:*

-    `content`: (Required) The text to tokenize.
+    `content`: Set the text to tokenize.

-    `add_special`: (Optional) Boolean indicating if special tokens, i.e. `BOS`, should be inserted.  Default: `false`
-
-    `with_pieces`: (Optional) Boolean indicating whether to return token pieces along with IDs.  Default: `false`
-
-**Response:**
-
-Returns a JSON object with a `tokens` field containing the tokenization result. The `tokens` array contains either just token IDs or objects with `id` and `piece` fields, depending on the `with_pieces` parameter. The piece field is a string if the piece is valid unicode or a list of bytes otherwise.
-
-
-If `with_pieces` is `false`:
-```json
-{
-  "tokens": [123, 456, 789]
-}
-```
-
-If `with_pieces` is `true`:
-```json
-{
-  "tokens": [
-    {"id": 123, "piece": "Hello"},
-    {"id": 456, "piece": " world"},
-    {"id": 789, "piece": "!"}
-  ]
-}
-```
-
-With input 'á' (utf8 hex: C3 A1) on tinyllama/stories260k
-```json
-{
-  "tokens": [
-    {"id": 198, "piece": [195]}, // hex C3
-    {"id": 164, "piece": [161]} // hex A1
-  ]
-}
-```
+    `add_special`: Boolean indicating whether special tokens, e.g. `BOS`, should be inserted. Default: `false`

 ### POST `/detokenize`: Convert tokens to text

@@ -621,6 +724,7 @@ Example:
            "stopping_word": ""
        },
        "penalize_nl": true,
+        "penalty_prompt_tokens": [],
        "presence_penalty": 0.0,
        "prompt": "Say hello to llama.cpp",
        "repeat_last_n": 64,
@@ -644,7 +748,8 @@ Example:
        "tfs_z": 1.0,
        "top_k": 40,
        "top_p": 0.949999988079071,
-        "typical_p": 1.0
+        "typical_p": 1.0,
+        "use_penalty_prompt_tokens": false
    }
 ]
 ```
--- a/examples/server/public/loading.html
+++ b/examples/server/public/loading.html
@@ -1,12 +0,0 @@
-<!DOCTYPE html>
-<html>
-    <head>
-        <meta http-equiv="refresh" content="5">
-    </head>
-    <body>
-        <div id="loading">
-            The model is loading. Please wait.<br/>
-            The user interface will appear soon.
-        </div>
-    </body>
-</html>
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1,10 +1,9 @@
 #include "utils.hpp"

-#include "arg.h"
 #include "common.h"
-#include "sampling.h"
 #include "json-schema-to-grammar.h"
 #include "llama.h"
+#include "grammar-parser.h"

 // Change JSON_ASSERT from assert() to GGML_ASSERT:
 #define JSON_ASSERT GGML_ASSERT
@@ -28,7 +27,6 @@
 #include "system-prompts.js.hpp"
 #include "prompt-formats.js.hpp"
 #include "json-schema-to-grammar.mjs.hpp"
-#include "loading.html.hpp"

 #include <atomic>
 #include <chrono>
@@ -52,12 +50,15 @@ enum stop_type {
    STOP_TYPE_PARTIAL,
 };

-// state diagram: https://github.com/ggerganov/llama.cpp/pull/9283
 enum slot_state {
    SLOT_STATE_IDLE,
-    SLOT_STATE_PROCESSING_PROMPT,
-    SLOT_STATE_DONE_PROMPT,
-    SLOT_STATE_GENERATING,
+    SLOT_STATE_PROCESSING,
+};
+
+enum slot_command {
+    SLOT_COMMAND_NONE,
+    SLOT_COMMAND_LOAD_PROMPT,
+    SLOT_COMMAND_RELEASE,
 };

 enum server_state {
@@ -134,6 +135,7 @@ struct server_slot {
    struct slot_params params;

    slot_state state = SLOT_STATE_IDLE;
+    slot_command command = SLOT_COMMAND_NONE;

    // used to determine the slot that has been used the longest
    int64_t t_last_used = -1;
@@ -171,12 +173,10 @@ struct server_slot {
    std::string stopping_word;

    // sampling
-    json json_schema;
-
-    struct gpt_sampler_params sparams;
-    struct gpt_sampler * smpl = nullptr;
-
    llama_token sampled;
+    struct llama_sampling_params sparams;
+    llama_sampling_context * ctx_sampling = nullptr;
+    json json_schema;

    int32_t ga_i = 0;   // group-attention state
    int32_t ga_n = 1;   // group-attention factor
@@ -194,8 +194,6 @@ struct server_slot {
    double t_prompt_processing; // ms
    double t_token_generation; // ms

-    std::function<void(int)> callback_on_release;
-
    void reset() {
        n_prompt_tokens    = 0;
        generated_text     = "";
@@ -230,28 +228,25 @@ struct server_slot {
        return n_remaining > 0; // no budget
    }

+    bool available() const {
+        return state == SLOT_STATE_IDLE && command == SLOT_COMMAND_NONE;
+    }
+
    bool is_processing() const {
-        return state != SLOT_STATE_IDLE;
+        return (state == SLOT_STATE_IDLE && command == SLOT_COMMAND_LOAD_PROMPT) || state == SLOT_STATE_PROCESSING;
    }

    void add_token_string(const completion_token_output & token) {
-        if (!is_processing()) {
+        if (command == SLOT_COMMAND_RELEASE) {
            return;
        }
        generated_token_probs.push_back(token);
    }

    void release() {
-        if (is_processing()) {
+        if (state == SLOT_STATE_PROCESSING) {
            t_token_generation = (ggml_time_us() - t_start_generation) / 1e3;
-            state = SLOT_STATE_IDLE;
-            LOG_INFO("slot released", {
-                {"id_slot",   id},
-                {"id_task",   id_task},
-                {"n_past",    n_past},
-                {"truncated", truncated},
-            });
-            callback_on_release(id);
+            command = SLOT_COMMAND_RELEASE;
        }
    }

@@ -358,9 +353,6 @@ struct server_metrics {
    uint64_t n_tokens_predicted  = 0;
    uint64_t t_tokens_generation = 0;

-    uint64_t n_decode_total     = 0;
-    uint64_t n_busy_slots_total = 0;
-
    void init() {
        t_start = ggml_time_us();
    }
@@ -379,15 +371,6 @@ struct server_metrics {
        t_tokens_generation_total  += slot.t_token_generation;
    }

-    void on_decoded(const std::vector<server_slot> & slots) {
-        n_decode_total++;
-        for (const auto & slot : slots) {
-            if (slot.is_processing()) {
-                n_busy_slots_total++;
-            }
-        }
-    }
-
    void reset_bucket() {
        n_prompt_tokens_processed = 0;
        t_prompt_processing       = 0;
@@ -429,7 +412,6 @@ struct server_queue {

    // multi-task version of post()
    int post(std::vector<server_task> & tasks, bool front = false) {
-        std::unique_lock<std::mutex> lock(mutex_tasks);
        for (auto & task : tasks) {
            if (task.id == -1) {
                task.id = id++;
@@ -449,7 +431,6 @@ struct server_queue {
    void defer(server_task task) {
        std::unique_lock<std::mutex> lock(mutex_tasks);
        queue_tasks_deferred.push_back(std::move(task));
-        condition_tasks.notify_one();
    }

    // Get the next id for creating a new task
@@ -470,14 +451,14 @@ struct server_queue {
        callback_update_slots = std::move(callback);
    }

-    // Call when the state of one slot is changed, it will move one task from deferred to main queue
-    void pop_deferred_task() {
+    // Call when the state of one slot is changed
+    void notify_slot_changed() {
+        // move deferred tasks back to main loop
        std::unique_lock<std::mutex> lock(mutex_tasks);
-        if (!queue_tasks_deferred.empty()) {
-            queue_tasks.emplace_back(std::move(queue_tasks_deferred.front()));
-            queue_tasks_deferred.pop_front();
+        for (auto & task : queue_tasks_deferred) {
+            queue_tasks.push_back(std::move(task));
        }
-        condition_tasks.notify_one();
+        queue_tasks_deferred.clear();
    }

    // end the start_loop routine
@@ -507,7 +488,7 @@ struct server_queue {
                    break;
                }
                server_task task = queue_tasks.front();
-                queue_tasks.pop_front();
+                queue_tasks.erase(queue_tasks.begin());
                lock.unlock();
                LOG_VERBOSE("callback_new_task", {{"id_task", task.id}});
                callback_new_task(task);
@@ -616,7 +597,7 @@ struct server_context {

    gpt_params params;

-    llama_batch batch = {};
+    llama_batch batch;

    bool clean_kv_cache = true;
    bool add_bos_token  = true;
@@ -655,8 +636,8 @@ struct server_context {

        // Clear any sampling context
        for (server_slot & slot : slots) {
-            if (slot.smpl != nullptr) {
-                gpt_sampler_free(slot.smpl);
+            if (slot.ctx_sampling != nullptr) {
+                llama_sampling_free(slot.ctx_sampling);
            }
        }

@@ -735,10 +716,6 @@ struct server_context {

            slot.sparams = params.sparams;

-            slot.callback_on_release = [this](int) {
-                queue_tasks.pop_deferred_task();
-            };
-
            slot.reset();

            slots.push_back(slot);
@@ -820,7 +797,7 @@ struct server_context {

            for (server_slot & slot : slots) {
                // skip the slot if it is not available
-                if (slot.is_processing()) {
+                if (!slot.available()) {
                    continue;
                }

@@ -862,7 +839,7 @@ struct server_context {
            int64_t t_last = ggml_time_us();
            for (server_slot & slot : slots) {
                // skip the slot if it is not available
-                if (slot.is_processing()) {
+                if (!slot.available()) {
                    continue;
                }

@@ -887,8 +864,8 @@ struct server_context {
    bool launch_slot_with_task(server_slot & slot, const server_task & task) {
        slot_params default_params;
        // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them)
-        auto default_sparams = params.sparams;
-        const auto & data = task.data;
+        llama_sampling_params default_sparams = params.sparams;
+        auto & data = task.data;

        if (data.count("__oaicompat") != 0) {
            slot.oaicompat = true;
@@ -905,7 +882,7 @@ struct server_context {
        slot.sparams.top_p             = json_value(data, "top_p",             default_sparams.top_p);
        slot.sparams.min_p             = json_value(data, "min_p",             default_sparams.min_p);
        slot.sparams.tfs_z             = json_value(data, "tfs_z",             default_sparams.tfs_z);
-        slot.sparams.typ_p             = json_value(data, "typical_p",         default_sparams.typ_p);
+        slot.sparams.typical_p         = json_value(data, "typical_p",         default_sparams.typical_p);
        slot.sparams.temp              = json_value(data, "temperature",       default_sparams.temp);
        slot.sparams.dynatemp_range    = json_value(data, "dynatemp_range",    default_sparams.dynatemp_range);
        slot.sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent);
@@ -927,8 +904,7 @@ struct server_context {
        if (data.contains("json_schema") && !data.at("json_schema").is_null() && data.contains("grammar") && !data.at("grammar").is_null()) {
            send_error(task, "Either \"json_schema\" or \"grammar\" can be specified, but not both", ERROR_TYPE_INVALID_REQUEST);
            return false;
-        }
-        if (data.contains("json_schema") && !data.contains("grammar")) {
+        } else if (data.contains("json_schema") && !data.contains("grammar")) {
            try {
                auto schema                = json_value(data, "json_schema", json::object());
                slot.sparams.grammar       = json_schema_to_grammar(schema);
@@ -978,11 +954,56 @@ struct server_context {
            }
        }

+        // penalize user-provided tokens
+        {
+            slot.sparams.penalty_prompt_tokens.clear();
+            slot.sparams.use_penalty_prompt_tokens = false;
+
+            const auto & penalty_prompt = data.find("penalty_prompt");
+
+            if (penalty_prompt != data.end()) {
+                if (penalty_prompt->is_string()) {
+                    const auto penalty_prompt_string = penalty_prompt->get<std::string>();
+                    slot.sparams.penalty_prompt_tokens = llama_tokenize(model, penalty_prompt_string, false);
+
+                    if (slot.params.n_predict > 0) {
+                        slot.sparams.penalty_prompt_tokens.reserve(slot.sparams.penalty_prompt_tokens.size() + slot.params.n_predict);
+                    }
+                    slot.sparams.use_penalty_prompt_tokens = true;
+
+                    LOG_VERBOSE("penalty_prompt_tokens", {
+                        {"id_slot", slot.id},
+                        {"tokens",  slot.sparams.penalty_prompt_tokens},
+                    });
+                }
+                else if (penalty_prompt->is_array()) {
+                    const auto n_tokens = penalty_prompt->size();
+                    slot.sparams.penalty_prompt_tokens.reserve(n_tokens + std::max(0, slot.params.n_predict));
+
+                    const int n_vocab = llama_n_vocab(model);
+                    for (const auto & penalty_token : *penalty_prompt) {
+                        if (penalty_token.is_number_integer()) {
+                            const auto tok = penalty_token.get<llama_token>();
+                            if (tok >= 0 && tok < n_vocab) {
+                                slot.sparams.penalty_prompt_tokens.push_back(tok);
+                            }
+                        }
+                    }
+                    slot.sparams.use_penalty_prompt_tokens = true;
+
+                    LOG_VERBOSE("penalty_prompt_tokens", {
+                        {"id_slot", slot.id},
+                        {"tokens",  slot.sparams.penalty_prompt_tokens},
+                    });
+                }
+            }
+        }
+
        {
            slot.sparams.logit_bias.clear();

            if (json_value(data, "ignore_eos", false) && has_eos_token) {
-                slot.sparams.logit_bias.push_back({llama_token_eos(model), -INFINITY});
+                slot.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
            }

            const auto & logit_bias = data.find("logit_bias");
@@ -1003,12 +1024,12 @@ struct server_context {
                        if (el[0].is_number_integer()) {
                            llama_token tok = el[0].get<llama_token>();
                            if (tok >= 0 && tok < n_vocab) {
-                                slot.sparams.logit_bias.push_back({tok, bias});
+                                slot.sparams.logit_bias[tok] = bias;
                            }
                        } else if (el[0].is_string()) {
                            auto toks = llama_tokenize(model, el[0].get<std::string>(), false);
                            for (auto tok : toks) {
-                                slot.sparams.logit_bias.push_back({tok, bias});
+                                slot.sparams.logit_bias[tok] = bias;
                            }
                        }
                    }
@@ -1030,34 +1051,33 @@ struct server_context {
        }

        {
-            const auto & samplers = data.find("samplers");
-            if (samplers != data.end() && samplers->is_array()) {
+            const auto & samplers_sequence = data.find("samplers");
+            if (samplers_sequence != data.end() && samplers_sequence->is_array()) {
                std::vector<std::string> sampler_names;
-                for (const auto & name : *samplers) {
-                    if (name.is_string()) {
-                        sampler_names.emplace_back(name);
+                for (const auto & sampler_name : *samplers_sequence) {
+                    if (sampler_name.is_string()) {
+                        sampler_names.emplace_back(sampler_name);
                    }
                }
-                slot.sparams.samplers = gpt_sampler_types_from_names(sampler_names, false);
+                slot.sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, false);
            } else {
-                slot.sparams.samplers = default_sparams.samplers;
+                slot.sparams.samplers_sequence = default_sparams.samplers_sequence;
            }
        }

        {
-            if (slot.smpl != nullptr) {
-                gpt_sampler_free(slot.smpl);
+            if (slot.ctx_sampling != nullptr) {
+                llama_sampling_free(slot.ctx_sampling);
            }
-
-            slot.smpl = gpt_sampler_init(model, slot.sparams);
-            if (slot.smpl == nullptr) {
+            slot.ctx_sampling = llama_sampling_init(slot.sparams);
+            if (slot.ctx_sampling == nullptr) {
                // for now, the only error that may happen here is invalid grammar
                send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST);
                return false;
            }
        }

-        slot.state = SLOT_STATE_PROCESSING_PROMPT;
+        slot.command = SLOT_COMMAND_LOAD_PROMPT;
        slot.prompt_tokens.clear();

        LOG_INFO("slot is processing task", {
@@ -1139,6 +1159,11 @@ struct server_context {
        slot.generated_text += token_str;
        slot.has_next_token = true;

+        if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1) {
+            // we can change penalty_prompt_tokens because it is always created from scratch each request
+            slot.ctx_sampling->params.penalty_prompt_tokens.push_back(result.tok);
+        }
+
        // check if there is incomplete UTF-8 character at the end
        bool incomplete = false;
        for (unsigned i = 1; i < 5 && i <= slot.generated_text.size(); ++i) {
@@ -1256,10 +1281,13 @@ struct server_context {
    }

    json get_formated_generation(const server_slot & slot) const {
-        std::vector<std::string> samplers;
-        samplers.reserve(slot.sparams.samplers.size());
-        for (const auto & sampler : slot.sparams.samplers) {
-            samplers.emplace_back(gpt_sampler_type_to_str(sampler));
+        const auto eos_bias   =             slot.sparams.logit_bias.find(llama_token_eos(model));
+        const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() && eos_bias->second < 0.0f && std::isinf(eos_bias->second);
+
+        std::vector<std::string> samplers_sequence;
+        samplers_sequence.reserve(slot.sparams.samplers_sequence.size());
+        for (const auto & sampler_type : slot.sparams.samplers_sequence) {
+            samplers_sequence.emplace_back(llama_sampling_type_to_str(sampler_type));
        }

        return json {
@@ -1267,7 +1295,6 @@ struct server_context {
            {"n_predict",                 slot.n_predict},     // Server configured n_predict
            {"model",                     params.model_alias},
            {"seed",                      slot.sparams.seed},
-            {"seed_cur",                  slot.smpl ? gpt_sampler_get_seed(slot.smpl) : 0},
            {"temperature",               slot.sparams.temp},
            {"dynatemp_range",            slot.sparams.dynatemp_range},
            {"dynatemp_exponent",         slot.sparams.dynatemp_exponent},
@@ -1275,11 +1302,13 @@ struct server_context {
            {"top_p",                     slot.sparams.top_p},
            {"min_p",                     slot.sparams.min_p},
            {"tfs_z",                     slot.sparams.tfs_z},
-            {"typical_p",                 slot.sparams.typ_p},
+            {"typical_p",                 slot.sparams.typical_p},
            {"repeat_last_n",             slot.sparams.penalty_last_n},
            {"repeat_penalty",            slot.sparams.penalty_repeat},
            {"presence_penalty",          slot.sparams.penalty_present},
            {"frequency_penalty",         slot.sparams.penalty_freq},
+            {"penalty_prompt_tokens",     slot.sparams.penalty_prompt_tokens},
+            {"use_penalty_prompt_tokens", slot.sparams.use_penalty_prompt_tokens},
            {"mirostat",                  slot.sparams.mirostat},
            {"mirostat_tau",              slot.sparams.mirostat_tau},
            {"mirostat_eta",              slot.sparams.mirostat_eta},
@@ -1288,13 +1317,13 @@ struct server_context {
            {"max_tokens",                slot.params.n_predict}, // User configured n_predict
            {"n_keep",                    slot.params.n_keep},
            {"n_discard",                 slot.params.n_discard},
-            {"ignore_eos",                slot.sparams.ignore_eos},
+            {"ignore_eos",                ignore_eos},
            {"stream",                    slot.params.stream},
-          //{"logit_bias",                slot.sparams.logit_bias},
+            {"logit_bias",                slot.sparams.logit_bias},
            {"n_probs",                   slot.sparams.n_probs},
            {"min_keep",                  slot.sparams.min_keep},
            {"grammar",                   slot.sparams.grammar},
-            {"samplers",                  samplers},
+            {"samplers",                  samplers_sequence}
        };
    }

@@ -1592,7 +1621,7 @@ struct server_context {
                        queue_tasks.defer(task);
                        break;
                    }
-                    if (slot->is_processing()) {
+                    if (!slot->available()) {
                        // if requested slot is unavailable, we defer this task for processing later
                        LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
                        queue_tasks.defer(task);
@@ -1698,9 +1727,6 @@ struct server_context {
                        { "n_tokens_predicted",              metrics.n_tokens_predicted},
                        { "t_tokens_generation",             metrics.t_tokens_generation},

-                        { "n_decode_total",                  metrics.n_decode_total},
-                        { "n_busy_slots_total",              metrics.n_busy_slots_total},
-
                        { "kv_cache_tokens_count",           llama_get_kv_cache_token_count(ctx)},
                        { "kv_cache_used_cells",             llama_get_kv_cache_used_cells(ctx)},

@@ -1720,7 +1746,7 @@ struct server_context {
                        send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
                        break;
                    }
-                    if (slot->is_processing()) {
+                    if (!slot->available()) {
                        // if requested slot is unavailable, we defer this task for processing later
                        LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
                        queue_tasks.defer(task);
@@ -1761,7 +1787,7 @@ struct server_context {
                        send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
                        break;
                    }
-                    if (slot->is_processing()) {
+                    if (!slot->available()) {
                        // if requested slot is unavailable, we defer this task for processing later
                        LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
                        queue_tasks.defer(task);
@@ -1809,7 +1835,7 @@ struct server_context {
                        send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
                        break;
                    }
-                    if (slot->is_processing()) {
+                    if (!slot->available()) {
                        // if requested slot is unavailable, we defer this task for processing later
                        LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
                        queue_tasks.defer(task);
@@ -1849,12 +1875,33 @@ struct server_context {
            system_prompt_update();
        }

+        // release slots
+        for (auto & slot : slots) {
+            if (slot.command == SLOT_COMMAND_RELEASE) {
+                slot.state       = SLOT_STATE_IDLE;
+                slot.command     = SLOT_COMMAND_NONE;
+                slot.t_last_used = ggml_time_us();
+
+                LOG_INFO("slot released", {
+                    {"id_slot",         slot.id},
+                    {"id_task",         slot.id_task},
+                    {"n_ctx",           n_ctx},
+                    {"n_past",          slot.n_past},
+                    {"n_system_tokens", system_tokens.size()},
+                    {"n_cache_tokens",  slot.cache_tokens.size()},
+                    {"truncated",       slot.truncated}
+                });
+
+                queue_tasks.notify_slot_changed();
+            }
+        }
+
        // check if all slots are idle
        {
            bool all_idle = true;

            for (auto & slot : slots) {
-                if (slot.is_processing()) {
+                if (slot.state != SLOT_STATE_IDLE || slot.command != SLOT_COMMAND_NONE) {
                    all_idle = false;
                    break;
                }
@@ -1925,7 +1972,7 @@ struct server_context {

        // frist, add sampled tokens from any ongoing sequences
        for (auto & slot : slots) {
-            if (slot.state != SLOT_STATE_GENERATING) {
+            if (slot.state == SLOT_STATE_IDLE) {
                continue;
            }

@@ -1967,7 +2014,7 @@ struct server_context {
        if (params.cont_batching || batch.n_tokens == 0) {
            for (auto & slot : slots) {
                // this slot still has a prompt to be processed
-                if (slot.state == SLOT_STATE_PROCESSING_PROMPT) {
+                if (slot.state == SLOT_STATE_IDLE && slot.command == SLOT_COMMAND_LOAD_PROMPT) {
                    auto & prompt_tokens = slot.prompt_tokens;

                    // we haven't tokenized the prompt yet - do it now:
@@ -2035,6 +2082,8 @@ struct server_context {
                                {"id_task", slot.id_task}
                            });

+                            slot.state = SLOT_STATE_PROCESSING;
+                            slot.command = SLOT_COMMAND_NONE;
                            slot.release();
                            slot.print_timings();
                            send_final_response(slot);
@@ -2044,6 +2093,8 @@ struct server_context {
                        if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING) {
                            // this prompt is too large to process - discard it
                            if (slot.n_prompt_tokens > n_ubatch) {
+                                slot.state = SLOT_STATE_PROCESSING;
+                                slot.command = SLOT_COMMAND_NONE;
                                slot.release();
                                send_error(slot, "input is too large to process. increase the physical batch size", ERROR_TYPE_SERVER);
                                continue;
@@ -2088,7 +2139,7 @@ struct server_context {
                                GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx);
                            }

-                            gpt_sampler_reset(slot.smpl);
+                            llama_sampling_reset(slot.ctx_sampling);

                            if (!slot.params.cache_prompt) {
                                slot.n_past_se = 0;
@@ -2101,7 +2152,7 @@ struct server_context {

                                // push the prompt into the sampling context (do not apply grammar)
                                for (int i = 0; i < slot.n_past; ++i) {
-                                    gpt_sampler_accept(slot.smpl, slot.cache_tokens[i], false);
+                                    llama_sampling_accept(slot.ctx_sampling, ctx, slot.cache_tokens[i], false);
                                }
                            }
                        }
@@ -2154,7 +2205,7 @@ struct server_context {
                        slot.n_past_se = 0;
                        slot.ga_i = 0;
                        // TODO: is the system prompt ever in the sampling context?
-                        gpt_sampler_reset(slot.smpl);
+                        llama_sampling_reset(slot.ctx_sampling);
                    }

                    // remove the non-common part from the cache
@@ -2201,9 +2252,10 @@ struct server_context {
                        {"progress", (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens},
                    });

-                    // entire prompt has been processed
+                    // entire prompt has been processed - start decoding new tokens
                    if (slot.n_past == slot.n_prompt_tokens) {
-                        slot.state = SLOT_STATE_DONE_PROMPT;
+                        slot.state   = SLOT_STATE_PROCESSING;
+                        slot.command = SLOT_COMMAND_NONE;

                        GGML_ASSERT(batch.n_tokens > 0);

@@ -2285,17 +2337,18 @@ struct server_context {
            };

            const int ret = llama_decode(ctx, batch_view);
-            metrics.on_decoded(slots);

            if (ret != 0) {
                if (n_batch == 1 || ret < 0) {
                    // if you get here, it means the KV cache is full - try increasing it via the context size
                    LOG_ERROR("failed to decode the batch: KV cache is full - try increasing it via the context size", {
-                        {"i",       i},
-                        {"n_batch", n_batch},
-                        {"ret",     ret},
+                        {"i",   i},
+                        {"n_batch",  n_batch},
+                        {"ret",   ret},
                    });
                    for (auto & slot : slots) {
+                        slot.state = SLOT_STATE_PROCESSING;
+                        slot.command = SLOT_COMMAND_NONE;
                        slot.release();
                        send_error(slot, "Input prompt is too big compared to KV size. Please try increasing KV size.");
                    }
@@ -2307,38 +2360,31 @@ struct server_context {
                i -= n_batch;

                LOG_WARNING("failed to find free space in the KV cache, retrying with smaller batch size - try increasing it via the context size or enable defragmentation", {
-                    {"i",       i},
-                    {"n_batch", n_batch},
-                    {"ret",     ret},
+                    {"i",   i},
+                    {"n_batch",  n_batch},
+                    {"ret",   ret},
                });

                continue; // continue loop of n_batch
            }

            for (auto & slot : slots) {
-                if (slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens)) {
+                if (slot.state != SLOT_STATE_PROCESSING || slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens)) {
                    continue; // continue loop of slots
                }

-                if (slot.state == SLOT_STATE_DONE_PROMPT) {
-                    if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING) {
-                        // prompt evaluated for embedding
-                        send_embedding(slot, batch_view);
-                        slot.release();
-                        slot.i_batch = -1;
-                        continue; // continue loop of slots
-                    }
-
-                    // prompt evaluated for next-token prediction
-                    slot.state = SLOT_STATE_GENERATING;
-                } else if (slot.state != SLOT_STATE_GENERATING) {
+                // prompt evaluated for embedding
+                if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING) {
+                    send_embedding(slot, batch_view);
+                    slot.release();
+                    slot.i_batch = -1;
                    continue; // continue loop of slots
                }

                completion_token_output result;
-                const llama_token id = gpt_sampler_sample(slot.smpl, ctx, slot.i_batch - i);
+                const llama_token id = llama_sampling_sample(slot.ctx_sampling, ctx, NULL, slot.i_batch - i);

-                gpt_sampler_accept(slot.smpl, id, true);
+                llama_sampling_accept(slot.ctx_sampling, ctx, id, true);

                slot.n_decoded += 1;
                if (slot.n_decoded == 1) {
@@ -2347,19 +2393,37 @@ struct server_context {
                    metrics.on_prompt_eval(slot);
                }

+                llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false };
                result.tok = id;

-                const auto * cur_p = gpt_sampler_get_candidates(slot.smpl);
+                const size_t n_probs = std::min(cur_p.size, (size_t) slot.sparams.n_probs);
+                if (n_probs > 0) {
+                    const size_t n_valid = slot.ctx_sampling->n_valid;

-                for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) {
-                    result.probs.push_back({
-                        cur_p->data[i].id,
-                        i >= cur_p->size ? 0.0f : cur_p->data[i].p,
-                    });
+                    // Make sure at least n_probs top tokens are at the front of the vector:
+                    if (slot.sparams.temp == 0.0f && n_probs > n_valid) {
+                        llama_sample_top_k(ctx, &cur_p, n_probs, 0);
+                    }
+
+                    if (slot.sparams.temp == 0.0f) {
+                        // With greedy sampling the probabilities have possibly not been calculated.
+                        for (size_t i = 0; i < n_probs; ++i) {
+                            result.probs.push_back({
+                                cur_p.data[i].id,
+                                i == 0 ? 1.0f : 0.0f
+                            });
+                        }
+                    } else {
+                        for (size_t i = 0; i < n_probs; ++i) {
+                            result.probs.push_back({
+                                cur_p.data[i].id,
+                                i >= n_valid ? 0.0f : cur_p.data[i].p // Tokens filtered out due to e.g. top_k have 0 probability.
+                            });
+                        }
+                    }
                }

                if (!process_token(result, slot)) {
-                    // release slot because of stop condition
                    slot.release();
                    slot.print_timings();
                    send_final_response(slot);
@@ -2427,10 +2491,14 @@ int main(int argc, char ** argv) {
    // own arguments required by this example
    gpt_params params;

-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)) {
+    if (!gpt_params_parse(argc, argv, params)) {
+        gpt_params_print_usage(argc, argv, params);
        return 1;
    }

+    // parse arguments from environment variables
+    gpt_params_parse_from_env(params);
+
    // TODO: not great to use extern vars
    server_log_json = params.log_json;
    server_verbose = params.verbosity > 0;
@@ -2593,16 +2661,10 @@ int main(int argc, char ** argv) {
        return false;
    };

-    auto middleware_server_state = [&res_error, &state](const httplib::Request & req, httplib::Response & res) {
+    auto middleware_server_state = [&res_error, &state](const httplib::Request &, httplib::Response & res) {
        server_state current_state = state.load();
        if (current_state == SERVER_STATE_LOADING_MODEL) {
-            auto tmp = string_split(req.path, '.');
-            if (req.path == "/" || tmp.back() == "html") {
-                res.set_content(reinterpret_cast<const char*>(loading_html), loading_html_len, "text/html; charset=utf-8");
-                res.status = 503;
-            } else {
-                res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE));
-            }
+            res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE));
            return false;
        }
        return true;
@@ -2642,7 +2704,7 @@ int main(int argc, char ** argv) {
        task.type = SERVER_TASK_TYPE_METRICS;

        ctx_server.queue_results.add_waiting_task_id(task.id);
-        ctx_server.queue_tasks.post(task, true); // high-priority task
+        ctx_server.queue_tasks.post(task);

        // get the result
        server_task_result result = ctx_server.queue_results.recv(task.id);
@@ -2674,7 +2736,7 @@ int main(int argc, char ** argv) {
        task.data.push_back({{"reset_bucket", true}});

        ctx_server.queue_results.add_waiting_task_id(task.id);
-        ctx_server.queue_tasks.post(task, true); // high-priority task
+        ctx_server.queue_tasks.post(task);

        // get the result
        server_task_result result = ctx_server.queue_results.recv(task.id);
@@ -2688,9 +2750,6 @@ int main(int argc, char ** argv) {
        const uint64_t n_tokens_predicted  = data.at("n_tokens_predicted");
        const uint64_t t_tokens_generation = data.at("t_tokens_generation");

-        const uint64_t n_decode_total     = data.at("n_decode_total");
-        const uint64_t n_busy_slots_total = data.at("n_busy_slots_total");
-
        const int32_t kv_cache_used_cells = data.at("kv_cache_used_cells");

        // metrics definition: https://prometheus.io/docs/practices/naming/#metric-names
@@ -2711,14 +2770,6 @@ int main(int argc, char ** argv) {
                    {"name",  "tokens_predicted_seconds_total"},
                    {"help",  "Predict process time"},
                    {"value",  (uint64_t) data.at("t_tokens_generation_total") / 1.e3}
-            }, {
-                    {"name",  "n_decode_total"},
-                    {"help",  "Total number of llama_decode() calls"},
-                    {"value",  n_decode_total}
-            }, {
-                    {"name",  "n_busy_slots_per_decode"},
-                    {"help",  "Average number of busy slots per llama_decode() call"},
-                    {"value",  (float) n_busy_slots_total / (float) n_decode_total}
            }}},
            {"gauge", {{
                    {"name",  "prompt_tokens_seconds"},
@@ -2785,7 +2836,7 @@ int main(int argc, char ** argv) {
        task.data = {
            { "id_slot", id_slot },
            { "filename", filename },
-            { "filepath", filepath },
+            { "filepath", filepath }
        };

        const int id_task = ctx_server.queue_tasks.post(task);
@@ -2815,7 +2866,7 @@ int main(int argc, char ** argv) {
        task.data = {
            { "id_slot", id_slot },
            { "filename", filename },
-            { "filepath", filepath },
+            { "filepath", filepath }
        };

        const int id_task = ctx_server.queue_tasks.post(task);
@@ -2893,7 +2944,7 @@ int main(int argc, char ** argv) {
            { "system_prompt",               ctx_server.system_prompt.c_str() },
            { "default_generation_settings", ctx_server.default_generation_settings_for_props },
            { "total_slots",                 ctx_server.params.n_parallel },
-            { "chat_template",               curr_tmpl.c_str() },
+            { "chat_template",               curr_tmpl.c_str() }
        };

        res_ok(res, data);
@@ -2993,8 +3044,6 @@ int main(int argc, char ** argv) {
                }, [&](json error_data) {
                    server_sent_event(sink, "error", error_data);
                });
-                static const std::string ev_done = "data: [DONE]\n\n";
-                sink.write(ev_done.data(), ev_done.size());
                sink.done();
                return true;
            };
@@ -3006,13 +3055,13 @@ int main(int argc, char ** argv) {
        json models = {
            {"object", "list"},
            {"data", {
-                {
-                    {"id",       params.model_alias},
-                    {"object",   "model"},
-                    {"created",  std::time(0)},
-                    {"owned_by", "llamacpp"},
-                    {"meta",     ctx_server.model_meta()}
-                },
+                 {
+                     {"id",       params.model_alias},
+                     {"object",   "model"},
+                     {"created",  std::time(0)},
+                     {"owned_by", "llamacpp"},
+                     {"meta",     ctx_server.model_meta()}
+                 },
             }}
        };

@@ -3022,39 +3071,12 @@ int main(int argc, char ** argv) {
    const auto handle_tokenize = [&ctx_server, &res_ok](const httplib::Request & req, httplib::Response & res) {
        const json body = json::parse(req.body);

-        json tokens_response = json::array();
+        std::vector<llama_token> tokens;
        if (body.count("content") != 0) {
            const bool add_special = json_value(body, "add_special", false);
-            const bool with_pieces = json_value(body, "with_pieces", false);
-            std::vector<llama_token> tokens = ctx_server.tokenize(body.at("content"), add_special);
-
-            if (with_pieces) {
-                for (const auto& token : tokens) {
-                    std::string piece = llama_token_to_piece(ctx_server.ctx, token);
-                    json piece_json;
-
-                    // Check if the piece is valid UTF-8
-                    if (is_valid_utf8(piece)) {
-                        piece_json = piece;
-                    } else {
-                        // If not valid UTF-8, store as array of byte values
-                        piece_json = json::array();
-                        for (unsigned char c : piece) {
-                            piece_json.push_back(static_cast<int>(c));
-                        }
-                    }
-
-                    tokens_response.push_back({
-                        {"id", token},
-                        {"piece", piece_json}
-                    });
-                }
-            } else {
-                tokens_response = tokens;
-            }
+            tokens = ctx_server.tokenize(body.at("content"), add_special);
        }
-
-        const json data = format_tokenizer_response(tokens_response);
+        const json data = format_tokenizer_response(tokens);
        res_ok(res, data);
    };

--- a/examples/server/tests/features/embeddings.feature
+++ b/examples/server/tests/features/embeddings.feature
@@ -9,11 +9,8 @@ Feature: llama.cpp server
    And   a model alias bert-bge-small
    And   42 as server seed
    And   2 slots
-    # the bert-bge-small model has context size of 512
-    # since the generated prompts are as big as the batch size, we need to set the batch size to 512
-    # ref: https://huggingface.co/BAAI/bge-small-en-v1.5/blob/5c38ec7c405ec4b44b94cc5a9bb96e735b38267a/config.json#L20
-    And   512 as batch size
-    And   512 as ubatch size
+    And   1024 as batch size
+    And   1024 as ubatch size
    And   2048 KV cache size
    And   embeddings extraction
    Then  the server is starting
--- a/examples/server/tests/features/parallel.feature
+++ b/examples/server/tests/features/parallel.feature
@@ -77,35 +77,6 @@ Feature: Parallel
      | disabled  | 128       |
      | enabled   | 64        |

-  Scenario Outline: Multi users with number of prompts exceeding number of slots
-    Given a system prompt You are a writer.
-    And   a model tinyllama-2
-    Given a prompt:
-      """
-      Write a very long book.
-      """
-    And a prompt:
-      """
-      Write another a poem.
-      """
-    And a prompt:
-      """
-      What is LLM?
-      """
-    And a prompt:
-      """
-      The sky is blue and I love it.
-      """
-    And <n_predict> max tokens to predict
-    And streaming is <streaming>
-    Given concurrent OAI completions requests
-    Then the server is busy
-    Then the server is idle
-    Then all prompts are predicted with <n_predict> tokens
-    Examples:
-      | streaming | n_predict |
-      | disabled  | 128       |
-      | enabled   | 64        |

  Scenario:  Multi users with total number of tokens to predict exceeds the KV Cache size #3969
    Given a prompt:
--- a/examples/server/tests/features/passkey.feature
+++ b/examples/server/tests/features/passkey.feature
@@ -15,7 +15,6 @@ Feature: Passkey / Self-extend with context shift
    And   <n_junk> as number of junk
    And   <n_predicted> server max tokens to predict
    And   42 as seed
-    And   0.0 temperature
    And   <n_ctx> KV cache size
    And   1 slots
    And   <n_ga> group attention factor to extend context size through self-extend
@@ -23,8 +22,7 @@ Feature: Passkey / Self-extend with context shift
    # Can be override with N_GPU_LAYERS
    And   <ngl> GPU offloaded layers
    Then  the server is starting
-    # Higher timeout because the model may need to be downloaded from the internet
-    Then  the server is healthy with timeout 120 seconds
+    Then  the server is healthy
    Given available models
    Then  model 0 is trained on <n_ctx_train> tokens context
    Given a prefix prompt:
--- a/examples/server/tests/features/server.feature
+++ b/examples/server/tests/features/server.feature
@@ -105,14 +105,6 @@ Feature: llama.cpp server
    Given first token is removed
    Then  tokens can be detokenized

-  Scenario: Tokenize with pieces
-    When  tokenizing with pieces:
-    """
-    What is the capital of Germany?
-    媽
-    """
-    Then  tokens are given with pieces
-
  Scenario: Models available
    Given available models
    Then  1 models are supported
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -1,6 +1,3 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-
 import asyncio
 import json
 import os
@@ -205,15 +202,17 @@ def step_start_server(context):
            time.sleep(0.1)


-async def wait_for_server_status_with_timeout(context, expecting_status: Literal['healthy', 'ready', 'idle', 'busy'] | str, timeout: int):
+@step("the server is {expecting_status}")
+@async_run_until_complete
+async def step_wait_for_the_server_to_be_started(context, expecting_status: Literal['healthy', 'ready', 'idle', 'busy'] | str):
    match expecting_status:
        case 'healthy':
            await wait_for_slots_status(context, context.base_url, 200,
-                                        timeout=timeout)
+                                        timeout=30)

        case 'ready' | 'idle':
            await wait_for_slots_status(context, context.base_url, 200,
-                                        timeout=timeout,
+                                        timeout=30,
                                        params={'fail_on_no_slot': 1},
                                        slots_idle=context.n_slots,
                                        slots_processing=0)
@@ -226,18 +225,6 @@ async def wait_for_server_status_with_timeout(context, expecting_status: Literal
            assert False, "unknown status"


-@step("the server is {expecting_status} with timeout {timeout:d} seconds")
-@async_run_until_complete
-async def step_wait_for_server_status_with_timeout(context, expecting_status: Literal['healthy', 'ready', 'idle', 'busy'] | str, timeout: int):
-    await wait_for_server_status_with_timeout(context, expecting_status, timeout)
-
-
-@step("the server is {expecting_status}")
-@async_run_until_complete
-async def step_wait_for_server_status(context, expecting_status: Literal['healthy', 'ready', 'idle', 'busy'] | str):
-    await wait_for_server_status_with_timeout(context, expecting_status, 30)
-
-
@step('all slots are {expected_slot_status_string}')
@async_run_until_complete
 async def step_all_slots_status(context, expected_slot_status_string: Literal['idle', 'busy'] | str):
@@ -700,32 +687,6 @@ def step_tokenize_set_add_special(context):
    context.tokenize_add_special = True


-@step("tokenizing with pieces")
-@async_run_until_complete
-async def step_tokenize_with_pieces(context):
-    context.tokenized_text = context_text(context)
-    async with aiohttp.ClientSession() as session:
-        tokenize_args = {"content": context.tokenized_text, "with_pieces": True}
-        if getattr(context, "tokenize_add_special", None) is not None:
-            tokenize_args["add_special"] = context.tokenize_add_special
-
-        async with session.post(
-            f"{context.base_url}/tokenize", json=tokenize_args
-        ) as response:
-            assert response.status == 200
-            tokenize_json = await response.json()
-            context.tokens_with_pieces = tokenize_json["tokens"]
-
-
-@step("tokens are given with pieces")
-@async_run_until_complete
-async def step_tokenize_with_pieces(context):
-    # Verify that the response contains both token IDs and pieces
-    assert all(
-        "id" in token and "piece" in token for token in context.tokens_with_pieces
-    )
-
-
@step('tokenizing')
@async_run_until_complete
 async def step_tokenize(context):
@@ -1020,8 +981,6 @@ async def oai_chat_completions(user_prompt,
                            event_data = line.split(': ', 1)
                            assert event_data[0] == 'data', f'Bad event code received: ```{event_data}```'
                            chunk_raw = event_data[1]
-                            if chunk_raw == '[DONE]':
-                                break

                            chunk = json.loads(chunk_raw)
                            assert len(chunk['choices']) == 1, f"no choices provided, line ```{line}```"
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -616,40 +616,7 @@ static json format_embeddings_response_oaicompat(const json & request, const jso
    return res;
 }

-static bool is_valid_utf8(const std::string & str) {
-    const unsigned char* bytes = reinterpret_cast<const unsigned char*>(str.data());
-    const unsigned char* end = bytes + str.length();
-
-    while (bytes < end) {
-        if (*bytes <= 0x7F) {
-            // 1-byte sequence (0xxxxxxx)
-            bytes++;
-        } else if ((*bytes & 0xE0) == 0xC0) {
-            // 2-byte sequence (110xxxxx 10xxxxxx)
-            if (end - bytes < 2 || (bytes[1] & 0xC0) != 0x80)
-                return false;
-            bytes += 2;
-        } else if ((*bytes & 0xF0) == 0xE0) {
-            // 3-byte sequence (1110xxxx 10xxxxxx 10xxxxxx)
-            if (end - bytes < 3 || (bytes[1] & 0xC0) != 0x80 || (bytes[2] & 0xC0) != 0x80)
-                return false;
-            bytes += 3;
-        } else if ((*bytes & 0xF8) == 0xF0) {
-            // 4-byte sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
-            if (end - bytes < 4 || (bytes[1] & 0xC0) != 0x80 ||
-                (bytes[2] & 0xC0) != 0x80 || (bytes[3] & 0xC0) != 0x80)
-                return false;
-            bytes += 4;
-        } else {
-            // Invalid UTF-8 lead byte
-            return false;
-        }
-    }
-
-    return true;
-}
-
-static json format_tokenizer_response(const json & tokens) {
+static json format_tokenizer_response(const std::vector<llama_token> & tokens) {
    return json {
        {"tokens", tokens}
    };
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -1,4 +1,3 @@
-#include "arg.h"
 #include "common.h"
 #include "llama.h"

@@ -7,7 +6,9 @@
 #include <string>
 #include <vector>

-static void print_usage(int, char ** argv) {
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+    gpt_params_print_usage(argc, argv, params);
+
    LOG_TEE("\nexample usage:\n");
    LOG_TEE("\n    %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]);
    LOG_TEE("\n");
@@ -19,7 +20,8 @@ int main(int argc, char ** argv) {
    params.prompt = "Hello my name is";
    params.n_predict = 32;

-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
+    if (!gpt_params_parse(argc, argv, params)) {
+        print_usage(argc, argv, params);
        return 1;
    }

@@ -53,14 +55,6 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    auto sparams = llama_sampler_chain_default_params();
-
-    sparams.no_perf = false;
-
-    llama_sampler * smpl = llama_sampler_chain_init(sparams);
-
-    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
-
    // tokenize the prompt

    std::vector<llama_token> tokens_list;
@@ -116,7 +110,20 @@ int main(int argc, char ** argv) {
    while (n_cur <= n_predict) {
        // sample the next token
        {
-            const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1);
+            auto   n_vocab = llama_n_vocab(model);
+            auto * logits  = llama_get_logits_ith(ctx, batch.n_tokens - 1);
+
+            std::vector<llama_token_data> candidates;
+            candidates.reserve(n_vocab);
+
+            for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+                candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
+            }
+
+            llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+
+            // sample the most likely token
+            const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);

            // is it an end of generation?
            if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
@@ -153,14 +160,12 @@ int main(int argc, char ** argv) {
    LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
            __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));

-    LOG_TEE("\n");
-    llama_perf_sampler_print(smpl);
-    llama_perf_context_print(ctx);
+    llama_print_timings(ctx);

    fprintf(stderr, "\n");

    llama_batch_free(batch);
-    llama_sampler_free(smpl);
+
    llama_free(ctx);
    llama_free_model(model);

--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -1,13 +1,11 @@
-#include "arg.h"
 #include "common.h"
-#include "sampling.h"
 #include "llama.h"

+#include <cmath>
 #include <cstdio>
 #include <string>
 #include <vector>
 #include <set>
-#include <random>

 #define SPEC_VOCAB_MAX_SIZE_DIFFERENCE  100
 #define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
@@ -23,13 +21,14 @@ struct seq_draft {
    std::vector<llama_token> tokens;
    std::vector<std::vector<llama_token_data>> dists;

-    struct gpt_sampler * smpl = nullptr;
+    struct llama_sampling_context * ctx_sampling;
 };

 int main(int argc, char ** argv) {
    gpt_params params;

-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {
+    if (!gpt_params_parse(argc, argv, params)) {
+        gpt_params_print_usage(argc, argv, params);
        return 1;
    }

@@ -44,7 +43,10 @@ int main(int argc, char ** argv) {
    // probability threshold for splitting a draft branch (only for n_seq_dft > 1)
    const float p_split  = params.p_split;

-    std::default_random_engine rng(params.sparams.seed);
+    if (params.seed == LLAMA_DEFAULT_SEED) {
+        params.seed = time(NULL);
+    }
+    std::default_random_engine rng(params.seed);
    std::uniform_real_distribution<> u_dist;

 #ifndef LOG_DISABLE_LOGS
@@ -177,17 +179,19 @@ int main(int argc, char ** argv) {
    // used to determine end of generation
    bool has_eos = false;

-    // target model sampling context (reuse the llama_context's sampling instance)
-    struct gpt_sampler * smpl = gpt_sampler_init(model_tgt, params.sparams);
-
-    struct llama_sampler * softmax = llama_sampler_init_softmax();
+    // target model sampling context
+    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);

    // draft sequence data
    std::vector<seq_draft> drafts(n_seq_dft);

+    params.sparams.grammar.clear(); // the draft samplers will copy the target sampler's grammar
+    if (params.sparams.temp == 0) {
+        params.sparams.temp = -1.0f; // force greedy sampling with probs for the draft model
+    }
+
    for (int s = 0; s < n_seq_dft; ++s) {
-        // allocate gpt_sampler for each draft sequence
-        drafts[s].smpl = gpt_sampler_init(model_dft, params.sparams);
+        drafts[s].ctx_sampling = llama_sampling_init(params.sparams);
    }

    llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1);
@@ -229,12 +233,12 @@ int main(int argc, char ** argv) {
                bool accept = false;
                if (params.sparams.temp > 0) {
                    // stochastic verification
-                    gpt_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft], true);

-                    auto & dist_tgt = *gpt_sampler_get_candidates(smpl);
+                    llama_token_data_array dist_tgt = llama_sampling_prepare(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft], true, NULL);
+                    llama_sample_softmax(ctx_tgt, &dist_tgt);
+                    float p_tgt = 0, p_dft = 0;

-                    float p_tgt = 0.0f;
-                    float p_dft = 0.0f;
+                    // GGML_ASSERT(dist_tgt.size() == dist_dft.size());

                    while (active_seqs.size() > 0) {
                        // randomly select a sequence to verify from active sequences
@@ -253,13 +257,9 @@ int main(int argc, char ** argv) {
                            }
                            continue;
                        }
-
                        LOG("verifying sequence #%d at pos #%d from %d active sequence(s)\n", s, i_dft, (int) active_seqs.size());
                        float r = u_dist(rng);
-                        llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data() , drafts[s].dists[i_dft].size(), LLAMA_TOKEN_NULL, true };
-
-                        //GGML_ASSERT(dist_tgt.size <= dist_dft.size);
-
+                        llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data() , drafts[s].dists[i_dft].size(), true };
                        // acquire the token probabilities assigned by the draft and target models
                        for (size_t i = 0; i < dist_tgt.size; i++) {
                            if (dist_tgt.data[i].id == drafts[s].tokens[i_dft]) {
@@ -278,7 +278,7 @@ int main(int argc, char ** argv) {
                            accept = true;
                            token_id = drafts[s].tokens[i_dft];
                            token_str = llama_token_to_piece(ctx_tgt, token_id);
-                            gpt_sampler_accept(smpl, token_id, true);
+                            llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true);

                            LOG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str());
                            break;
@@ -289,6 +289,7 @@ int main(int argc, char ** argv) {
                            // calculate residual probability
                            GGML_ASSERT(dist_tgt.sorted);
                            GGML_ASSERT(dist_dft.sorted);
+                            float sum_probs = 0.0f;

                            // sort dist by id
                            std::sort(dist_tgt.data, dist_tgt.data + dist_tgt.size, [](const llama_token_data &a, const llama_token_data &b) {
@@ -298,18 +299,10 @@ int main(int argc, char ** argv) {
                                return a.id < b.id;
                            });

-                            float sum_probs = 0.0f;
-
                            for (size_t i = 0; i < dist_tgt.size; i++) {
-                                if (i < dist_dft.size) {
-                                    dist_tgt.data[i].p = std::max(0.0f, dist_tgt.data[i].p - dist_dft.data[i].p);
-                                } else {
-                                    dist_tgt.data[i].p = std::max(0.0f, dist_tgt.data[i].p);
-                                }
-
+                                dist_tgt.data[i].p = std::max(0.0f, dist_tgt.data[i].p - dist_dft.data[i].p);
                                sum_probs += dist_tgt.data[i].p;
                            }
-
                            for (size_t i = 0; i < dist_tgt.size; i++) {
                                dist_tgt.data[i].p /= sum_probs;
                            }
@@ -339,29 +332,21 @@ int main(int argc, char ** argv) {
                        // all drafted tokens were rejected
                        // sample from the target model
                        LOG("all drafted tokens were rejected, sampling from residual distribution\n");
-                        std::vector<float> probs(dist_tgt.size);
-                        for (size_t i = 0; i < dist_tgt.size; ++i) {
-                            probs[i] = dist_tgt.data[i].p;
-                        }
-
-                        std::discrete_distribution<> dist(probs.begin(), probs.end());
-
-                        const int idx = dist(rng);
-
-                        token_id = dist_tgt.data[idx].id;
-                        gpt_sampler_accept(smpl, token_id, true);
+                        token_id = llama_sample_token(ctx_tgt, &dist_tgt);
+                        llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true);
                        token_str = llama_token_to_piece(ctx_tgt, token_id);
                    }
+
                } else {
                    // greedy verification

                    // sample from the target model
                    LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
-                    token_id = gpt_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]);
+                    token_id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]);

-                    gpt_sampler_accept(smpl, token_id, true);
+                    llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true);

-                    //LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, smpl->prev).c_str());
+                    //LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str());

                    token_str = llama_token_to_piece(ctx_tgt, token_id);

@@ -449,10 +434,7 @@ int main(int argc, char ** argv) {
            break;
        }

-        if (drafts[0].smpl) {
-            gpt_sampler_free(drafts[0].smpl);
-        }
-        drafts[0].smpl = gpt_sampler_clone(smpl);
+        llama_sampling_cp(ctx_sampling, drafts[0].ctx_sampling);

        int n_seq_cur  = 1;
        int n_past_cur = n_past_dft;
@@ -481,20 +463,20 @@ int main(int argc, char ** argv) {
                    continue;
                }

-                gpt_sampler_sample(drafts[s].smpl, ctx_dft, drafts[s].i_batch_dft, true);
+                llama_sampling_sample(drafts[s].ctx_sampling, ctx_dft, NULL, drafts[s].i_batch_dft);

-                const auto * cur_p = gpt_sampler_get_candidates(drafts[s].smpl);
+                const auto & cur_p = drafts[s].ctx_sampling->cur;

-                for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p->size); ++k) {
+                for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p.size()); ++k) {
                    LOG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n",
-                            k, s, i, cur_p->data[k].id, cur_p->data[k].p, llama_token_to_piece(ctx_dft, cur_p->data[k].id).c_str());
+                            k, s, i, cur_p[k].id, cur_p[k].p, llama_token_to_piece(ctx_dft, cur_p[k].id).c_str());
                }

                std::vector<int> sa(1, s);

                // attempt to split the branch if the probability is high enough
                for (int f = 1; f < 8; ++f) {
-                    if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_split) {
+                    if (n_seq_cur < n_seq_dft && cur_p[f].p > p_split) {
                        LOG("splitting seq %3d into %3d\n", s, n_seq_cur);

                        llama_kv_cache_seq_rm(ctx_dft,    n_seq_cur, -1, -1);
@@ -521,10 +503,7 @@ int main(int argc, char ** argv) {
                        drafts[n_seq_cur].i_batch_dft = drafts[s].i_batch_dft;
                        drafts[n_seq_cur].i_batch_tgt = drafts[s].i_batch_tgt;

-                        if (drafts[n_seq_cur].smpl) {
-                            gpt_sampler_free(drafts[n_seq_cur].smpl);
-                        }
-                        drafts[n_seq_cur].smpl = gpt_sampler_clone(drafts[s].smpl);
+                        llama_sampling_cp(drafts[s].ctx_sampling, drafts[n_seq_cur].ctx_sampling);

                        sa.push_back(n_seq_cur);

@@ -536,15 +515,15 @@ int main(int argc, char ** argv) {

                // add drafted token for each sequence
                for (int is = 0; is < (int) sa.size(); ++is) {
-                    const llama_token id = cur_p->data[is].id;
+                    const llama_token id = cur_p[is].id;

                    const int s = sa[is];

-                    gpt_sampler_accept(drafts[s].smpl, id, true);
+                    llama_sampling_accept(drafts[s].ctx_sampling, ctx_dft, id, true);

                    drafts[s].tokens.push_back(id);
                    // save cur_p.data into drafts[s].dists
-                    drafts[s].dists.push_back({cur_p->data, cur_p->data + cur_p->size});
+                    drafts[s].dists.push_back(cur_p);

                    // add unique drafted tokens to the target batch
                    drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens);
@@ -614,19 +593,17 @@ int main(int argc, char ** argv) {
    LOG_TEE("n_accept  = %d\n", n_accept);
    LOG_TEE("accept    = %.3f%%\n", 100.0f * n_accept / n_drafted);

-    LOG_TEE("\ndraft:\n\n");
-    // TODO: print sampling/grammar timings for all drafts
-    llama_perf_context_print(ctx_dft);
+    LOG_TEE("\ndraft:\n");
+    llama_print_timings(ctx_dft);

-    LOG_TEE("\ntarget:\n\n");
-    gpt_perf_print(ctx_tgt, smpl);
+    LOG_TEE("\ntarget:\n");
+    llama_print_timings(ctx_tgt);

-    gpt_sampler_free(smpl);
+    llama_sampling_free(ctx_sampling);
    for (int s = 0; s < n_seq_dft; ++s) {
-        gpt_sampler_free(drafts[s].smpl);
+        llama_sampling_free(drafts[s].ctx_sampling);
    }

-    llama_sampler_free(softmax);
    llama_batch_free(batch_dft);

    llama_free(ctx_tgt);
--- a/examples/sycl/run-llama2.sh
+++ b/examples/sycl/run-llama2.sh
@@ -4,23 +4,33 @@
 #  Copyright (C) 2024 Intel Corporation
 #  SPDX-License-Identifier: MIT

+INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
 source /opt/intel/oneapi/setvars.sh

-#export GGML_SYCL_DEBUG=1
-
-#ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer.
-
-INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
-MODEL_FILE=llama-2-7b.Q4_0.gguf
-NGL=33
-
 if [ $# -gt 0 ]; then
    GGML_SYCL_DEVICE=$1
+    GGML_SYCL_SINGLE_GPU=1
+else
+    GGML_SYCL_DEVICE=0
+    GGML_SYCL_SINGLE_GPU=0
+fi
+
+#export GGML_SYCL_DEBUG=1
+
+
+#ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer.
+
+if [ $GGML_SYCL_SINGLE_GPU -eq 1 ]; then
    echo "use $GGML_SYCL_DEVICE as main GPU"
    #use signle GPU only
-    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -mg $GGML_SYCL_DEVICE -sm none
-
+    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
 else
    #use multiple GPUs with same max compute units
-    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0
+    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
 fi
+
+#use main GPU only
+#ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
+
+#use multiple GPUs with same max compute units
+#ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
--- a/flake.lock
+++ b/flake.lock
@@ -5,11 +5,11 @@
        "nixpkgs-lib": "nixpkgs-lib"
      },
      "locked": {
-        "lastModified": 1725234343,
-        "narHash": "sha256-+ebgonl3NbiKD2UD0x4BszCZQ6sTfL4xioaM49o5B3Y=",
+        "lastModified": 1725024810,
+        "narHash": "sha256-ODYRm8zHfLTH3soTFWE452ydPYz2iTvr9T8ftDMUQ3E=",
        "owner": "hercules-ci",
        "repo": "flake-parts",
-        "rev": "567b938d64d4b4112ee253b9274472dc3a346eb6",
+        "rev": "af510d4a62d071ea13925ce41c95e3dec816c01d",
        "type": "github"
      },
      "original": {
@@ -20,11 +20,11 @@
    },
    "nixpkgs": {
      "locked": {
-        "lastModified": 1725634671,
-        "narHash": "sha256-v3rIhsJBOMLR8e/RNWxr828tB+WywYIoajrZKFM+0Gg=",
+        "lastModified": 1724819573,
+        "narHash": "sha256-GnR7/ibgIH1vhoy8cYdmXE6iyZqKqFxQSVkFgosBh6w=",
        "owner": "NixOS",
        "repo": "nixpkgs",
-        "rev": "574d1eac1c200690e27b8eb4e24887f8df7ac27c",
+        "rev": "71e91c409d1e654808b2621f28a327acfdad8dc2",
        "type": "github"
      },
      "original": {
@@ -36,14 +36,14 @@
    },
    "nixpkgs-lib": {
      "locked": {
-        "lastModified": 1725233747,
-        "narHash": "sha256-Ss8QWLXdr2JCBPcYChJhz4xJm+h/xjl4G0c0XlP6a74=",
+        "lastModified": 1722555339,
+        "narHash": "sha256-uFf2QeW7eAHlYXuDktm9c25OxOyCoUOQmh5SZ9amE5Q=",
        "type": "tarball",
-        "url": "https://github.com/NixOS/nixpkgs/archive/356624c12086a18f2ea2825fed34523d60ccc4e3.tar.gz"
+        "url": "https://github.com/NixOS/nixpkgs/archive/a5d394176e64ab29c852d03346c1fc9b0b7d33eb.tar.gz"
      },
      "original": {
        "type": "tarball",
-        "url": "https://github.com/NixOS/nixpkgs/archive/356624c12086a18f2ea2825fed34523d60ccc4e3.tar.gz"
+        "url": "https://github.com/NixOS/nixpkgs/archive/a5d394176e64ab29c852d03346c1fc9b0b7d33eb.tar.gz"
      }
    },
    "root": {
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -135,7 +135,6 @@ option(GGML_VULKAN                          "ggml: use Vulkan"
 option(GGML_VULKAN_CHECK_RESULTS            "ggml: run Vulkan op checks"                      OFF)
 option(GGML_VULKAN_DEBUG                    "ggml: enable Vulkan debug output"                OFF)
 option(GGML_VULKAN_MEMORY_DEBUG             "ggml: enable Vulkan memory debug output"         OFF)
-option(GGML_VULKAN_SHADER_DEBUG_INFO        "ggml: enable Vulkan shader debug info"           OFF)
 option(GGML_VULKAN_PERF                     "ggml: enable Vulkan perf output"                 OFF)
 option(GGML_VULKAN_VALIDATE                 "ggml: enable Vulkan validation"                  OFF)
 option(GGML_VULKAN_RUN_TESTS                "ggml: run Vulkan tests"                          OFF)
--- a/ggml/include/ggml-cann.h
+++ b/ggml/include/ggml-cann.h
@@ -80,13 +80,6 @@ ggml_backend_cann_buffer_type(int32_t device);
 */
 GGML_API GGML_CALL int32_t ggml_backend_cann_get_device_count(void);

-/**
- * @brief pinned host buffer for use with the CPU backend for faster copies between CPU and NPU.
- *
- * @return A pointer to the host buffer type interface.
- */
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
-
 /**
 * @brief Retrieves the description of a specific CANN device.
 *
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -358,7 +358,6 @@ extern "C" {

    struct ggml_object;
    struct ggml_context;
-    struct ggml_cgraph;

    // NOTE: always add types at the end of the enum to keep backward compatibility
    enum ggml_type {
@@ -396,8 +395,6 @@ extern "C" {
        GGML_TYPE_Q4_0_4_4 = 31,
        GGML_TYPE_Q4_0_4_8 = 32,
        GGML_TYPE_Q4_0_8_8 = 33,
-        GGML_TYPE_TQ1_0   = 34,
-        GGML_TYPE_TQ2_0   = 35,
        GGML_TYPE_COUNT,
    };

@@ -576,9 +573,23 @@ extern "C" {
        GGML_TENSOR_FLAG_PARAM  = 4,
    };

+    // ggml object
+    struct ggml_object {
+        size_t offs;
+        size_t size;
+
+        struct ggml_object * next;
+
+        enum ggml_object_type type;
+
+        char padding[4];
+    };
+
+    static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
+
    // n-dimensional tensor
    struct ggml_tensor {
-        enum ggml_type type;
+        enum ggml_type         type;

        GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor");

@@ -642,7 +653,7 @@ extern "C" {

    struct ggml_threadpool;     // forward declaration, see ggml.c

-    typedef struct ggml_threadpool * ggml_threadpool_t;
+    typedef struct  ggml_threadpool * ggml_threadpool_t;

    // the compute plan that needs to be prepared for ggml_graph_compute()
    // since https://github.com/ggerganov/ggml/issues/287
@@ -658,6 +669,35 @@ extern "C" {
        void *              abort_callback_data;
    };

+    enum ggml_cgraph_eval_order {
+        GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
+        GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
+        GGML_CGRAPH_EVAL_ORDER_COUNT
+    };
+
+    typedef uint32_t ggml_bitset_t;
+
+    struct ggml_hash_set {
+        size_t size;
+        ggml_bitset_t * used;
+        struct ggml_tensor ** keys;
+    };
+
+    // computation graph
+    struct ggml_cgraph {
+        int size;
+        int n_nodes;
+        int n_leafs;
+
+        struct ggml_tensor ** nodes;
+        struct ggml_tensor ** grads;
+        struct ggml_tensor ** leafs;
+
+        struct ggml_hash_set visited_hash_set;
+
+        enum ggml_cgraph_eval_order order;
+    };
+
    // scratch buffer
    struct ggml_scratch {
        size_t offs;
@@ -1230,7 +1270,7 @@ extern "C" {
            size_t                nb1,
            size_t                nb2,
            size_t                nb3,
-            size_t                offset); // in bytes
+            size_t                offset);

    // b -> view(a,offset,nb1,nb2,3), return view(a)
    GGML_API struct ggml_tensor * ggml_set_inplace(
@@ -1240,19 +1280,19 @@ extern "C" {
            size_t                nb1,
            size_t                nb2,
            size_t                nb3,
-            size_t                offset); // in bytes
+            size_t                offset);

    GGML_API struct ggml_tensor * ggml_set_1d(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * b,
-            size_t                offset); // in bytes
+            size_t                offset);

    GGML_API struct ggml_tensor * ggml_set_1d_inplace(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * b,
-            size_t                offset); // in bytes
+            size_t                offset);

    // b -> view(a,offset,nb1,nb2,3), return modified a
    GGML_API struct ggml_tensor * ggml_set_2d(
@@ -1260,7 +1300,7 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b,
            size_t                nb1,
-            size_t                offset); // in bytes
+            size_t                offset);

    // b -> view(a,offset,nb1,nb2,3), return view(a)
    GGML_API struct ggml_tensor * ggml_set_2d_inplace(
@@ -1268,7 +1308,7 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b,
            size_t                nb1,
-            size_t                offset); // in bytes
+            size_t                offset);

    // a -> b, return view(b)
    GGML_API struct ggml_tensor * ggml_cpy(
@@ -1975,6 +2015,8 @@ extern "C" {
    typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
    typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);

+    #define GGML_N_TASKS_MAX (-1) // parenthesized so the negative literal expands as a single operand
+
    GGML_API struct ggml_tensor * ggml_map_custom1(
            struct ggml_context   * ctx,
            struct ggml_tensor    * a,
@@ -2044,35 +2086,30 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * tensor);

+
    GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
    GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);

    // graph allocation in a context
-    GGML_API struct ggml_cgraph * ggml_new_graph       (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
-    GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads);
-    GGML_API struct ggml_cgraph * ggml_graph_dup       (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
-    GGML_API void                 ggml_graph_cpy       (struct ggml_cgraph * src, struct ggml_cgraph * dst);
-    GGML_API void                 ggml_graph_reset     (struct ggml_cgraph * cgraph);  // zero grads
-    GGML_API void                 ggml_graph_clear     (struct ggml_cgraph * cgraph);
-
-    GGML_API int                   ggml_graph_size   (struct ggml_cgraph * cgraph);
-    GGML_API struct ggml_tensor *  ggml_graph_node   (struct ggml_cgraph * cgraph, int i); // if i < 0, returns nodes[n_nodes + i]
-    GGML_API struct ggml_tensor ** ggml_graph_nodes  (struct ggml_cgraph * cgraph);
-    GGML_API int                   ggml_graph_n_nodes(struct ggml_cgraph * cgraph);
-
-    GGML_API void   ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
+    GGML_API struct ggml_cgraph * ggml_new_graph         (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
+    GGML_API struct ggml_cgraph * ggml_new_graph_custom  (struct ggml_context * ctx, size_t size, bool grads);
+    GGML_API struct ggml_cgraph * ggml_graph_dup         (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
+    GGML_API struct ggml_cgraph   ggml_graph_view        (struct ggml_cgraph * cgraph, int i0, int i1);
+    GGML_API void                 ggml_graph_cpy         (struct ggml_cgraph * src, struct ggml_cgraph * dst);
+    GGML_API void                 ggml_graph_reset       (struct ggml_cgraph * cgraph);  // zero grads
+    GGML_API void                 ggml_graph_clear       (struct ggml_cgraph * cgraph);

    GGML_API size_t ggml_graph_overhead(void);
    GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);

-    GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
-    GGML_API void                          ggml_threadpool_params_init   (struct ggml_threadpool_params * p, int n_threads);
-    GGML_API bool                          ggml_threadpool_params_match  (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
-    GGML_API struct ggml_threadpool *      ggml_threadpool_new          (struct ggml_threadpool_params  * params);
-    GGML_API void                          ggml_threadpool_free         (struct ggml_threadpool * threadpool);
-    GGML_API int                           ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
-    GGML_API void                          ggml_threadpool_pause        (struct ggml_threadpool * threadpool);
-    GGML_API void                          ggml_threadpool_resume       (struct ggml_threadpool * threadpool);
+    GGML_API struct ggml_threadpool_params   ggml_threadpool_params_default(int n_threads);
+    GGML_API void                            ggml_threadpool_params_init  (struct ggml_threadpool_params *p, int n_threads);
+    GGML_API bool                            ggml_threadpool_params_match (const struct ggml_threadpool_params *p0, const struct ggml_threadpool_params *p1);
+    GGML_API struct ggml_threadpool*         ggml_threadpool_new          (struct ggml_threadpool_params  * params);
+    GGML_API void                            ggml_threadpool_free         (struct ggml_threadpool * threadpool);
+    GGML_API int                             ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
+    GGML_API void                            ggml_threadpool_pause        (struct ggml_threadpool * threadpool);
+    GGML_API void                            ggml_threadpool_resume       (struct ggml_threadpool * threadpool);

    // ggml_graph_plan() has to be called before ggml_graph_compute()
    // when plan.work_size > 0, caller must allocate memory for plan.work_data
@@ -2470,7 +2507,6 @@ extern "C" {
    GGML_API int ggml_cpu_has_gpublas    (void);
    GGML_API int ggml_cpu_has_sse3       (void);
    GGML_API int ggml_cpu_has_ssse3      (void);
-    GGML_API int ggml_cpu_has_riscv_v    (void);
    GGML_API int ggml_cpu_has_sycl       (void);
    GGML_API int ggml_cpu_has_rpc        (void);
    GGML_API int ggml_cpu_has_vsx        (void);
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -26,9 +26,6 @@ if (NOT MSVC)
    endif()
 endif()

-unset(GGML_EXTRA_LIBS_PRIVATE)
-unset(GGML_EXTRA_LIBS_PUBLIC)
-
 if (APPLE AND GGML_ACCELERATE)
    find_library(ACCELERATE_FRAMEWORK Accelerate)
    if (ACCELERATE_FRAMEWORK)
@@ -38,7 +35,7 @@ if (APPLE AND GGML_ACCELERATE)
        add_compile_definitions(ACCELERATE_NEW_LAPACK)
        add_compile_definitions(ACCELERATE_LAPACK_ILP64)

-        list(APPEND GGML_EXTRA_LIBS_PRIVATE ${ACCELERATE_FRAMEWORK})
+        set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK})
    else()
        message(WARNING "Accelerate framework not found")
    endif()
@@ -90,7 +87,7 @@ if (GGML_METAL)
            COMMENT "Generate assembly for embedded Metal library"
        )

-        list(APPEND GGML_SOURCES_METAL ${METALLIB_EMBED_ASM})
+        set(GGML_SOURCES_METAL ${GGML_SOURCES_METAL} ${METALLIB_EMBED_ASM})
    else()
        if (GGML_METAL_SHADER_DEBUG)
            # custom command to do the following:
@@ -120,7 +117,7 @@ if (GGML_METAL)

        add_custom_command(
            OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
-            COMMAND xcrun -sdk macosx metal    ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
+            COMMAND xcrun -sdk macosx metal    ${XC_FLAGS} -c ${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal       -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
            COMMAND xcrun -sdk macosx metallib                ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air   -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
            COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
            COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h
@@ -135,7 +132,7 @@ if (GGML_METAL)
            )
    endif() # GGML_METAL_EMBED_LIBRARY

-    list(APPEND GGML_EXTRA_LIBS_PRIVATE
+    set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS}
        ${FOUNDATION_LIBRARY}
        ${METAL_FRAMEWORK}
        ${METALKIT_FRAMEWORK}
@@ -160,11 +157,11 @@ if (GGML_OPENMP)

        add_compile_definitions(GGML_USE_OPENMP)

-        list(APPEND GGML_EXTRA_LIBS_PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
+        set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} OpenMP::OpenMP_C OpenMP::OpenMP_CXX)

        if (GGML_MUSA)
-            list(APPEND GGML_EXTRA_INCLUDES     "/usr/lib/llvm-10/include/openmp")
-            list(APPEND GGML_EXTRA_LIBS_PRIVATE "/usr/lib/llvm-10/lib/libomp.so")
+            set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} "/usr/lib/llvm-10/include/openmp")
+            set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} "/usr/lib/llvm-10/lib/libomp.so")
        endif()
    else()
        message(WARNING "OpenMP not found")
@@ -247,8 +244,8 @@ if (GGML_BLAS)
        set(GGML_HEADERS_BLAS ../include/ggml-blas.h)
        set(GGML_SOURCES_BLAS ggml-blas.cpp)

-        list(APPEND GGML_EXTRA_LIBS_PRIVATE ${BLAS_LIBRARIES})
-        list(APPEND GGML_EXTRA_INCLUDES     ${BLAS_INCLUDE_DIRS})
+        set(GGML_EXTRA_LIBS     ${GGML_EXTRA_LIBS}     ${BLAS_LIBRARIES})
+        set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})
    else()
        message(WARNING "BLAS not found, please refer to "
        "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
@@ -371,19 +368,19 @@ if (GGML_CUDA)
        if (GGML_STATIC)
            if (WIN32)
                # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
-                list(APPEND GGML_EXTRA_LIBS_PRIVATE CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
+                set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
            else ()
                if (GGML_MUSA)
-                    list(APPEND GGML_EXTRA_LIBS_PRIVATE MUSA::musart_static MUSA::mublas_static)
+                    set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} MUSA::musart_static MUSA::mublas_static)
                else()
-                    list(APPEND GGML_EXTRA_LIBS_PRIVATE CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
+                    set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
                endif()
            endif()
        else()
            if (GGML_MUSA)
-                list(APPEND GGML_EXTRA_LIBS_PRIVATE MUSA::musart MUSA::mublas)
+                set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} MUSA::musart MUSA::mublas)
            else()
-                list(APPEND GGML_EXTRA_LIBS_PRIVATE CUDA::cudart CUDA::cublas CUDA::cublasLt)
+                set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
            endif()
        endif()

@@ -391,9 +388,9 @@ if (GGML_CUDA)
            # No VMM requested, no need to link directly with the cuda driver lib (libcuda.so)
        else()
            if (GGML_MUSA)
-                list(APPEND GGML_EXTRA_LIBS_PRIVATE MUSA::musa_driver) # required by muDeviceGetAttribute(), muMemGetAllocationGranularity(...), ...
+                set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} MUSA::musa_driver) # required by muDeviceGetAttribute(), muMemGetAllocationGranularity(...), ...
            else()
-                list(APPEND GGML_EXTRA_LIBS_PRIVATE CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
+                set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
            endif()
        endif()
    else()
@@ -498,7 +495,7 @@ if (GGML_HIPBLAS)

    if (CXX_IS_HIPCC)
        set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
-        list(APPEND GGML_EXTRA_LIBS_PRIVATE hip::device)
+        set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} hip::device)
    else()
        set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE HIP)
    endif()
@@ -507,7 +504,7 @@ if (GGML_HIPBLAS)
        message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
    endif()

-    list(APPEND GGML_EXTRA_LIBS_PUBLIC hip::host roc::rocblas roc::hipblas)
+    set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} PUBLIC hip::host roc::rocblas roc::hipblas PRIVATE) # trailing PRIVATE: libs appended later must not inherit PUBLIC when expanded into target_link_libraries
 endif()

 if (GGML_SYCL)
@@ -516,8 +513,7 @@ if (GGML_SYCL)
    endif()

    check_cxx_compiler_flag("-fsycl" SUPPORTS_SYCL)
-
-    if (DEFINED ENV{ONEAPI_ROOT})
+    if ( DEFINED ENV{ONEAPI_ROOT})
        message(STATUS "Using oneAPI Release SYCL compiler (icpx).")
    elseif(SUPPORTS_SYCL)
        message(WARNING "Using open-source SYCL compiler (clang++). Didn't detect ENV {ONEAPI_ROOT}.
@@ -555,29 +551,26 @@ if (GGML_SYCL)

    find_package(DNNL)
    message("-- DNNL found:" ${DNNL_FOUND})
-
    if (GGML_SYCL_TARGET STREQUAL "INTEL")
        add_compile_definitions(GGML_SYCL_DNNL=${DNNL_FOUND})
    else()
        add_compile_definitions(GGML_SYCL_DNNL=0)
    endif()
-
-    if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
-        list(APPEND GGML_EXTRA_LIBS_PRIVATE DNNL::dnnl)
-    endif()
-
    if (WIN32)
        find_package(IntelSYCL REQUIRED)
        find_package(MKL REQUIRED)
-        list(APPEND GGML_EXTRA_LIBS_PRIVATE IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
+        set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
    else()
        if (GGML_SYCL_TARGET STREQUAL "INTEL")
-            list(APPEND GGML_EXTRA_LIBS_PRIVATE sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
+            set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
        elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA")
            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
-            list(APPEND GGML_EXTRA_LIBS_PRIVATE sycl pthread m dl onemkl)
+            set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl pthread m dl onemkl)
        endif()
    endif()
+    if (DNNL_FOUND AND GGML_SYCL_TARGET STREQUAL "INTEL") # name is dereferenced by if(); safe even when the variable is empty
+        list(APPEND GGML_EXTRA_LIBS DNNL::dnnl)
+    endif()
 endif()

 if (GGML_RPC)
@@ -586,7 +579,7 @@ if (GGML_RPC)
    list(APPEND GGML_CDEF_PUBLIC GGML_USE_RPC)

    if (WIN32)
-        list(APPEND GGML_EXTRA_LIBS_PRIVATE ws2_32)
+        set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ws2_32)
    endif()

    set(GGML_HEADERS_RPC ../include/ggml-rpc.h)
@@ -619,10 +612,6 @@ if (GGML_VULKAN)
            add_compile_definitions(GGML_VULKAN_MEMORY_DEBUG)
        endif()

-        if (GGML_VULKAN_SHADER_DEBUG_INFO)
-            add_compile_definitions(GGML_VULKAN_SHADER_DEBUG_INFO)
-        endif()
-
        if (GGML_VULKAN_PERF)
            add_compile_definitions(GGML_VULKAN_PERF)
        endif()
@@ -664,8 +653,8 @@ if (GGML_VULKAN)
        set(GGML_HEADERS_VULKAN ${CMAKE_CURRENT_SOURCE_DIR}/../include/ggml-vulkan.h ${_ggml_vk_header})
        set(GGML_SOURCES_VULKAN ggml-vulkan.cpp ${_ggml_vk_source})

-        list(APPEND GGML_EXTRA_LIBS_PRIVATE Vulkan::Vulkan)
-        list(APPEND GGML_EXTRA_INCLUDES     ${CMAKE_CURRENT_BINARY_DIR})
+        set(GGML_EXTRA_LIBS     ${GGML_EXTRA_LIBS} Vulkan::Vulkan)
+        set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${CMAKE_CURRENT_BINARY_DIR})
    else()
        message(WARNING "Vulkan not found")
    endif()
@@ -824,8 +813,8 @@ if (GGML_KOMPUTE)

        list(APPEND GGML_CDEF_PUBLIC GGML_USE_KOMPUTE)

-        list(APPEND GGML_EXTRA_LIBS_PRIVATE kompute)
-        list(APPEND GGML_EXTRA_INCLUDES     ${CMAKE_CURRENT_BINARY_DIR})
+        set(GGML_EXTRA_LIBS     ${GGML_EXTRA_LIBS}     kompute)
+        set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${CMAKE_CURRENT_BINARY_DIR})
    else()
        message(WARNING "Kompute not found")
    endif()
@@ -890,10 +879,9 @@ if (GGML_CANN)
            message(STATUS "CANN: CANN_INCLUDE_DIRS =  ${CANN_INCLUDE_DIRS}")
            message(STATUS "CANN: CANN_LIBRARIES =  ${CANN_LIBRARIES}")

-            list(APPEND GGML_EXTRA_LIBS_PRIVATE ${CANN_LIBRARIES} )
-            list(APPEND GGML_EXTRA_INCLUDES     ${CANN_INCLUDE_DIRS})
-            list(APPEND GGML_EXTRA_LIBDIRS      ${CANN_INSTALL_DIR}/lib64)
-
+            set(GGML_EXTRA_LIBS     ${GGML_EXTRA_LIBS}     ${CANN_LIBRARIES} )
+            set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${CANN_INCLUDE_DIRS})
+            set(GGML_EXTRA_LIBDIRS  ${GGML_EXTRA_LIBDIRS}  ${CANN_INSTALL_DIR}/lib64)
            list(APPEND GGML_CDEF_PUBLIC GGML_USE_CANN)
        endif()
    else()
@@ -1330,13 +1318,13 @@ if (EMSCRIPTEN)
    set_target_properties(ggml PROPERTIES COMPILE_FLAGS "-msimd128")
 endif()

-target_compile_definitions(ggml PUBLIC    ${GGML_CDEF_PUBLIC})
-target_include_directories(ggml PUBLIC  ../include)
+target_compile_definitions(ggml PUBLIC  ${GGML_CDEF_PUBLIC})
+target_include_directories(ggml PUBLIC ../include)
 target_include_directories(ggml PRIVATE . ${GGML_EXTRA_INCLUDES})
-target_link_directories   (ggml PRIVATE   ${GGML_EXTRA_LIBDIRS})
+target_link_directories(ggml PRIVATE ${GGML_EXTRA_LIBDIRS})
 target_compile_features   (ggml PRIVATE c_std_11) # don't bump

-list(APPEND GGML_EXTRA_LIBS_PRIVATE Threads::Threads)
+target_link_libraries(ggml PRIVATE Threads::Threads ${GGML_EXTRA_LIBS})

 find_library(MATH_LIBRARY m)
 if (MATH_LIBRARY)
@@ -1345,10 +1333,6 @@ if (MATH_LIBRARY)
    endif()
 endif()

-list(REMOVE_DUPLICATES GGML_EXTRA_LIBS_PRIVATE)
-list(REMOVE_DUPLICATES GGML_EXTRA_LIBS_PUBLIC)
-target_link_libraries(ggml PRIVATE ${GGML_EXTRA_LIBS_PRIVATE} PUBLIC ${GGML_EXTRA_LIBS_PUBLIC})
-
 if (BUILD_SHARED_LIBS)
    set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
    target_compile_definitions(ggml PRIVATE GGML_SHARED GGML_BUILD)
--- a/ggml/src/ggml-backend.c
+++ b/ggml/src/ggml-backend.c
@@ -827,10 +827,6 @@ GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const
                op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
        case GGML_OP_MUL_MAT:
            return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
-        case GGML_OP_ROPE_BACK:
-            return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
-        case GGML_OP_IM2COL_BACK:
-            return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
        default:
            return true;
    }
--- a/ggml/src/ggml-blas.cpp
+++ b/ggml/src/ggml-blas.cpp
@@ -1,4 +1,3 @@
-#include "ggml-impl.h"
 #include "ggml-blas.h"
 #include "ggml-backend-impl.h"

--- a/ggml/src/ggml-cann.cpp
+++ b/ggml/src/ggml-cann.cpp
@@ -30,7 +30,6 @@
 #include <cstring>
 #include <mutex>

-#include "ggml-impl.h"
 #include "ggml-backend-impl.h"
 #include "ggml-cann/aclnn_ops.h"
 #include "ggml-cann/common.h"
@@ -1221,116 +1220,6 @@ ggml_backend_cann_buffer_type(int32_t device) {
    return &ggml_backend_cann_buffer_types[device];
 }

-/**
- * @brief Retrieves the name associated with a CANN host buffer type.
- *
- * This function returns the descriptive name associated with the specified
- * CANN host buffer type context.
- *
- * @param buft Pointer to the host buffer type context.
- * @return Const pointer to the C-style string containing the name.
- */
-GGML_CALL static const char * ggml_backend_cann_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
-    return "CANN_Host";
-
-    GGML_UNUSED(buft);
-}
-
-/**
- * @brief Retrieves the name associated with a CANN host buffer.
- *
- * This function returns the descriptive name associated with the specified
- * CANN host buffer context.
- *
- * @param buft Pointer to the host buffer context.
- * @return Const pointer to the C-style string containing the name.
- */
-GGML_CALL static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buffer) {
-    return "CANN_Host";
-
-    GGML_UNUSED(buffer);
-}
-
-/**
- * @brief Free resources associated with a CANN host buffer.
- *
- * This function frees the resources associated with a CANN host buffer, including
- * its context.
- *
- * @param buffer The CANN host buffer to free.
- */
-GGML_CALL static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) {
-    ACL_CHECK(aclrtFreeHost(buffer->context));
-}
-
-/**
- * @brief Allocates a new CANN host buffer of the specified size.
- *
- * This function allocates a new CANN host buffer with the given size.
- * @param size Size in bytes of the host buffer to allocate.
- * @return Pointer to the allocated host buffer, or nullptr if allocation fails.
- */
-static void * ggml_cann_host_malloc(size_t size) {
-    if (getenv("GGML_CANN_NO_PINNED") != nullptr) {
-        return nullptr;
-    }
-
-    void * hostPtr = nullptr;
-    aclError err = aclrtMallocHost((void **) &hostPtr, size);
-    if (err != ACL_SUCCESS) {
-
-        GGML_CANN_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
-                           size / 1024.0 / 1024.0, aclGetRecentErrMsg());
-        return nullptr;
-    }
-    return hostPtr;
-}
-
-/**
- * @brief Allocates a new CANN host buffer of the specified type and size.
- *
- * @param buft Pointer to the host buffer type context.
- * @param size Size in bytes of the host buffer to allocate.
- * @return Pointer to the allocated host buffer, or CPU buffer pointer if allocation fails.
- */
-GGML_CALL static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    void * hostPtr = ggml_cann_host_malloc(size);
-
-    if (hostPtr == nullptr) {
-        // fallback to cpu buffer
-        return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
-    }
-
-    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(hostPtr, size);
-    buffer->buft = buft;
-    buffer->iface.get_name = ggml_backend_cann_host_buffer_name;
-    buffer->iface.free_buffer = ggml_backend_cann_host_buffer_free;
-
-    return buffer;
-}
-
-/**
- * @brief Interface for managing CANN host buffer types in the GGML backend.
- *
- * Provides function pointers for allocating, querying properties, and managing
- * memory for CANN buffer types in the GGML backend.
- */
-GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() {
-    static struct ggml_backend_buffer_type ggml_backend_cann_buffer_type_host = {
-        /* .iface    = */ {
-            /* .get_name         = */ ggml_backend_cann_host_buffer_type_name,
-            /* .alloc_buffer     = */ ggml_backend_cann_host_buffer_type_alloc_buffer,
-            /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
-            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
-            /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
-            /* .is_host          = */ ggml_backend_cpu_buffer_type()->iface.is_host,
-        },
-        /* .context  = */ nullptr,
-    };
-
-    return &ggml_backend_cann_buffer_type_host;
-}
-
 /**
 * @brief Computes the forward operation for a given tensor using CANN
 * operations.
@@ -2053,7 +1942,7 @@ GGML_CALL ggml_backend_t ggml_backend_cann_init(int32_t device) {
        GGML_CANN_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
        return nullptr;
    }
-    ggml_cann_set_device(ctx->device);
+
    ggml_backend_t cann_backend =
        new ggml_backend{/* .guid      = */ ggml_backend_cann_guid(),
                         /* .interface = */ ggml_backend_cann_interface,
--- a/ggml/src/ggml-cann/Doxyfile
+++ b/ggml/src/ggml-cann/Doxyfile
@@ -32,7 +32,7 @@ DOXYFILE_ENCODING      = UTF-8
 # title of most generated pages and in a few other places.
 # The default value is: My Project.

-PROJECT_NAME           = "ggml"
+PROJECT_NAME           = "llama.cpp"

 # The PROJECT_NUMBER tag can be used to enter a project or revision number. This
 # could be handy for archiving the generated documentation or if some version
@@ -44,7 +44,7 @@ PROJECT_NUMBER         =
 # for a project that appears at the top of each page and should give viewer a
 # quick idea about the purpose of the project. Keep the description short.

-PROJECT_BRIEF          = "Tensor library for machine learning"
+PROJECT_BRIEF          = "llama inference engine"

 # With the PROJECT_LOGO tag one can specify a logo or an icon that is included
 # in the documentation. The maximum height of the logo should not exceed 55
--- a/ggml/src/ggml-common.h
+++ b/ggml/src/ggml-common.h
@@ -227,25 +227,6 @@ typedef struct {
 } block_q8_0x8;
 static_assert(sizeof(block_q8_0x8) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong q8_0x8 block size/padding");

-//
-// Ternary quantization
-//
-
-// 1.6875 bpw
-typedef struct {
-    uint8_t qs[(QK_K - 4 * QK_K / 64) / 5]; // 5 elements per byte (3^5 = 243 < 256)
-    uint8_t qh[QK_K/64]; // 4 elements per byte
-    ggml_half d;
-} block_tq1_0;
-static_assert(sizeof(block_tq1_0) == sizeof(ggml_half) + QK_K / 64 + (QK_K - 4 * QK_K / 64) / 5, "wrong tq1_0 block size/padding");
-
-// 2.0625 bpw
-typedef struct {
-    uint8_t qs[QK_K/4]; // 2 bits per element
-    ggml_half d;
-} block_tq2_0;
-static_assert(sizeof(block_tq2_0) == sizeof(ggml_half) + QK_K / 4, "wrong tq2_0 block size/padding");
-
 //
 // Super-block quantization structures
 //
@@ -380,7 +361,6 @@ typedef struct {
 } block_iq3_s;
 static_assert(sizeof(block_iq3_s) == sizeof(ggml_half) + 13*(QK_K/32) + IQ3S_N_SCALE, "wrong iq3_s block size/padding");

-// 1.5625 bpw
 typedef struct {
    ggml_half d;
    uint8_t  qs[QK_K/8];
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -1,5 +1,5 @@
 #include "ggml-cuda.h"
-#include "ggml-impl.h"
+#include "ggml.h"
 #include "ggml-backend-impl.h"

 #include "ggml-cuda/common.cuh"
@@ -27,7 +27,6 @@
 #include "ggml-cuda/rope.cuh"
 #include "ggml-cuda/scale.cuh"
 #include "ggml-cuda/softmax.cuh"
-#include "ggml-cuda/sum.cuh"
 #include "ggml-cuda/sumrows.cuh"
 #include "ggml-cuda/tsembd.cuh"
 #include "ggml-cuda/unary.cuh"
@@ -2181,7 +2180,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
            ggml_cuda_dup(ctx, dst);
            break;
        case GGML_OP_ADD:
-        case GGML_OP_ADD1: // TODO: more efficient implementation
            ggml_cuda_op_add(ctx, dst);
            break;
        case GGML_OP_SUB:
@@ -2198,9 +2196,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
            break;
        case GGML_OP_UNARY:
            switch (ggml_get_unary_op(dst)) {
-                case GGML_UNARY_OP_NEG:
-                    ggml_cuda_op_neg(ctx, dst);
-                    break;
                case GGML_UNARY_OP_GELU:
                    ggml_cuda_op_gelu(ctx, dst);
                    break;
@@ -2309,9 +2304,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
        case GGML_OP_POOL_2D:
            ggml_cuda_op_pool2d(ctx, dst);
            break;
-        case GGML_OP_SUM:
-            ggml_cuda_op_sum(ctx, dst);
-            break;
        case GGML_OP_SUM_ROWS:
            ggml_cuda_op_sum_rows(ctx, dst);
            break;
@@ -2552,11 +2544,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
        for (int i = 0; i < cgraph->n_nodes; i++) {
            ggml_tensor * node = cgraph->nodes[i];

-            if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
-                continue;
-            }
-
-            if (node->src[0] && node->src[0]->buffer && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
+            if (node->src[0] && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
                use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
 #ifndef NDEBUG
                GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__);
@@ -2760,7 +2748,6 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
    switch (op->op) {
        case GGML_OP_UNARY:
            switch (ggml_get_unary_op(op)) {
-                case GGML_UNARY_OP_NEG:
                case GGML_UNARY_OP_GELU:
                case GGML_UNARY_OP_SILU:
                case GGML_UNARY_OP_RELU:
@@ -2890,7 +2877,6 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
        case GGML_OP_TRANSPOSE:
        case GGML_OP_NORM:
        case GGML_OP_ADD:
-        case GGML_OP_ADD1:
        case GGML_OP_SUB:
        case GGML_OP_MUL:
        case GGML_OP_DIV:
@@ -2901,18 +2887,14 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
        case GGML_OP_SIN:
        case GGML_OP_COS:
        case GGML_OP_CLAMP:
-            return true;
        case GGML_OP_CONT:
-            return op->src[0]->type != GGML_TYPE_BF16;
        case GGML_OP_DIAG_MASK_INF:
        case GGML_OP_SOFT_MAX:
            return true;
        case GGML_OP_ROPE:
            return ggml_is_contiguous(op->src[0]);
        case GGML_OP_IM2COL:
-            return op->src[0]->type == GGML_TYPE_F16;
        case GGML_OP_POOL_2D:
-        case GGML_OP_SUM:
        case GGML_OP_SUM_ROWS:
        case GGML_OP_ARGSORT:
        case GGML_OP_ACC:
--- a/ggml/src/ggml-cuda/cross-entropy-loss.cu
+++ b/ggml/src/ggml-cuda/cross-entropy-loss.cu
@@ -1,6 +1,6 @@
 #include "common.cuh"
 #include "cross-entropy-loss.cuh"
-#include "sum.cuh"
+#include "sumrows.cuh"

 #include <cmath>
 #include <cstdint>
@@ -102,5 +102,5 @@ void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor *
    cross_entropy_loss_f32<<<blocks_num, blocks_dim, shmem, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);

    // Combine results from individual blocks:
-    sum_f32_cuda(pool, dst_tmp.ptr, dst_d, blocks_num.x, stream);
+    sum_rows_f32_cuda(dst_tmp.ptr, dst_d, blocks_num.x, 1, stream);
 }
--- a/ggml/src/ggml-cuda/fattn.cu
+++ b/ggml/src/ggml-cuda/fattn.cu
@@ -152,7 +152,8 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
    }                                                                       \

 static void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    // Q, K and V are dst->src[0], dst->src[1] and dst->src[2] respectively
    ggml_tensor * Q = dst->src[0];
    ggml_tensor * K = dst->src[1];
    ggml_tensor * V = dst->src[2];

@@ -227,7 +228,8 @@ static void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, gg
    }                                                                       \

 static void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    // Q, K and V are dst->src[0], dst->src[1] and dst->src[2] respectively
    ggml_tensor * Q = dst->src[0];
    ggml_tensor * K = dst->src[1];
    ggml_tensor * V = dst->src[2];

--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -26,11 +26,7 @@ void ggml_cuda_op_mul_mat_q(
    // nrows_dst == nrows of the matrix that the kernel writes into
    const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;

-    // The stream-k decomposition is only faster for recent NVIDIA GPUs.
-    // Also its fixup needs to allocate a temporary buffer in the memory pool.
-    // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
-    const bool use_stream_k = compute_capability >= CC_VOLTA && compute_capability < CC_OFFSET_AMD && src1_ncols == ne11;
-    const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst, use_stream_k};
+    const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst};

    switch (src0->type) {
        case GGML_TYPE_Q4_0:
--- a/ggml/src/ggml-cuda/mmq.cuh
+++ b/ggml/src/ggml-cuda/mmq.cuh
@@ -2742,7 +2742,6 @@ struct mmq_args {
    int64_t ne00; int64_t ne01; int64_t stride01;
    int64_t ne10; int64_t ne11; int64_t stride11;
    int64_t ne0;
-    bool use_stream_k;
 };

 template<ggml_type type>
@@ -2778,7 +2777,8 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
    const int ntx = (args.ne11 + mmq_x - 1) / mmq_x;
    const dim3 block_nums_xy_tiling(nty, ntx, 1);

-    if (!args.use_stream_k) {
+    const bool use_stream_k = cc >= CC_VOLTA && cc < CC_OFFSET_AMD;
+    if (!use_stream_k) {
        if (args.ne01 % mmq_y == 0) {
            constexpr bool need_check = false;
            mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_xy_tiling, block_dims, shmem, stream>>>
--- a/ggml/src/ggml-cuda/sum.cu
+++ b/ggml/src/ggml-cuda/sum.cu
@@ -1,43 +0,0 @@
-#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
-// On Windows CUB uses libraries with variables called CC_PASCAL which conflict with the define in common.cuh.
-// For this reason CUB must be included BEFORE anything else.
-#include <cub/cub.cuh>
-using namespace cub;
-#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
-
-#include "sumrows.cuh"
-#include "sum.cuh"
-
-#include <cstdint>
-
-void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream) {
-#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
-    size_t tmp_size = 0;
-    DeviceReduce::Sum(nullptr,       tmp_size, x, dst, ne, stream);
-    ggml_cuda_pool_alloc<uint8_t> tmp_alloc(pool, tmp_size);
-    DeviceReduce::Sum(tmp_alloc.ptr, tmp_size, x, dst, ne, stream);
-#else
-    // Use (inefficient) sum_rows implementation as a fallback.
-    // For AMD there is rocPRIM which could be used as a drop-in replacement via hipcub but this would require C++11 -> C++14.
-    sum_rows_f32_cuda(x, dst, ne, 1, stream);
-    GGML_UNUSED(pool);
-#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
-}
-
-void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(ggml_is_contiguous(src0));
-
-    const float * src0_d = (const float *) src0->data;
-    float * dst_d = (float *) dst->data;
-
-    const int64_t ne = ggml_nelements(src0);
-
-    ggml_cuda_pool & pool = ctx.pool();
-    cudaStream_t stream = ctx.stream();
-
-    sum_f32_cuda(pool, src0_d, dst_d, ne, stream);
-}
--- a/ggml/src/ggml-cuda/sum.cuh
+++ b/ggml/src/ggml-cuda/sum.cuh
@@ -1,5 +0,0 @@
-#include "common.cuh"
-
-void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream);
-
-void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--- a/ggml/src/ggml-cuda/unary.cu
+++ b/ggml/src/ggml-cuda/unary.cu
@@ -1,15 +1,5 @@
 #include "unary.cuh"

-static __global__ void neg_f32(const float * x, float * dst, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= k) {
-        return;
-    }
-
-    dst[i] = -x[i];
-}
-
 static __global__ void gelu_f32(const float * x, float * dst, const int k) {
    const float GELU_COEF_A    = 0.044715f;
    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
@@ -129,11 +119,6 @@ static __global__ void cos_f32(const float * x, float * dst, const int k) {
    dst[i] = cosf(x[i]);
 }

-static void neg_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_NEG_BLOCK_SIZE - 1) / CUDA_NEG_BLOCK_SIZE;
-    neg_f32<<<num_blocks, CUDA_NEG_BLOCK_SIZE, 0, stream>>>(x, dst, k);
-}
-
 static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
    gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
@@ -199,20 +184,6 @@ static void cos_f32_cuda(const float * x, float * dst, const int k, cudaStream_t
    cos_f32<<<num_blocks, CUDA_COS_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }

-void ggml_cuda_op_neg(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const float * src0_d = (const float *)src0->data;
-    float * dst_d = (float *)dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(ggml_is_contiguous(src0));
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    neg_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
-}
-
 void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const float * src0_d = (const float *)src0->data;
--- a/ggml/src/ggml-cuda/unary.cuh
+++ b/ggml/src/ggml-cuda/unary.cuh
@@ -1,6 +1,5 @@
 #include "common.cuh"

-#define CUDA_NEG_BLOCK_SIZE 256
 #define CUDA_GELU_BLOCK_SIZE 256
 #define CUDA_SILU_BLOCK_SIZE 256
 #define CUDA_TANH_BLOCK_SIZE 256
@@ -13,8 +12,6 @@
 #define CUDA_SIN_BLOCK_SIZE 256
 #define CUDA_COS_BLOCK_SIZE 256

-void ggml_cuda_op_neg(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
-
 void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

 void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--- a/ggml/src/ggml-cuda/vendors/musa.h
+++ b/ggml/src/ggml-cuda/vendors/musa.h
@@ -130,3 +130,49 @@
 #define cudaKernelNodeParams musaKernelNodeParams
 #define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed
 #define cudaStreamEndCapture musaStreamEndCapture
+
+// XXX: Clang builtins mapping
+// Redirect the __vsub4 / __vcmpeq4 / __vcmpne4 names to the wrappers defined below.
+#define __vsub4   __vsub4_musa
+#define __vcmpeq4 __vcmpeq4_musa
+#define __vcmpne4 __vcmpne4_musa
+
+// Fallback: when the compiler does not define __has_builtin, make it evaluate to 0.
+#ifndef __has_builtin
+    #define __has_builtin(x) 0
+#endif
+
+// Four uint8_t lanes; used below to view a 32-bit word byte by byte.
+typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
+
+// Stand-in for __vsub4, implemented by forwarding to __vsubss4.
+// NOTE(review): __vsubss4 is presumably the saturating variant - confirm this is acceptable for all callers.
+static __device__ __forceinline__ int __vsub4_musa(const int a, const int b) {
+    return __vsubss4(a, b);
+}
+
+// Stand-in for __vcmpeq4: each result byte is 0xff where the matching bytes of a and b are equal, 0x00 otherwise.
+static __device__ __forceinline__ unsigned int __vcmpeq4_musa(unsigned int a, unsigned int b) {
+    const uint8x4_t& lhs = reinterpret_cast<const uint8x4_t&>(a);
+    const uint8x4_t& rhs = reinterpret_cast<const uint8x4_t&>(b);
+    unsigned int mask;
+    uint8x4_t& out = reinterpret_cast<uint8x4_t&>(mask);
+#pragma unroll
+    for (int lane = 0; lane < 4; ++lane) {
+        out[lane] = (lhs[lane] == rhs[lane]) ? 0xff : 0x00;
+    }
+    return mask;
+}
+
+// Stand-in for __vcmpne4: each result byte is 0xff where the matching bytes of a and b differ, 0x00 otherwise.
+static __device__ __forceinline__ unsigned int __vcmpne4_musa(unsigned int a, unsigned int b) {
+    const uint8x4_t& lhs = reinterpret_cast<const uint8x4_t&>(a);
+    const uint8x4_t& rhs = reinterpret_cast<const uint8x4_t&>(b);
+    unsigned int mask;
+    uint8x4_t& out = reinterpret_cast<uint8x4_t&>(mask);
+#pragma unroll
+    for (int lane = 0; lane < 4; ++lane) {
+        out[lane] = (lhs[lane] != rhs[lane]) ? 0xff : 0x00;
+    }
+    return mask;
+}
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -175,7 +175,7 @@ typedef __fp16 ggml_fp16_internal_t;

 // 32-bit ARM compatibility

-// vaddlvq_s16
+// vaddvq_s16
 // vpaddq_s16
 // vpaddq_s32
 // vaddvq_s32
@@ -185,9 +185,12 @@ typedef __fp16 ggml_fp16_internal_t;
 // vzip1_u8
 // vzip2_u8

-inline static int32_t vaddlvq_s16(int16x8_t v) {
-    int32x4_t v0 = vreinterpretq_s32_s64(vpaddlq_s32(vpaddlq_s16(v)));
-    return vgetq_lane_s32(v0, 0) + vgetq_lane_s32(v0, 2);
+inline static int32_t vaddvq_s16(int16x8_t v) {
+    return
+        (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
+        (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
+        (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
+        (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
 }

 inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
@@ -629,16 +632,8 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
 #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
 #endif

-enum ggml_cgraph_eval_order {
-    GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
-    GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
-    GGML_CGRAPH_EVAL_ORDER_COUNT
-};
-
 // bitset

-typedef uint32_t ggml_bitset_t;
-
 static_assert(sizeof(ggml_bitset_t) == 4, "bitset_t constants must be updated");
 #define BITSET_SHR 5 // log2(sizeof(ggml_bitset_t)*8)
 #define BITSET_MASK (sizeof(ggml_bitset_t)*8 - 1)
@@ -664,12 +659,6 @@ static inline void ggml_bitset_clear(ggml_bitset_t * bitset, size_t i) {
 #define GGML_HASHSET_FULL ((size_t)-1)
 #define GGML_HASHSET_ALREADY_EXISTS ((size_t)-2)

-struct ggml_hash_set {
-    size_t size;
-    ggml_bitset_t * used;       // whether or not the keys are in use i.e. set
-    struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i)
-};
-
 struct ggml_hash_set ggml_hash_set_new(size_t size);
 void                 ggml_hash_set_free(struct ggml_hash_set * hash_set);

@@ -759,24 +748,6 @@ static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct g
    GGML_ABORT("fatal error");
 }

-// computation graph
-
-struct ggml_cgraph {
-    int size;
-    int n_nodes;
-    int n_leafs;
-
-    struct ggml_tensor ** nodes;
-    struct ggml_tensor ** grads;
-    struct ggml_tensor ** leafs;
-
-    struct ggml_hash_set visited_hash_set;
-
-    enum ggml_cgraph_eval_order order;
-};
-
-struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
-
 #ifdef __cplusplus
 }
 #endif
--- a/ggml/src/ggml-kompute.cpp
+++ b/ggml/src/ggml-kompute.cpp
@@ -1,4 +1,4 @@
-#include "ggml-impl.h"
+#include "ggml.h"
 #include "ggml-backend.h"
 #include "ggml-backend-impl.h"
 #include "ggml-kompute.h"
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal.m
@@ -1,7 +1,7 @@
 #import "ggml-metal.h"

-#import "ggml-impl.h"
 #import "ggml-backend-impl.h"
+#import "ggml.h"

 #import <Foundation/Foundation.h>

@@ -17,8 +17,8 @@
 #define GGML_METAL_LOG_WARN(...)
 #define GGML_METAL_LOG_ERROR(...)
 #else
-#define GGML_METAL_LOG_INFO(...)  ggml_metal_log(GGML_LOG_LEVEL_INFO,  __VA_ARGS__)
-#define GGML_METAL_LOG_WARN(...)  ggml_metal_log(GGML_LOG_LEVEL_WARN,  __VA_ARGS__)
+#define GGML_METAL_LOG_INFO(...)  ggml_metal_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
+#define GGML_METAL_LOG_WARN(...)  ggml_metal_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
 #define GGML_METAL_LOG_ERROR(...) ggml_metal_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
 #endif

@@ -799,9 +799,8 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_context * ctx
            return ctx->support_simdgroup_reduction;
        case GGML_OP_NORM:
        case GGML_OP_ROPE:
-            return true;
        case GGML_OP_IM2COL:
-            return op->src[0]->type == GGML_TYPE_F16;
+            return true;
        case GGML_OP_POOL_1D:
        case GGML_OP_POOL_2D:
            return false;
@@ -882,7 +881,7 @@ static enum ggml_status ggml_metal_graph_compute(
    // create multiple command buffers and enqueue them
    // then, we encode the graph into the command buffers in parallel

-    const int n_nodes = gf->n_nodes;
+    const int n_nodes  = gf->n_nodes;
    const int n_cb = ctx->n_cb;
    const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb;

@@ -3039,7 +3038,8 @@ static enum ggml_status ggml_metal_graph_compute(
        if (status != MTLCommandBufferStatusCompleted) {
            GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
            if (status == MTLCommandBufferStatusError) {
-                GGML_METAL_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]);
+                NSString * error_code = [command_buffer error].localizedDescription;
+                GGML_METAL_LOG_INFO("error: %s\n", [error_code UTF8String]);
            }

            return GGML_STATUS_FAILED;
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
--- a/ggml/src/ggml-quants.h
+++ b/ggml/src/ggml-quants.h
@@ -26,9 +26,6 @@ void quantize_row_q5_K_ref(const float * GGML_RESTRICT x, block_q5_K * GGML_REST
 void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
 void quantize_row_q8_K_ref(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);

-void quantize_row_tq1_0_ref(const float * GGML_RESTRICT x, block_tq1_0 * GGML_RESTRICT y, int64_t k);
-void quantize_row_tq2_0_ref(const float * GGML_RESTRICT x, block_tq2_0 * GGML_RESTRICT y, int64_t k);
-
 void quantize_row_iq3_xxs_ref(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
 void quantize_row_iq4_nl_ref (const float * GGML_RESTRICT x, block_iq4_nl  * GGML_RESTRICT y, int64_t k);
 void quantize_row_iq4_xs_ref (const float * GGML_RESTRICT x, block_iq4_xs  * GGML_RESTRICT y, int64_t k);
@@ -49,9 +46,6 @@ void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
 void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);

-void quantize_row_tq1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-
 void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
@@ -73,9 +67,6 @@ void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRI
 void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);

-void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-
 void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void dequantize_row_iq2_xs (const block_iq2_xs  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void dequantize_row_iq2_s  (const block_iq2_s   * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
@@ -99,9 +90,6 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);

-void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-
 void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 void ggml_vec_dot_iq2_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
@@ -123,9 +111,6 @@ size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT ds
 size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 size_t quantize_iq3_s  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);

-size_t quantize_tq1_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_tq2_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-
 size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
--- a/ggml/src/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc.cpp
@@ -1,5 +1,5 @@
 #include "ggml-rpc.h"
-#include "ggml-impl.h"
+#include "ggml.h"
 #include "ggml-backend-impl.h"

 #include <cinttypes>
@@ -883,17 +883,18 @@ ggml_tensor * rpc_server::deserialize_tensor(struct ggml_context * ctx, const rp
    }
    result->buffer = reinterpret_cast<ggml_backend_buffer_t>(tensor->buffer);
    if (result->buffer && buffers.find(result->buffer) == buffers.end()) {
-        result->buffer = nullptr;
+        return nullptr;
    }

+    // result->buffer may still be null here (the check above only rejects non-null, unknown buffers)
    if (result->buffer) {
        // require that the tensor data does not go beyond the buffer end
        uint64_t tensor_size = (uint64_t) ggml_nbytes(result);
        uint64_t buffer_start = (uint64_t) ggml_backend_buffer_get_base(result->buffer);
        uint64_t buffer_size = (uint64_t) ggml_backend_buffer_get_size(result->buffer);
        GGML_ASSERT(tensor->data + tensor_size >= tensor->data); // check for overflow
        GGML_ASSERT(tensor->data >= buffer_start && tensor->data + tensor_size <= buffer_start + buffer_size);
    }

    result->op = (ggml_op) tensor->op;
    for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) {
@@ -1062,7 +1063,8 @@ bool rpc_server::graph_compute(const std::vector<uint8_t> & input, std::vector<u
    const rpc_tensor * tensors = (const rpc_tensor *)(input.data() + sizeof(n_nodes) + n_nodes*sizeof(uint64_t) + sizeof(n_tensors));
    GGML_PRINT_DEBUG("[%s] n_nodes: %u, n_tensors: %u\n", __func__, n_nodes, n_tensors);

+    // must be recomputed on every call (not static): n_nodes and n_tensors depend on this call's input
    size_t buf_size = ggml_tensor_overhead()*(n_nodes + n_tensors) + ggml_graph_overhead_custom(n_nodes, false);
    struct ggml_init_params params = {
        /*.mem_size   =*/ buf_size,
        /*.mem_buffer =*/ NULL,
--- a/Show More
+++ b/Show More