quantize : --pure option for disabling k-quant mixtures

ggml-quants : fix Zig and Swift builds + quantize tool
ggml-ci
2026-04-16 16:27:32 +03:00 · 2023-10-28 16:37:03 -04:00 · 2023-10-28 17:24:10 +03:00 · 2023-10-28 17:14:49 +03:00
31 changed files with 3245 additions and 2934 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -15,7 +15,6 @@
 .DS_Store
 .build/
 .cache/
-.ccls-cache/
 .direnv/
 .envrc
 .swiftpm
--- a/6
+++ b/6
@@ -541,10 +541,10 @@ OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o
 llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

-COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
-COMMON_DEPS   = common.o sampling.o grammar-parser.o
+COMMON_H_DEPS = common/common.h common/sampling.h build-info.h common/log.h
+COMMON_DEPS   = $(COMMON_H_DEPS) common.o sampling.o grammar-parser.o

-common.o: common/common.cpp build-info.h $(COMMON_H_DEPS)
+common.o: common/common.cpp $(COMMON_H_DEPS)
 	$(CXX) $(CXXFLAGS) -c $< -o $@

 sampling.o: common/sampling.cpp $(COMMON_H_DEPS)
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -103,24 +103,9 @@ void process_escapes(std::string& input) {
 }

 bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
-    bool result = true;
-    try {
-        if (!gpt_params_parse_ex(argc, argv, params)) {
-            gpt_print_usage(argc, argv, gpt_params());
-            exit(0);
-        }
-    }
-    catch (const std::invalid_argument& ex) {
-        fprintf(stderr, ex.what());
-        gpt_print_usage(argc, argv, gpt_params());
-        exit(1);
-    }
-    return result;
-}
-
-bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
    bool invalid_param = false;
    std::string arg;
+    gpt_params default_params;
    const std::string arg_prefix = "--";
    llama_sampling_params & sparams = params.sparams;

@@ -233,12 +218,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                break;
            }
            sparams.top_p = std::stof(argv[i]);
-        } else if (arg == "--min-p") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            sparams.min_p = std::stof(argv[i]);
        } else if (arg == "--temp") {
            if (++i >= argc) {
                invalid_param = true;
@@ -569,8 +548,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                break;
            }
        } else if (arg == "-h" || arg == "--help") {
-            return false;
-
+            gpt_print_usage(argc, argv, default_params);
+#ifndef LOG_DISABLE_LOGS
+            log_print_usage();
+#endif // LOG_DISABLE_LOGS
+            exit(0);
        } else if (arg == "--random-prompt") {
            params.random_prompt = true;
        } else if (arg == "--in-prefix-bos") {
@@ -629,17 +611,22 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
        // End of Parse args for logging parameters
 #endif // LOG_DISABLE_LOGS
        } else {
-            throw std::invalid_argument("error: unknown argument: " + arg);
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            gpt_print_usage(argc, argv, default_params);
+            exit(1);
        }
    }
    if (invalid_param) {
-        throw std::invalid_argument("error: invalid parameter for argument: " + arg);
+        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+        gpt_print_usage(argc, argv, default_params);
+        exit(1);
    }
    if (params.prompt_cache_all &&
            (params.interactive || params.interactive_first ||
             params.instruct)) {
-
-        throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
+        fprintf(stderr, "error: --prompt-cache-all not supported in interactive mode yet\n");
+        gpt_print_usage(argc, argv, default_params);
+        exit(1);
    }

    if (params.escape) {
@@ -658,7 +645,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
 void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    const llama_sampling_params & sparams = params.sparams;

-    printf("\n");
    printf("usage: %s [options]\n", argv[0]);
    printf("\n");
    printf("options:\n");
@@ -693,7 +679,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
    printf("  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
    printf("  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
-    printf("  --min-p N             min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
    printf("  --tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z);
    printf("  --typical N           locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p);
    printf("  --repeat-last-n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.penalty_last_n);
@@ -770,9 +755,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  -ld LOGDIR, --logdir LOGDIR\n");
    printf("                        path under which to save YAML logs (no logging if unset)\n");
    printf("\n");
-#ifndef LOG_DISABLE_LOGS
-    log_print_usage();
-#endif // LOG_DISABLE_LOGS
 }

 std::string get_system_info(const gpt_params & params) {
@@ -907,7 +889,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par

        std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
        llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
-        llama_kv_cache_clear(lctx);
+        llama_kv_cache_tokens_rm(lctx, -1, -1);
        llama_reset_timings(lctx);
    }

@@ -1293,7 +1275,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency());
    fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
    fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
-    fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
    fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
    fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
 }
--- a/common/common.h
+++ b/common/common.h
@@ -110,8 +110,6 @@ struct gpt_params {
    std::string image = ""; // path to an image file
 };

-bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
-
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params);

 void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
--- a/common/log.h
+++ b/common/log.h
@@ -97,56 +97,38 @@
    #define LOG_TEE_TARGET stderr
 #endif

-// Utility for synchronizing log configuration state
-//  since std::optional was introduced only in c++17
-enum LogTriState
-{
-    LogTriStateSame,
-    LogTriStateFalse,
-    LogTriStateTrue
-};
-
+// NOTE: currently disabled as it produces too many log files
 // Utility to obtain "pid" like unique process id and use it when creating log files.
-inline std::string log_get_pid()
-{
-   static std::string pid;
-   if (pid.empty())
-   {
-       // std::this_thread::get_id() is the most portable way of obtaining a "process id"
-       //  it's not the same as "pid" but is unique enough to solve multiple instances
-       //  trying to write to the same log.
-       std::stringstream ss;
-       ss << std::this_thread::get_id();
-       pid = ss.str();
-   }
-
-   return pid;
-}
+//inline std::string log_get_pid()
+//{
+//    static std::string pid;
+//    if (pid.empty())
+//    {
+//        // std::this_thread::get_id() is the most portable way of obtaining a "process id"
+//        //  it's not the same as "pid" but is unique enough to solve multiple instances
+//        //  trying to write to the same log.
+//        std::stringstream ss;
+//        ss << std::this_thread::get_id();
+//        pid = ss.str();
+//    }
+//
+//    return pid;
+//}

 // Utility function for generating log file names with unique id based on thread id.
 //  invocation with log_filename_generator( "llama", "log" ) creates a string "llama.<number>.log"
 //  where the number is a runtime id of the current thread.

-#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(LogTriStateSame, log_file_basename, log_file_extension)
+#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(log_file_basename, log_file_extension)

 // INTERNAL, DO NOT USE
-inline std::string log_filename_generator_impl(LogTriState multilog, const std::string & log_file_basename, const std::string & log_file_extension)
+inline std::string log_filename_generator_impl(const std::string & log_file_basename, const std::string & log_file_extension)
 {
-    static bool _multilog = false;
-
-    if (multilog != LogTriStateSame)
-    {
-        _multilog = multilog == LogTriStateTrue;
-    }
-
    std::stringstream buf;

    buf << log_file_basename;
-    if (_multilog)
-    {
-        buf << ".";
-        buf << log_get_pid();
-    }
+    //buf << ".";
+    //buf << log_get_pid();
    buf << ".";
    buf << log_file_extension;

@@ -231,6 +213,15 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
    #define LOG_TEE_FLF_VAL ,""
 #endif

+// Utility for synchronizing log configuration state
+//  since std::optional was introduced only in c++17
+enum LogTriState
+{
+    LogTriStateSame,
+    LogTriStateFalse,
+    LogTriStateTrue
+};
+
 // INTERNAL, DO NOT USE
 //  USE LOG() INSTEAD
 //
@@ -324,23 +315,16 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
 #endif

 // INTERNAL, DO NOT USE
-inline FILE *log_handler1_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr)
+inline FILE *log_handler1_impl(bool change = false, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr)
 {
-    static bool _initialized = false;
-    static bool _append = false;
-    static bool _disabled = filename.empty() && target == nullptr;
+    static bool _initialized{false};
+    static bool _disabled{(filename.empty() && target == nullptr)};
    static std::string log_current_filename{filename};
    static FILE *log_current_target{target};
    static FILE *logfile = nullptr;

    if (change)
    {
-        if (append != LogTriStateSame)
-        {
-            _append = append == LogTriStateTrue;
-            return logfile;
-        }
-
        if (disable == LogTriStateTrue)
        {
            // Disable primary target
@@ -393,7 +377,7 @@ inline FILE *log_handler1_impl(bool change = false, LogTriState append = LogTriS
            }
        }

-        logfile = fopen(filename.c_str(), _append ? "a" : "w");
+        logfile = fopen(filename.c_str(), "w");
    }

    if (!logfile)
@@ -414,9 +398,9 @@ inline FILE *log_handler1_impl(bool change = false, LogTriState append = LogTriS
 }

 // INTERNAL, DO NOT USE
-inline FILE *log_handler2_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME)
+inline FILE *log_handler2_impl(bool change = false, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME)
 {
-    return log_handler1_impl(change, append, disable, filename, target);
+    return log_handler1_impl(change, disable, filename, target);
 }

 // Disables logs entirely at runtime.
@@ -427,7 +411,7 @@ inline FILE *log_handler2_impl(bool change = false, LogTriState append = LogTriS
 // INTERNAL, DO NOT USE
 inline FILE *log_disable_impl()
 {
-    return log_handler1_impl(true, LogTriStateSame, LogTriStateTrue);
+    return log_handler1_impl(true, LogTriStateTrue);
 }

 // Enables logs at runtime.
@@ -436,31 +420,19 @@ inline FILE *log_disable_impl()
 // INTERNAL, DO NOT USE
 inline FILE *log_enable_impl()
 {
-    return log_handler1_impl(true, LogTriStateSame, LogTriStateFalse);
+    return log_handler1_impl(true, LogTriStateFalse);
 }

 // Sets target fir logs, either by a file name or FILE* pointer (stdout, stderr, or any valid FILE*)
 #define log_set_target(target) log_set_target_impl(target)

 // INTERNAL, DO NOT USE
-inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, LogTriStateSame, filename); }
-inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, LogTriStateSame, target); }
+inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, filename); }
+inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, target); }

 // INTERNAL, DO NOT USE
 inline FILE *log_handler() { return log_handler1_impl(); }

-// Enable or disable creating separate log files for each run.
-//  can ONLY be invoked BEFORE first log use.
-#define log_multilog(enable) log_filename_generator_impl((enable) ? LogTriStateTrue : LogTriStateFalse, "", "")
-// Enable or disable append mode for log file.
-//  can ONLY be invoked BEFORE first log use.
-#define log_append(enable) log_append_impl(enable)
-// INTERNAL, DO NOT USE
-inline FILE *log_append_impl(bool enable)
-{
-    return log_handler1_impl(true, enable ? LogTriStateTrue : LogTriStateFalse, LogTriStateSame);
-}
-
 inline void log_test()
 {
    log_disable();
@@ -522,18 +494,6 @@ inline bool log_param_single_parse(const std::string & param)
        return true;
    }

-    if (param == "--log-new")
-    {
-        log_multilog(true);
-        return true;
-    }
-
-    if (param == "--log-append")
-    {
-        log_append(true);
-        return true;
-    }
-
    return false;
 }

@@ -563,9 +523,7 @@ inline void log_print_usage()
    printf("  --log-disable         Disable trace logs\n");
    printf("  --log-enable          Enable trace logs\n");
    printf("  --log-file            Specify a log filename (without extension)\n");
-    printf("  --log-new             Create a separate new log file on start. "
-                                   "Each log file will have unique name: \"<name>.<ID>.log\"\n");
-    printf("  --log-append          Don't truncate the old log file.\n");
+    printf("                        Log file will be tagged with unique ID and written as \"<name>.<ID>.log\"\n"); /*  */
 }

 #define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv)
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -39,7 +39,6 @@ void llama_sampling_free(struct llama_sampling_context * ctx) {
 void llama_sampling_reset(llama_sampling_context * ctx) {
    if (ctx->grammar != NULL) {
        llama_grammar_free(ctx->grammar);
-        ctx->grammar = NULL;
    }

    if (!ctx->parsed_grammar.rules.empty()) {
@@ -90,10 +89,10 @@ std::string llama_sampling_print(const llama_sampling_params & params) {

    snprintf(result, sizeof(result),
            "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
-            "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
+            "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, typical_p = %.3f, temp = %.3f\n"
            "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
            params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present,
-            params.top_k, params.tfs_z, params.top_p, params.min_p, params.typical_p, params.temp,
+            params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp,
            params.mirostat, params.mirostat_eta, params.mirostat_tau);

    return std::string(result);
@@ -111,7 +110,6 @@ llama_token llama_sampling_sample(
    const float   temp            = params.temp;
    const int32_t top_k           = params.top_k <= 0 ? n_vocab : params.top_k;
    const float   top_p           = params.top_p;
-    const float   min_p           = params.min_p;
    const float   tfs_z           = params.tfs_z;
    const float   typical_p       = params.typical_p;
    const int32_t penalty_last_n  = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
@@ -192,7 +190,6 @@ llama_token llama_sampling_sample(
            llama_sample_tail_free(ctx_main, &cur_p, tfs_z,     min_keep);
            llama_sample_typical  (ctx_main, &cur_p, typical_p, min_keep);
            llama_sample_top_p    (ctx_main, &cur_p, top_p,     min_keep);
-            llama_sample_min_p    (ctx_main, &cur_p, min_p,     min_keep);
            llama_sample_temp     (ctx_main, &cur_p, temp);

            id = llama_sample_token(ctx_main, &cur_p);
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -14,7 +14,6 @@ typedef struct llama_sampling_params {
    int32_t n_probs           = 0;     // if greater than 0, output the probabilities of top n_probs tokens.
    int32_t top_k             = 40;    // <= 0 to use vocab size
    float   top_p             = 0.95f; // 1.0 = disabled
-    float   min_p             = 0.05f; // 0.0 = disabled
    float   tfs_z             = 1.00f; // 1.0 = disabled
    float   typical_p         = 1.00f; // 1.0 = disabled
    float   temp              = 0.80f; // 1.0 = disabled
--- a/common/train.cpp
+++ b/common/train.cpp
@@ -1045,7 +1045,6 @@ struct train_params_common get_default_train_params_common() {
    params.n_batch    =    8;
    params.n_gradient_accumulation = 1;
    params.n_epochs   = -1;
-    params.n_gpu_layers = 0;

    params.custom_n_ctx = false;

@@ -1081,7 +1080,6 @@ struct train_params_common get_default_train_params_common() {
    params.adam_beta2          = 0.999f;
    params.adam_gclip          = 1.0f;
    params.adam_eps_f          = 0.0f;
-
    return params;
 }

--- a/common/train.h
+++ b/common/train.h
@@ -44,7 +44,6 @@ struct train_params_common {
    int n_batch;
    int n_gradient_accumulation;
    int n_epochs;
-    int n_gpu_layers;

    bool custom_n_ctx;

--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -185,7 +185,7 @@ int main(int argc, char ** argv) {

                const auto t_pp_start = ggml_time_us();

-                llama_kv_cache_clear(ctx);
+                llama_kv_cache_tokens_rm(ctx, -1, -1);

                if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
                    LOG_TEE("%s: llama_decode() failed\n", __func__);
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -652,7 +652,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
    GGML_ASSERT(tokens_input->type == GGML_TYPE_I32);

    auto add_to_f32 = [] (struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
-        if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16) {
+        if (ggml_is_quantized(a->type)) {
            return ggml_add_cast(ctx, a, b, GGML_TYPE_F32);
        } else if (a->type == GGML_TYPE_F32) {
            return ggml_add(ctx, a, b);
@@ -1459,17 +1459,6 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par
            }
            params->n_rank_w3 = std::stoi(argv[i]);
            params->custom_n_rank_w3 = true;
-        } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
-            params->common.n_gpu_layers = std::stoi(argv[i]);
-#else
-            fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
-            fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
-#endif
        } else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            train_print_usage(argc, argv, &default_params);
@@ -1556,7 +1545,6 @@ int main(int argc, char ** argv) {
    srand(params.common.seed);

    struct llama_model_params llama_mparams = llama_model_default_params();
-    llama_mparams.n_gpu_layers = params.common.n_gpu_layers;
    llama_mparams.vocab_only = false;

    printf("%s: model base = '%s'\n", __func__, params.fn_model_base);
--- a/examples/finetune/finetune.sh
+++ b/examples/finetune/finetune.sh
@@ -1,34 +0,0 @@
-#!/bin/bash
-cd `dirname $0`
-cd ../..
-
-EXE="./finetune"
-
-if [[ ! $LLAMA_MODEL_DIR ]]; then LLAMA_MODEL_DIR="./models"; fi
-if [[ ! $LLAMA_TRAINING_DIR ]]; then LLAMA_TRAINING_DIR="."; fi
-
-# MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2-q8_0.gguf" # This is the model the readme uses.
-MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2.gguf" # An f16 model. Note in this case with "-g", you get an f32-format .BIN file that isn't yet supported if you use it with "main --lora" with GPU inferencing.
-
-while getopts "dg" opt; do
-  case $opt in
-    d)
-      DEBUGGER="gdb --args"
-      ;;
-    g)
-      EXE="./build/bin/Release/finetune"
-      GPUARG="--gpu-layers 25"
-      ;;
-  esac
-done
-
-$DEBUGGER $EXE \
-        --model-base $MODEL \
-        $GPUARG \
-        --checkpoint-in  chk-ol3b-shakespeare-LATEST.gguf \
-        --checkpoint-out chk-ol3b-shakespeare-ITERATION.gguf \
-        --lora-out lora-ol3b-shakespeare-ITERATION.bin \
-        --train-data "$LLAMA_TRAINING_DIR\shakespeare.txt" \
-        --save-every 10 \
-        --threads 10 --adam-iter 30 --batch 4 --ctx 64 \
-        --use-checkpointing
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -1037,7 +1037,7 @@ int main(int argc, char ** argv) {

        test t(inst, lmodel, ctx);

-        llama_kv_cache_clear(ctx);
+        llama_kv_cache_tokens_rm(ctx, -1, -1);

        // warmup run
        if (t.n_prompt > 0) {
@@ -1048,7 +1048,7 @@ int main(int argc, char ** argv) {
        }

        for (int i = 0; i < params.reps; i++) {
-            llama_kv_cache_clear(ctx);
+            llama_kv_cache_tokens_rm(ctx, -1, -1);

            uint64_t t_start = get_time_ns();
            if (t.n_prompt > 0) {
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -208,14 +208,6 @@ Top-p sampling, also known as nucleus sampling, is another text generation metho

 Example usage: `--top-p 0.95`

-### Min P Sampling
-
-   `--min-p N`: Sets a minimum base probability threshold for token selection (default: 0.05).
-
-The Min-P sampling method was designed as an alternative to Top-P, and aims to ensure a balance of quality and variety. The parameter *p* represents the minimum probability for a token to be considered, relative to the probability of the most likely token. For example, with *p*=0.05 and the most likely token having a probability of 0.9, logits with a value less than 0.045 are filtered out.
-
-Example usage: `--min-p 0.05`
-
 ### Tail Free Sampling (TFS)

 -   `--tfs N`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled).
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -298,7 +298,7 @@ int main(int argc, char ** argv) {
        }

        // remove any "future" tokens that we might have inherited from the previous session
-        llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1);
+        llama_kv_cache_tokens_rm(ctx, n_matching_session_tokens, -1);
    }

    LOGLN(
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -210,7 +210,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
        const auto t_start = std::chrono::high_resolution_clock::now();

        // clear the KV cache
-        llama_kv_cache_clear(ctx);
+        llama_kv_cache_tokens_rm(ctx, -1, -1);

        for (int j = 0; j < num_batches; ++j) {
            const int batch_start = start + j * n_batch;
@@ -339,7 +339,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
        const auto t_start = std::chrono::high_resolution_clock::now();

        // clear the KV cache
-        llama_kv_cache_clear(ctx);
+        llama_kv_cache_tokens_rm(ctx, -1, -1);

        for (int j = 0; j < num_batches; ++j) {
            const int batch_start = start + j * n_batch;
@@ -573,7 +573,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
        }

        // clear the KV cache
-        llama_kv_cache_clear(ctx);
+        llama_kv_cache_tokens_rm(ctx, -1, -1);

        auto logits = hellaswag_evaluate_tokens(ctx, query_embd, 0, params.n_batch, n_vocab);
        if (logits.empty()) {
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -149,7 +149,6 @@ struct task_server {
    task_type type;
    json data;
    bool infill_mode = false;
-    bool embedding_mode = false;
 };

 struct task_result {
@@ -372,7 +371,6 @@ struct llama_client_slot
    std::vector<completion_token_output> generated_token_probs;

    bool infill = false;
-    bool embedding = false;
    bool has_next_token = true;
    bool truncated = false;
    bool stopped_eos = false;
@@ -859,7 +857,7 @@ struct llama_server_context

    void kv_cache_clear() {
        // clear the entire KV cache
-        llama_kv_cache_clear(ctx);
+        llama_kv_cache_tokens_rm(ctx, -1, -1);
        clean_kv_cache = false;
    }

@@ -1246,14 +1244,13 @@ struct llama_server_context
        queue_results.push_back(res);
    }

-    int request_completion(json data, bool infill, bool embedding)
+    int request_completion(json data, bool infill)
    {
        std::lock_guard<std::mutex> lock(mutex_tasks);
        task_server task;
        task.id = id_gen++;
        task.data = data;
        task.infill_mode = infill;
-        task.embedding_mode = embedding;
        task.type = COMPLETION_TASK;
        queue_tasks.push_back(task);
        return task.id;
@@ -1379,7 +1376,7 @@ struct llama_server_context
                    {
                        LOG_TEE("slot unavailable\n");
                        // send error result
-                        send_error(task.id, "slot unavailable");
+                        send_error(task.id, "slot unavaliable");
                        return;
                    }

@@ -1391,7 +1388,6 @@ struct llama_server_context
                    slot->reset();

                    slot->infill = task.infill_mode;
-                    slot->embedding = task.embedding_mode;
                    slot->task_id = task.id;

                    if (!launch_slot_with_data(slot, task.data))
@@ -1699,7 +1695,7 @@ struct llama_server_context
                }

                // prompt evaluated for embedding
-                if (slot.embedding)
+                if (params.embedding)
                {
                    send_embedding(slot);
                    slot.release();
@@ -2278,7 +2274,7 @@ int main(int argc, char **argv)
    svr.Post("/completion", [&llama](const httplib::Request &req, httplib::Response &res)
            {
                json data = json::parse(req.body);
-                const int task_id = llama.request_completion(data, false, false);
+                const int task_id = llama.request_completion(data, false);
                if (!json_value(data, "stream", false)) {
                    std::string completion_text;
                    task_result result = llama.next_result(task_id);
@@ -2333,7 +2329,7 @@ int main(int argc, char **argv)
    svr.Post("/infill", [&llama](const httplib::Request &req, httplib::Response &res)
            {
                json data = json::parse(req.body);
-                const int task_id = llama.request_completion(data, true, false);
+                const int task_id = llama.request_completion(data, true);
                if (!json_value(data, "stream", false)) {
                    std::string completion_text;
                    task_result result = llama.next_result(task_id);
@@ -2437,7 +2433,7 @@ int main(int argc, char **argv)
                {
                    prompt = "";
                }
-                const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true);
+                const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false);
                task_result result = llama.next_result(task_id);
                return res.set_content(result.result_json.dump(), "application/json");
            });
--- a/flake.lock
+++ b/flake.lock
@@ -5,11 +5,11 @@
        "systems": "systems"
      },
      "locked": {
-        "lastModified": 1694529238,
-        "narHash": "sha256-zsNZZGTGnMOf9YpHKJqMSsa0dXbfmxeoJ7xHlrt+xmY=",
+        "lastModified": 1692799911,
+        "narHash": "sha256-3eihraek4qL744EvQXsK1Ha6C3CR7nnT8X2qWap4RNk=",
        "owner": "numtide",
        "repo": "flake-utils",
-        "rev": "ff7b65b44d01cf9ba6a71320833626af21126384",
+        "rev": "f9e7cf818399d17d347f847525c5a5a8032e4e44",
        "type": "github"
      },
      "original": {
@@ -20,11 +20,11 @@
    },
    "nixpkgs": {
      "locked": {
-        "lastModified": 1698318101,
-        "narHash": "sha256-gUihHt3yPD7bVqg+k/UVHgngyaJ3DMEBchbymBMvK1E=",
+        "lastModified": 1692913444,
+        "narHash": "sha256-1SvMQm2DwofNxXVtNWWtIcTh7GctEVrS/Xel/mdc6iY=",
        "owner": "NixOS",
        "repo": "nixpkgs",
-        "rev": "63678e9f3d3afecfeafa0acead6239cdb447574c",
+        "rev": "18324978d632ffc55ef1d928e81630c620f4f447",
        "type": "github"
      },
      "original": {
--- a/flake.nix
+++ b/flake.nix
@@ -11,7 +11,8 @@
        meta.mainProgram = "llama";
        inherit (pkgs.stdenv) isAarch32 isAarch64 isDarwin;
        buildInputs = with pkgs; [ openmpi ];
-        osSpecific = with pkgs; buildInputs ++ (
+        osSpecific = with pkgs; buildInputs ++
+        (
          if isAarch64 && isDarwin then
            with pkgs.darwin.apple_sdk_11_0.frameworks; [
              Accelerate
@@ -50,9 +51,6 @@
        };
        llama-python =
          pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece ]);
-        # TODO(Green-Sky): find a better way to opt-into the heavy ml python runtime
-        llama-python-extra =
-          pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece torchWithoutCuda transformers ]);
        postPatch = ''
          substituteInPlace ./ggml-metal.m \
            --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
@@ -95,15 +93,12 @@
        };
        packages.rocm = pkgs.stdenv.mkDerivation {
          inherit name src meta postPatch nativeBuildInputs postInstall;
-          buildInputs = with pkgs.rocmPackages; buildInputs ++ [ clr hipblas rocblas ];
+          buildInputs = with pkgs; buildInputs ++ [ hip hipblas rocblas ];
          cmakeFlags = cmakeFlags ++ [
            "-DLLAMA_HIPBLAS=1"
            "-DCMAKE_C_COMPILER=hipcc"
            "-DCMAKE_CXX_COMPILER=hipcc"
-            # Build all targets supported by rocBLAS. When updating search for TARGET_LIST_ROCM
-            # in github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/CMakeLists.txt
-            # and select the line that matches the current nixpkgs version of rocBLAS.
-            "-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
+            "-DCMAKE_POSITION_INDEPENDENT_CODE=ON"
          ];
        };
        apps.llama-server = {
@@ -131,9 +126,5 @@
          buildInputs = [ llama-python ];
          packages = nativeBuildInputs ++ osSpecific;
        };
-        devShells.extra = pkgs.mkShell {
-          buildInputs = [ llama-python-extra ];
-          packages = nativeBuildInputs ++ osSpecific;
-        };
      });
 }
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -513,15 +513,6 @@ static __global__ void add_f16_f32_f16(const half * x, const float * y, half * d
    dst[i] = __hadd(x[i], __float2half(y[i]));
 }

-static __global__ void add_f16_f32_f32(const half * x, const float * y, float * dst, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= k) {
-        return;
-    }
-    dst[i] = __half2float(x[i]) + y[i];
-}
-
 static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;

@@ -4702,11 +4693,6 @@ static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, co
    add_f16_f32_f16<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
 }

-static void add_f16_f32_f32_cuda(const half * x, const float * y, float * dst, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
-    add_f16_f32_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
-}
-
 static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
    const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE;
    mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
@@ -6010,10 +5996,7 @@ inline void ggml_cuda_op_add(
        add_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream);
    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
        add_f16_f32_f16_cuda((const half *) src0_dd, src1_dd, (half *) dst_dd, ggml_nelements(src0), main_stream);
-    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
-        add_f16_f32_f32_cuda((const half *) src0_dd, src1_dd, dst_dd, ggml_nelements(src0), main_stream);
    } else {
-        fprintf(stderr, "src0->type: %d  dst->type: %d\n", src0->type, dst->type);
        GGML_ASSERT(false);
    }

--- a/ggml-impl.h
+++ b/ggml-impl.h
@@ -1,237 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-
-// GGML internal header
-
-#include <assert.h>
-#include <stddef.h>
-#include <stdbool.h>
-#include <string.h> // memcpy
-#include <math.h>   // fabsf
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// static_assert should be a #define, but if it's not,
-// fall back to the _Static_assert C11 keyword.
-// if C99 - static_assert is noop
-// ref: https://stackoverflow.com/a/53923785/4039976
-#ifndef static_assert
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
-#define static_assert(cond, msg) _Static_assert(cond, msg)
-#else
-#define static_assert(cond, msg) struct global_scope_noop_trick
-#endif
-#endif
-
-// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
-#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
-#ifndef __FMA__
-#define __FMA__
-#endif
-#ifndef __F16C__
-#define __F16C__
-#endif
-#ifndef __SSE3__
-#define __SSE3__
-#endif
-#endif
-
-#undef MIN
-#undef MAX
-
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-
-// 16-bit float
-// on Arm, we use __fp16
-// on x86, we use uint16_t
-#if defined(__ARM_NEON) && !defined(_MSC_VER)
-
-// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
-//
-//   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
-//
-#include <arm_neon.h>
-
-#define GGML_COMPUTE_FP16_TO_FP32(x) ((float) (x))
-#define GGML_COMPUTE_FP32_TO_FP16(x) (x)
-
-#define GGML_FP16_TO_FP32(x) ((float) (x))
-#define GGML_FP32_TO_FP16(x) (x)
-
-#else
-
-#ifdef __wasm_simd128__
-#include <wasm_simd128.h>
-#else
-#ifdef __POWER9_VECTOR__
-#include <altivec.h>
-#undef bool
-#define bool _Bool
-#else
-#if defined(_MSC_VER) || defined(__MINGW32__)
-#include <intrin.h>
-#else
-#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
-#if !defined(__riscv)
-#include <immintrin.h>
-#endif
-#endif
-#endif
-#endif
-#endif
-
-#ifdef __riscv_v_intrinsic
-#include <riscv_vector.h>
-#endif
-
-#ifdef __F16C__
-
-#ifdef _MSC_VER
-#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
-#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
-#else
-#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
-#endif
-
-#elif defined(__POWER9_VECTOR__)
-
-#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
-/* the inline asm below is about 12% faster than the lookup method */
-#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
-#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
-
-static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
-    register float f;
-    register double d;
-    __asm__(
-        "mtfprd %0,%2\n"
-        "xscvhpdp %0,%0\n"
-        "frsp %1,%0\n" :
-        /* temp */ "=d"(d),
-        /* out */  "=f"(f):
-        /* in */   "r"(h));
-    return f;
-}
-
-static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
-    register double d;
-    register ggml_fp16_t r;
-    __asm__( /* xscvdphp can work on double or single precision */
-        "xscvdphp %0,%2\n"
-        "mffprd %1,%0\n" :
-        /* temp */ "=d"(d),
-        /* out */  "=r"(r):
-        /* in */   "f"(f));
-    return r;
-}
-
-#else
-
-// FP16 <-> FP32
-// ref: https://github.com/Maratyszcza/FP16
-
-static inline float fp32_from_bits(uint32_t w) {
-    union {
-        uint32_t as_bits;
-        float as_value;
-    } fp32;
-    fp32.as_bits = w;
-    return fp32.as_value;
-}
-
-static inline uint32_t fp32_to_bits(float f) {
-    union {
-        float as_value;
-        uint32_t as_bits;
-    } fp32;
-    fp32.as_value = f;
-    return fp32.as_bits;
-}
-
-static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
-    const uint32_t w = (uint32_t) h << 16;
-    const uint32_t sign = w & UINT32_C(0x80000000);
-    const uint32_t two_w = w + w;
-
-    const uint32_t exp_offset = UINT32_C(0xE0) << 23;
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
-    const float exp_scale = 0x1.0p-112f;
-#else
-    const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
-#endif
-    const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
-
-    const uint32_t magic_mask = UINT32_C(126) << 23;
-    const float magic_bias = 0.5f;
-    const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
-
-    const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
-    const uint32_t result = sign |
-        (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
-    return fp32_from_bits(result);
-}
-
-static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
-    const float scale_to_inf = 0x1.0p+112f;
-    const float scale_to_zero = 0x1.0p-110f;
-#else
-    const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
-    const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
-#endif
-    float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
-
-    const uint32_t w = fp32_to_bits(f);
-    const uint32_t shl1_w = w + w;
-    const uint32_t sign = w & UINT32_C(0x80000000);
-    uint32_t bias = shl1_w & UINT32_C(0xFF000000);
-    if (bias < UINT32_C(0x71000000)) {
-        bias = UINT32_C(0x71000000);
-    }
-
-    base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
-    const uint32_t bits = fp32_to_bits(base);
-    const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
-    const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
-    const uint32_t nonsign = exp_bits + mantissa_bits;
-    return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
-}
-
-#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
-
-#endif // __F16C__
-
-#endif // __ARM_NEON
-
-// precomputed f32 table for f16 (256 KB)
-// defined in ggml.c, initialized in ggml_init()
-extern float ggml_table_f32_f16[1 << 16];
-
-// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
-// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
-// This is also true for POWER9.
-#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)
-
-inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
-    uint16_t s;
-    memcpy(&s, &f, sizeof(uint16_t));
-    return ggml_table_f32_f16[s];
-}
-
-#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
-#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
-
-#endif
-
-    // TODO: backend v2 PR
-
-#ifdef __cplusplus
-}
-#endif
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -238,17 +238,14 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
    // load kernels
    {
        NSError * error = nil;
-
-        /*
-        GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \
-                (int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \
-                (int) ctx->pipeline_##name.threadExecutionWidth); \
-        */
 #define GGML_METAL_ADD_KERNEL(name) \
        ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
        ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
+        GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \
+                (int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \
+                (int) ctx->pipeline_##name.threadExecutionWidth); \
        if (error) { \
-            GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
+          GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
            return NULL; \
        }

--- a/ggml-quants.c
+++ b/ggml-quants.c
--- a/ggml-quants.h
+++ b/ggml-quants.h
@@ -1,12 +1,22 @@
 #pragma once

-#include "ggml-impl.h"
+// This is a private API for quantization and dequantization
+// Should not be used directly, use ggml.h instead

-// GGML internal header
+#include "ggml.h"

 #include <stdint.h>
+#include <assert.h>
 #include <stddef.h>

+#ifndef static_assert
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
+#define static_assert(cond, msg) _Static_assert(cond, msg)
+#else
+#define static_assert(cond, msg) struct global_scope_noop_trick
+#endif
+#endif
+
 #define QK4_0 32
 typedef struct {
    ggml_fp16_t d;          // delta
--- a/ggml.c
+++ b/ggml.c
@@ -1,6 +1,6 @@
 #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows

-#include "ggml-impl.h"
+#include "ggml.h"
 #include "ggml-quants.h"

 #if defined(_MSC_VER) || defined(__MINGW32__)
@@ -27,6 +27,18 @@
 #include <unistd.h>
 #endif

+// static_assert should be a #define, but if it's not,
+// fall back to the _Static_assert C11 keyword.
+// if C99 - static_assert is noop
+// ref: https://stackoverflow.com/a/53923785/4039976
+#ifndef static_assert
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
+#define static_assert(cond, msg) _Static_assert(cond, msg)
+#else
+#define static_assert(cond, msg) struct global_scope_noop_trick
+#endif
+#endif
+
 #if defined(_MSC_VER)
 // disable "possible loss of data" to avoid hundreds of casts
 // we should just be careful :)
@@ -94,11 +106,23 @@ typedef void * thread_ret_t;
 #include <unistd.h>

 #endif
-
 #ifdef GGML_USE_CPU_HBM
 #include <hbwmalloc.h>
 #endif

+// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
+#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
+#ifndef __FMA__
+#define __FMA__
+#endif
+#ifndef __F16C__
+#define __F16C__
+#endif
+#ifndef __SSE3__
+#define __SSE3__
+#endif
+#endif
+
 /*#define GGML_PERF*/
 #define GGML_DEBUG 0
 #define GGML_GELU_FP16
@@ -224,27 +248,213 @@ inline static void * ggml_aligned_malloc(size_t size) {
 #include "ggml-opencl.h"
 #endif

+#undef MIN
+#undef MAX
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
 // floating point type used to accumulate sums
 typedef double ggml_float;

+// 16-bit float
+// on Arm, we use __fp16
+// on x86, we use uint16_t
+#if defined(__ARM_NEON) && !defined(_MSC_VER)
+
+// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
+//
+//   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
+//
+#include <arm_neon.h>
+
+#define GGML_COMPUTE_FP16_TO_FP32(x) ((float) (x))
+#define GGML_COMPUTE_FP32_TO_FP16(x) (x)
+
+#define GGML_FP16_TO_FP32(x) ((float) (x))
+#define GGML_FP32_TO_FP16(x) (x)
+
+#else
+
+#ifdef __wasm_simd128__
+#include <wasm_simd128.h>
+#else
+#ifdef __POWER9_VECTOR__
+#include <altivec.h>
+#undef bool
+#define bool _Bool
+#else
+#if defined(_MSC_VER) || defined(__MINGW32__)
+#include <intrin.h>
+#else
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
+#if !defined(__riscv)
+#include <immintrin.h>
+#endif
+#endif
+#endif
+#endif
+#endif
+
+#ifdef __riscv_v_intrinsic
+#include <riscv_vector.h>
+#endif
+
+#ifdef __F16C__
+
+#ifdef _MSC_VER
+#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
+#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
+#else
+#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
+#endif
+
+#elif defined(__POWER9_VECTOR__)
+
+#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
+/* the inline asm below is about 12% faster than the lookup method */
+#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
+#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
+
+static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
+    register float f;
+    register double d;
+    __asm__(
+        "mtfprd %0,%2\n"
+        "xscvhpdp %0,%0\n"
+        "frsp %1,%0\n" :
+        /* temp */ "=d"(d),
+        /* out */  "=f"(f):
+        /* in */   "r"(h));
+    return f;
+}
+
+static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
+    register double d;
+    register ggml_fp16_t r;
+    __asm__( /* xscvdphp can work on double or single precision */
+        "xscvdphp %0,%2\n"
+        "mffprd %1,%0\n" :
+        /* temp */ "=d"(d),
+        /* out */  "=r"(r):
+        /* in */   "f"(f));
+    return r;
+}
+
+#else
+
+// FP16 <-> FP32
+// ref: https://github.com/Maratyszcza/FP16
+
+static inline float fp32_from_bits(uint32_t w) {
+    union {
+        uint32_t as_bits;
+        float as_value;
+    } fp32;
+    fp32.as_bits = w;
+    return fp32.as_value;
+}
+
+static inline uint32_t fp32_to_bits(float f) {
+    union {
+        float as_value;
+        uint32_t as_bits;
+    } fp32;
+    fp32.as_value = f;
+    return fp32.as_bits;
+}
+
+static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
+    const uint32_t w = (uint32_t) h << 16;
+    const uint32_t sign = w & UINT32_C(0x80000000);
+    const uint32_t two_w = w + w;
+
+    const uint32_t exp_offset = UINT32_C(0xE0) << 23;
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
+    const float exp_scale = 0x1.0p-112f;
+#else
+    const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
+#endif
+    const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
+
+    const uint32_t magic_mask = UINT32_C(126) << 23;
+    const float magic_bias = 0.5f;
+    const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
+
+    const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
+    const uint32_t result = sign |
+        (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
+    return fp32_from_bits(result);
+}
+
+static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
+    const float scale_to_inf = 0x1.0p+112f;
+    const float scale_to_zero = 0x1.0p-110f;
+#else
+    const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
+    const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
+#endif
+    float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
+
+    const uint32_t w = fp32_to_bits(f);
+    const uint32_t shl1_w = w + w;
+    const uint32_t sign = w & UINT32_C(0x80000000);
+    uint32_t bias = shl1_w & UINT32_C(0xFF000000);
+    if (bias < UINT32_C(0x71000000)) {
+        bias = UINT32_C(0x71000000);
+    }
+
+    base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
+    const uint32_t bits = fp32_to_bits(base);
+    const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
+    const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
+    const uint32_t nonsign = exp_bits + mantissa_bits;
+    return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
+}
+
+#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
+
+#endif // __F16C__
+
+#endif // __ARM_NEON
+
 //
 // global data
 //

 // precomputed gelu table for f16 (128 KB)
-static ggml_fp16_t ggml_table_gelu_f16[1 << 16];
+static ggml_fp16_t table_gelu_f16[1 << 16];

 // precomputed quick gelu table for f16 (128 KB)
-static ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
+static ggml_fp16_t table_gelu_quick_f16[1 << 16];

 // precomputed silu table for f16 (128 KB)
-static ggml_fp16_t ggml_table_silu_f16[1 << 16];
+static ggml_fp16_t table_silu_f16[1 << 16];

 // precomputed exp table for f16 (128 KB)
-static ggml_fp16_t ggml_table_exp_f16[1 << 16];
+static ggml_fp16_t table_exp_f16[1 << 16];

-// precomputed f32 table for f16 (256 KB) (ggml-impl.h)
-float ggml_table_f32_f16[1 << 16];
+// precomputed f32 table for f16 (256 KB)
+static float table_f32_f16[1 << 16];
+
+// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
+// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
+// This is also true for POWER9.
+#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)
+
+inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
+    uint16_t s;
+    memcpy(&s, &f, sizeof(uint16_t));
+    return table_f32_f16[s];
+}
+
+#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
+#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
+
+#endif

 // note: do not use these inside ggml.c
 // these are meant to be used via the ggml.h API
@@ -422,28 +632,6 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .vec_dot                  = ggml_vec_dot_q4_1_q8_1,
        .vec_dot_type             = GGML_TYPE_Q8_1,
    },
-    [4] = { // GGML_TYPE_Q4_2
-        .type_name                = "DEPRECATED",
-        .blck_size                = 0,
-        .type_size                = 0,
-        .is_quantized             = false,
-        .to_float                 = NULL,
-        .from_float               = NULL,
-        .from_float_reference     = NULL,
-        .vec_dot                  = NULL,
-        .vec_dot_type             = GGML_TYPE_COUNT,
-    },
-    [5] = { // GGML_TYPE_Q4_3
-        .type_name                = "DEPRECATED",
-        .blck_size                = 0,
-        .type_size                = 0,
-        .is_quantized             = false,
-        .to_float                 = NULL,
-        .from_float               = NULL,
-        .from_float_reference     = NULL,
-        .vec_dot                  = NULL,
-        .vec_dot_type             = GGML_TYPE_COUNT,
-    },
    [GGML_TYPE_Q5_0] = {
        .type_name                = "q5_0",
        .blck_size                = QK5_0,
@@ -1363,7 +1551,7 @@ inline static float ggml_gelu_f32(float x) {
 inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    const uint16_t * i16 = (const uint16_t *) x;
    for (int i = 0; i < n; ++i) {
-        y[i] = ggml_table_gelu_f16[i16[i]];
+        y[i] = table_gelu_f16[i16[i]];
    }
 }

@@ -1373,7 +1561,7 @@ inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
    for (int i = 0; i < n; ++i) {
        ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
        memcpy(&t, &fp16, sizeof(uint16_t));
-        y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_f16[t]);
+        y[i] = GGML_FP16_TO_FP32(table_gelu_f16[t]);
    }
 }
 #else
@@ -1391,7 +1579,7 @@ inline static float ggml_gelu_quick_f32(float x) {
 //inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
 //    const uint16_t * i16 = (const uint16_t *) x;
 //    for (int i = 0; i < n; ++i) {
-//        y[i] = ggml_table_gelu_quick_f16[i16[i]];
+//        y[i] = table_gelu_quick_f16[i16[i]];
 //    }
 //}

@@ -1401,7 +1589,7 @@ inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float *
    for (int i = 0; i < n; ++i) {
        ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
        memcpy(&t, &fp16, sizeof(uint16_t));
-        y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]);
+        y[i] = GGML_FP16_TO_FP32(table_gelu_quick_f16[t]);
    }
 }
 #else
@@ -1420,7 +1608,7 @@ inline static float ggml_silu_f32(float x) {
 //inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
 //    const uint16_t * i16 = (const uint16_t *) x;
 //    for (int i = 0; i < n; ++i) {
-//        y[i] = ggml_table_silu_f16[i16[i]];
+//        y[i] = table_silu_f16[i16[i]];
 //    }
 //}

@@ -1430,7 +1618,7 @@ inline static void ggml_vec_silu_f32(const int n, float * y, const float * x) {
    for (int i = 0; i < n; ++i) {
        ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
        memcpy(&t, &fp16, sizeof(uint16_t));
-        y[i] = GGML_FP16_TO_FP32(ggml_table_silu_f16[t]);
+        y[i] = GGML_FP16_TO_FP32(table_silu_f16[t]);
    }
 }
 #else
@@ -2146,11 +2334,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
            for (int i = 0; i < (1 << 16); ++i) {
                uint16_t ui = i;
                memcpy(&ii, &ui, sizeof(ii));
-                const float f = ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(ii);
-                ggml_table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
-                ggml_table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f));
-                ggml_table_silu_f16[i] = GGML_FP32_TO_FP16(ggml_silu_f32(f));
-                ggml_table_exp_f16[i]  = GGML_FP32_TO_FP16(expf(f));
+                const float f = table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(ii);
+                table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
+                table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f));
+                table_silu_f16[i] = GGML_FP32_TO_FP16(ggml_silu_f32(f));
+                table_exp_f16[i]  = GGML_FP32_TO_FP16(expf(f));
            }

            const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
@@ -3153,7 +3341,7 @@ static struct ggml_tensor * ggml_add_cast_impl(
    // TODO: support less-strict constraint
    //       GGML_ASSERT(ggml_can_repeat(b, a));
    GGML_ASSERT(ggml_can_repeat_rows(b, a));
-    GGML_ASSERT(ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16); // currently only supported for quantized input and f16
+    GGML_ASSERT(ggml_is_quantized(a->type)); // currently only supported for quantized input

    bool is_node = false;

@@ -6927,15 +7115,9 @@ static void ggml_compute_forward_add_f16_f32(

    GGML_ASSERT(src0->type == GGML_TYPE_F16);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type  == GGML_TYPE_F16);

-    if (dst->type == GGML_TYPE_F32) {
-        GGML_ASSERT( nb0 == sizeof(float));
-    }
-    else {
-        GGML_ASSERT(dst->type  == GGML_TYPE_F16);
-        GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
-    }
-
+    GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));

    // rows per thread
@@ -6946,35 +7128,18 @@ static void ggml_compute_forward_add_f16_f32(
    const int ir1 = MIN(ir0 + dr, nr);

    if (nb10 == sizeof(float)) {
-        if (dst->type == GGML_TYPE_F16) {
-            for (int ir = ir0; ir < ir1; ++ir) {
-                // src0, src1 and dst are same shape => same indices
-                const int i3 = ir/(ne2*ne1);
-                const int i2 = (ir - i3*ne2*ne1)/ne1;
-                const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+        for (int ir = ir0; ir < ir1; ++ir) {
+            // src0, src1 and dst are same shape => same indices
+            const int i3 = ir/(ne2*ne1);
+            const int i2 = (ir - i3*ne2*ne1)/ne1;
+            const int i1 = (ir - i3*ne2*ne1 - i2*ne1);

-                ggml_fp16_t * dst_ptr  = (ggml_fp16_t *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1);
-                ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
-                float *       src1_ptr = (float *)       ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
+            ggml_fp16_t * dst_ptr  = (ggml_fp16_t *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1);
+            ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+            float *       src1_ptr = (float *)       ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);

-                for (int i = 0; i < ne0; i++) {
-                    dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]);
-                }
-            }
-        } else {
-            for (int ir = ir0; ir < ir1; ++ir) {
-                // src0, src1 and dst are same shape => same indices
-                const int i3 = ir/(ne2*ne1);
-                const int i2 = (ir - i3*ne2*ne1)/ne1;
-                const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
-
-                float *       dst_ptr  = (float *)       ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1);
-                ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
-                float *       src1_ptr = (float *)       ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
-
-                for (int i = 0; i < ne0; i++) {
-                    dst_ptr[i] = GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i];
-                }
+            for (int i = 0; i < ne0; i++) {
+                dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]);
            }
        }
    }
@@ -10536,7 +10701,7 @@ static void ggml_compute_forward_soft_max_f32(
                // const float val = (sp[i] == -INFINITY) ? 0.0 : exp(sp[i] - max);
                ggml_fp16_t s = GGML_FP32_TO_FP16(sp[i] - max);
                memcpy(&scvt, &s, sizeof(scvt));
-                const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
+                const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]);
                sum += (ggml_float)val;
                dp[i] = val;
            }
@@ -12825,7 +12990,7 @@ static void ggml_compute_forward_flash_attn_f32(
 #else
                            ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
                            memcpy(&scvt[j], &s, sizeof(uint16_t));
-                            const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]);
+                            const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
 #endif
                            sump[j] += (ggml_float)val;
                            SS[j] = val;
@@ -13027,7 +13192,7 @@ static void ggml_compute_forward_flash_attn_f16(
                        } else {
                            ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
                            memcpy(&scvt[j], &s, sizeof(uint16_t));
-                            const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]);
+                            const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
                            sump[j] += (ggml_float)val;
                            SS[j] = val;
                        }
@@ -13478,7 +13643,7 @@ static void ggml_compute_forward_flash_attn_back_f32(
 #else
                                    ggml_fp16_t s = GGML_FP32_TO_FP16(SR[j] - max);
                                    memcpy(&scvt[j], &s, sizeof(uint16_t));
-                                    const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]);
+                                    const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
 #endif
                                    sump[j] += (ggml_float)val;
                                    SW[j] = val;
@@ -14228,7 +14393,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
 #else
                    ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
                    memcpy(&scvt, &s, sizeof(scvt));
-                    const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
+                    const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]);
 #endif
                    sum += (ggml_float)val;
                    st[i] = val;
@@ -14342,7 +14507,7 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
 #else
                    ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
                    memcpy(&scvt, &s, sizeof(scvt));
-                    const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
+                    const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]);
 #endif
                    sum += (ggml_float)val;
                    ds0[i] = val;
--- a/ggml.h
+++ b/ggml.h
@@ -709,7 +709,7 @@ extern "C" {
    // Context tensor enumeration and lookup
    GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
    GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
-    GGML_API struct ggml_tensor * ggml_get_tensor      (struct ggml_context * ctx, const char * name);
+    GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);

    GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
    GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
--- a/llama.cpp
+++ b/llama.cpp
--- a/llama.h
+++ b/llama.h
@@ -334,14 +334,17 @@ extern "C" {
    LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx),
            "avoid using this, it will be removed in the future, instead - count the tokens in user code");

-    // Clear the KV cache
-    LLAMA_API void llama_kv_cache_clear(
-            struct llama_context * ctx);
+    // Remove all tokens data of cells in [c0, c1)
+    // c0 < 0 : [0,  c1]
+    // c1 < 0 : [c0, inf)
+    LLAMA_API void llama_kv_cache_tokens_rm(
+            struct llama_context * ctx,
+                         int32_t   c0,
+                         int32_t   c1);

    // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
-    // seq_id < 0 : match any sequence
-    // p0 < 0     : [0,  p1]
-    // p1 < 0     : [p0, inf)
+    // p0 < 0 : [0,  p1]
+    // p1 < 0 : [p0, inf)
    LLAMA_API void llama_kv_cache_seq_rm(
            struct llama_context * ctx,
                    llama_seq_id   seq_id,
@@ -598,13 +601,6 @@ extern "C" {
                           float   p,
                          size_t   min_keep);

-    /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
-    LLAMA_API void llama_sample_min_p(
-            struct llama_context * ctx,
-          llama_token_data_array * candidates,
-                           float   p,
-                          size_t   min_keep);
-
    /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
    LLAMA_API void llama_sample_tail_free(
            struct llama_context * ctx,
--- a/scripts/server-llm.sh
+++ b/scripts/server-llm.sh
@@ -1,391 +0,0 @@
-#!/bin/bash
-#
-# Helper script for deploying llama.cpp server with a single Bash command
-#
-# - Works on Linux and macOS
-# - Supports: CPU, CUDA, Metal, OpenCL
-# - Can run all GGUF models from HuggingFace
-# - Can serve requests in parallel
-# - Always builds latest llama.cpp from GitHub
-#
-# Limitations
-#
-# - Chat templates are poorly supported (base models recommended)
-# - Might be unstable!
-#
-# Usage:
-#   ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose]
-#
-#   --port:       port number, default is 8888
-#   --repo:       path to a repo containing GGUF model files
-#   --wtype:      weights type (f16, q8_0, q4_0, q4_1), default is user-input
-#   --backend:    cpu, cuda, metal, opencl, depends on the OS
-#   --gpu-id:     gpu id, default is 0
-#   --n-parallel: number of parallel requests, default is 8
-#   --n-kv:       KV cache size, default is 4096
-#   --verbose:    verbose output
-#
-# Example:
-#
-#   bash -c "$(curl -s https://ggml.ai/server-llm.sh)"
-#
-
-set -e
-
-# required utils: curl, git, make
-if ! command -v curl &> /dev/null; then
-    printf "[-] curl not found\n"
-    exit 1
-fi
-if ! command -v git &> /dev/null; then
-    printf "[-] git not found\n"
-    exit 1
-fi
-if ! command -v make &> /dev/null; then
-    printf "[-] make not found\n"
-    exit 1
-fi
-
-# parse arguments
-port=8888
-repo=""
-wtype=""
-backend="cpu"
-
-# if macOS, use metal backend by default
-if [[ "$OSTYPE" == "darwin"* ]]; then
-    backend="metal"
-elif command -v nvcc &> /dev/null; then
-    backend="cuda"
-fi
-
-gpu_id=0
-n_parallel=8
-n_kv=4096
-verbose=0
-
-function print_usage {
-    printf "Usage:\n"
-    printf "  ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose]\n\n"
-    printf "  --port:       port number, default is 8888\n"
-    printf "  --repo:       path to a repo containing GGUF model files\n"
-    printf "  --wtype:      weights type (f16, q8_0, q4_0, q4_1), default is user-input\n"
-    printf "  --backend:    cpu, cuda, metal, opencl, depends on the OS\n"
-    printf "  --gpu-id:     gpu id, default is 0\n"
-    printf "  --n-parallel: number of parallel requests, default is 8\n"
-    printf "  --n-kv:       KV cache size, default is 4096\n"
-    printf "  --verbose:    verbose output\n\n"
-    printf "Example:\n\n"
-    printf '  bash -c "$(curl -s https://ggml.ai/server-llm.sh)"\n\n'
-}
-
-while [[ $# -gt 0 ]]; do
-    key="$1"
-    case $key in
-        --port)
-            port="$2"
-            shift
-            shift
-            ;;
-        --repo)
-            repo="$2"
-            shift
-            shift
-            ;;
-        --wtype)
-            wtype="$2"
-            shift
-            shift
-            ;;
-        --backend)
-            backend="$2"
-            shift
-            shift
-            ;;
-        --gpu-id)
-            gpu_id="$2"
-            shift
-            shift
-            ;;
-        --n-parallel)
-            n_parallel="$2"
-            shift
-            shift
-            ;;
-        --n-kv)
-            n_kv="$2"
-            shift
-            shift
-            ;;
-        --verbose)
-            verbose=1
-            shift
-            ;;
-        --help)
-            print_usage
-            exit 0
-            ;;
-        *)
-            echo "Unknown argument: $key"
-            print_usage
-            exit 1
-            ;;
-    esac
-done
-
-# available weights types
-wtypes=("F16" "Q8_0" "Q4_0" "Q4_1" "Q5_0" "Q5_1" "Q6_K" "Q5_K_M" "Q5_K_S" "Q4_K_M" "Q4_K_S" "Q3_K_L" "Q3_K_M" "Q3_K_S" "Q2_K")
-
-wfiles=()
-for wt in "${wtypes[@]}"; do
-    wfiles+=("")
-done
-
-# sample repos
-repos=(
-    "https://huggingface.co/TheBloke/Llama-2-7B-GGUF"
-    "https://huggingface.co/TheBloke/Llama-2-13B-GGUF"
-    "https://huggingface.co/TheBloke/Llama-2-70B-GGUF"
-    "https://huggingface.co/TheBloke/CodeLlama-7B-GGUF"
-    "https://huggingface.co/TheBloke/CodeLlama-13B-GGUF"
-    "https://huggingface.co/TheBloke/CodeLlama-34B-GGUF"
-    "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF"
-    "https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF"
-    "https://huggingface.co/TheBloke/OpenHermes-2-Mistral-7B-GGUF"
-    "https://huggingface.co/TheBloke/CausalLM-7B-GGUF"
-)
-
-printf "\n"
-printf "[I] This is a helper script for deploying llama.cpp's server on this machine.\n\n"
-printf "    Based on the options that follow, the script might download a model file\n"
-printf "    from the internet, which can be a few GBs in size. The script will also\n"
-printf "    build the latest llama.cpp source code from GitHub, which can be unstable.\n"
-printf "\n"
-printf "    Upon success, an HTTP server will be started and it will serve the selected\n"
-printf "    model using llama.cpp for demonstration purposes.\n"
-printf "\n"
-printf "    Please note:\n"
-printf "\n"
-printf "    - All new data will be stored in the current folder\n"
-printf "    - The server will be listening on all network interfaces\n"
-printf "    - The server will run with default settings which are not always optimal\n"
-printf "    - Do not judge the quality of a model based on the results from this script\n"
-printf "    - Do not use this script to benchmark llama.cpp\n"
-printf "    - Do not use this script in production\n"
-printf "    - This script is only for demonstration purposes\n"
-printf "\n"
-printf "    If you don't know what you are doing, please press Ctrl-C to abort now\n"
-printf "\n"
-printf "    Press Enter to continue ...\n\n"
-
-read
-
-if [[ -z "$repo" ]]; then
-    printf "[+] No repo provided from the command line\n"
-    printf "    Please select a number from the list below or enter an URL:\n\n"
-
-    is=0
-    for r in "${repos[@]}"; do
-        printf "    %2d) %s\n" $is "$r"
-        is=$((is+1))
-    done
-
-    # ask for repo until index of sample repo is provided or an URL
-    while [[ -z "$repo" ]]; do
-        printf "\n    Or choose one from: https://huggingface.co/models?sort=trending&search=gguf\n\n"
-        read -p "[+] Select repo: " repo
-
-        # check if the input is a number
-        if [[ "$repo" =~ ^[0-9]+$ ]]; then
-            if [[ "$repo" -ge 0 && "$repo" -lt ${#repos[@]} ]]; then
-                repo="${repos[$repo]}"
-            else
-                printf "[-] Invalid repo index: %s\n" "$repo"
-                repo=""
-            fi
-        elif [[ "$repo" =~ ^https?:// ]]; then
-            repo="$repo"
-        else
-            printf "[-] Invalid repo URL: %s\n" "$repo"
-            repo=""
-        fi
-    done
-fi
-
-# remove suffix
-repo=$(echo "$repo" | sed -E 's/\/tree\/main$//g')
-
-printf "[+] Checking for GGUF model files in %s\n" "$repo"
-
-# find GGUF files in the source
-# TODO: better logic
-model_tree="${repo%/}/tree/main"
-model_files=$(curl -s "$model_tree" | grep -i "\\.gguf</span>" | sed -E 's/.*<span class="truncate group-hover:underline">(.*)<\/span><\/a>/\1/g')
-
-# list all files in the provided git repo
-printf "[+] Model files:\n\n"
-for file in $model_files; do
-    # determine iw by grepping the filename with wtypes
-    iw=-1
-    is=0
-    for wt in "${wtypes[@]}"; do
-        # uppercase
-        ufile=$(echo "$file" | tr '[:lower:]' '[:upper:]')
-        if [[ "$ufile" =~ "$wt" ]]; then
-            iw=$is
-            break
-        fi
-        is=$((is+1))
-    done
-
-    if [[ $iw -eq -1 ]]; then
-        continue
-    fi
-
-    wfiles[$iw]="$file"
-
-    have=" "
-    if [[ -f "$file" ]]; then
-        have="*"
-    fi
-
-    printf "    %2d) %s %s\n" $iw "$have" "$file"
-done
-
-# ask for weights type until provided and available
-while [[ -z "$wtype" ]]; do
-    printf "\n"
-    read -p "[+] Select weight type: " wtype
-    wfile="${wfiles[$wtype]}"
-
-    if [[ -z "$wfile" ]]; then
-        printf "[-] Invalid weight type: %s\n" "$wtype"
-        wtype=""
-    fi
-done
-
-printf "[+] Selected weight type: %s (%s)\n" "$wtype" "$wfile"
-
-url="${repo%/}/resolve/main/$wfile"
-
-# check file if the model has been downloaded before
-chk="$wfile.chk"
-
-# check if we should download the file
-# - if $wfile does not exist
-# - if $wfile exists but $chk does not exist
-# - if $wfile exists and $chk exists but $wfile is newer than $chk
-# TODO: better logic using git lfs info
-
-do_download=0
-
-if [[ ! -f "$wfile" ]]; then
-    do_download=1
-elif [[ ! -f "$chk" ]]; then
-    do_download=1
-elif [[ "$wfile" -nt "$chk" ]]; then
-    do_download=1
-fi
-
-if [[ $do_download -eq 1 ]]; then
-    printf "[+] Downloading weights from %s\n" "$url"
-
-    # download the weights file
-    curl -o "$wfile" -# -L "$url"
-
-    # create a check file if successful
-    if [[ $? -eq 0 ]]; then
-        printf "[+] Creating check file %s\n" "$chk"
-        touch "$chk"
-    fi
-else
-    printf "[+] Using cached weights %s\n" "$wfile"
-fi
-
-# get latest llama.cpp and build
-
-printf "[+] Downloading latest llama.cpp\n"
-
-llama_cpp_dir="__llama_cpp_port_${port}__"
-
-if [[ -d "$llama_cpp_dir" && ! -f "$llama_cpp_dir/__ggml_script__" ]]; then
-    # if the dir exists and there isn't a file "__ggml_script__" in it, abort
-    printf "[-] Directory %s already exists\n" "$llama_cpp_dir"
-    printf "[-] Please remove it and try again\n"
-    exit 1
-elif [[ -d "$llama_cpp_dir" ]]; then
-    printf "[+] Directory %s already exists\n" "$llama_cpp_dir"
-    printf "[+] Using cached llama.cpp\n"
-
-    cd "$llama_cpp_dir"
-    git reset --hard
-    git fetch
-    git checkout origin/master
-
-    cd ..
-else
-    printf "[+] Cloning llama.cpp\n"
-
-    git clone https://github.com/ggerganov/llama.cpp "$llama_cpp_dir"
-fi
-
-# mark that that the directory is made by this script
-touch "$llama_cpp_dir/__ggml_script__"
-
-if [[ $verbose -eq 1 ]]; then
-    set -x
-fi
-
-# build
-cd "$llama_cpp_dir"
-
-make clean
-
-log="--silent"
-if [[ $verbose -eq 1 ]]; then
-    log=""
-fi
-
-if [[ "$backend" == "cuda" ]]; then
-    printf "[+] Building with CUDA backend\n"
-    LLAMA_CUBLAS=1 make -j server $log
-elif [[ "$backend" == "cpu" ]]; then
-    printf "[+] Building with CPU backend\n"
-    make -j server $log
-elif [[ "$backend" == "metal" ]]; then
-    printf "[+] Building with Metal backend\n"
-    make -j server $log
-elif [[ "$backend" == "opencl" ]]; then
-    printf "[+] Building with OpenCL backend\n"
-    LLAMA_CLBLAST=1 make -j server $log
-else
-    printf "[-] Unknown backend: %s\n" "$backend"
-    exit 1
-fi
-
-# run the server
-
-printf "[+] Running server\n"
-
-args=""
-if [[ "$backend" == "cuda" ]]; then
-    export CUDA_VISIBLE_DEVICES=$gpu_id
-    args="-ngl 999"
-elif [[ "$backend" == "cpu" ]]; then
-    args="-ngl 0"
-elif [[ "$backend" == "metal" ]]; then
-    args="-ngl 999"
-elif [[ "$backend" == "opencl" ]]; then
-    args="-ngl 999"
-else
-    printf "[-] Unknown backend: %s\n" "$backend"
-    exit 1
-fi
-
-if [[ $verbose -eq 1 ]]; then
-    args="$args --verbose"
-fi
-
-./server -m "../$wfile" --host 0.0.0.0 --port "$port" -c $n_kv -np "$n_parallel" $args
-
-exit 0
--- a/tests/test-double-float.cpp
+++ b/tests/test-double-float.cpp
@@ -4,7 +4,7 @@

 #undef NDEBUG
 #include <cassert>
-#if !defined(__riscv) && !defined(__s390__) && !defined(__ARM_NEON)
+#if !defined(__riscv) && !defined(__s390__)
 #include <immintrin.h>
 #endif
 #include <cmath>
--- a/tests/test-quantize-fns.cpp
+++ b/tests/test-quantize-fns.cpp
@@ -129,13 +129,6 @@ int main(int argc, char * argv[]) {
        ggml_type type = (ggml_type) i;
        ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);

-        // deprecated - skip
-        if (qfns.blck_size == 0) {
-            continue;
-        }
-
-        printf("Testing %s\n", ggml_type_name((ggml_type) i));
-
        if (qfns.from_float && qfns.to_float) {
            const float total_error = total_quantization_error(qfns, test_size, test_data.data());
            const float max_quantization_error =
Author	SHA1	Message	Date
cebtenzzre	8a86b95e87	quantize : --pure option for disabling k-quant mixtures	2023-10-28 16:37:03 -04:00
Georgi Gerganov	ee37e35dc5	ggml-quants : fix Zig and Swift builds + quantize tool ggml-ci	2023-10-28 17:24:10 +03:00
Georgi Gerganov	3412be728b	ggml : factor all quantization code in ggml-quants ggml-ci	2023-10-28 17:14:49 +03:00