llama : refactor quantization to avoid <mutex> header

ggml-ci
2026-02-12 14:03:20 +02:00 · 2024-01-02 15:56:57 +02:00
61 changed files with 5413 additions and 9848 deletions
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@@ -9,7 +9,7 @@
  git,
  python3,
  mpi,
-  openblas, # TODO: Use the generic `blas` so users could switch between alternative implementations
+  openblas, # TODO: Use the generic `blas` so users could switch betwen alternative implementations
  cudaPackages,
  darwin,
  rocmPackages,
--- a/.github/workflows/nix-flake-update.yml
+++ b/.github/workflows/nix-flake-update.yml
@@ -19,4 +19,4 @@ jobs:
          pr-labels: |
            nix
          pr-reviewers: philiptaron,SomeoneSerge
-          token: ${{ secrets.FLAKE_TOKEN }}
+          token: ${{ secrets.GITHUB_TOKEN }}
--- a/.gitignore
+++ b/.gitignore
@@ -51,7 +51,6 @@ models-mnt
 /lookup
 /main
 /metal
-/passkey
 /perplexity
 /q8dot
 /quantize
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -177,29 +177,27 @@ if (LLAMA_METAL)
    if (LLAMA_METAL_SHADER_DEBUG)
        # custom command to do the following:
        #   xcrun -sdk macosx metal    -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
-        #   xcrun -sdk macosx metallib                   ggml-metal.air   -o default.metallib
+        #   xcrun -sdk macosx metallib                   ggml-metal.air   -o ggml.metallib
        #
        # note: this is the only way I found to disable fast-math in Metal. it's ugly, but at least it works
        #       disabling fast math is needed in order to pass tests/test-backend-ops
        # note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1
-        # note: unfortunately, we have to call it default.metallib instead of ggml.metallib
-        #       ref: https://github.com/ggerganov/whisper.cpp/issues/1720
        set(XC_FLAGS -fno-fast-math -fno-inline -g)
        if (LLAMA_QKK_64)
            set(XC_FLAGS ${XC_FLAGS} -DQK_K=64)
        endif()

        add_custom_command(
-            OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
+            OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml.metallib
            COMMAND xcrun -sdk macosx metal    ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
-            COMMAND xcrun -sdk macosx metallib                ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air   -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
+            COMMAND xcrun -sdk macosx metallib                ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air   -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml.metallib
            DEPENDS ggml-metal.metal
            COMMENT "Compiling Metal kernels"
        )

        add_custom_target(
            ggml-metal ALL
-            DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
+            DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml.metallib
        )
    endif()

@@ -230,11 +228,7 @@ if (LLAMA_BLAS)
            if (${LLAMA_BLAS_VENDOR} MATCHES "Generic")
                pkg_check_modules(DepBLAS REQUIRED blas)
            elseif (${LLAMA_BLAS_VENDOR} MATCHES "OpenBLAS")
-                # As of openblas v0.3.22, the 64-bit is named openblas64.pc
-                pkg_check_modules(DepBLAS openblas64)
-                if (NOT DepBLAS_FOUND)
-                    pkg_check_modules(DepBLAS REQUIRED openblas)
-                endif()
+                pkg_check_modules(DepBLAS REQUIRED openblas)
            elseif (${LLAMA_BLAS_VENDOR} MATCHES "FLAME")
                pkg_check_modules(DepBLAS REQUIRED blis)
            elseif (${LLAMA_BLAS_VENDOR} MATCHES "ATLAS")
--- a/10
+++ b/10
@@ -1,8 +1,8 @@
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \
-	main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
+	main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
 	simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search  \
-	speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey tests/test-c.o
+	speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup tests/test-c.o

 # Binaries only useful for tests
 TEST_TARGETS = \
@@ -614,9 +614,6 @@ quantize-stats: examples/quantize-stats/quantize-stats.cpp    build-info.o ggml.
 perplexity: examples/perplexity/perplexity.cpp                ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-imatrix: examples/imatrix/imatrix.cpp                         ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
 embedding: examples/embedding/embedding.cpp                   ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

@@ -668,9 +665,6 @@ lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS
 lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
 ifdef LLAMA_METAL
 metal: examples/metal/metal.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
--- a/Package.swift
+++ b/Package.swift
@@ -13,17 +13,21 @@ let package = Package(
    products: [
        .library(name: "llama", targets: ["llama"]),
    ],
-    dependencies: [
-        .package(url: "https://github.com/ggerganov/ggml.git", .branch("release"))
-    ],
    targets: [
        .target(
            name: "llama",
-            dependencies: ["ggml"],
            path: ".",
-            exclude: ["ggml-metal.metal"],
+            exclude: [],
            sources: [
+                "ggml.c",
                "llama.cpp",
+                "ggml-alloc.c",
+                "ggml-backend.c",
+                "ggml-quants.c",
+                "ggml-metal.m",
+            ],
+            resources: [
+                .process("ggml-metal.metal")
            ],
            publicHeadersPath: "spm-headers",
            cSettings: [
--- a/README.md
+++ b/README.md
@@ -10,7 +10,6 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

 ### Hot topics

- New SOTA quantized models, including pure 2-bits: https://huggingface.co/ikawrakow
 - Collecting Apple Silicon performance stats:
  - M-series: https://github.com/ggerganov/llama.cpp/discussions/4167
  - A-series: https://github.com/ggerganov/llama.cpp/discussions/4508
@@ -119,7 +118,6 @@ as the main playground for developing new features for the [ggml](https://github
 - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
 - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
 - Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
- JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp)
 - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
 - Rust: [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
 - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
@@ -137,7 +135,6 @@ as the main playground for developing new features for the [ggml](https://github
 - [semperai/amica](https://github.com/semperai/amica)
 - [psugihara/FreeChat](https://github.com/psugihara/FreeChat)
 - [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal)
- [iohub/collama](https://github.com/iohub/coLLaMA)

 ---

--- a/common/common.cpp
+++ b/common/common.cpp
@@ -220,20 +220,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.n_ctx = std::stoi(argv[i]);
-        } else if (arg == "--grp-attn-n" || arg == "-gan") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-
-            params.grp_attn_n = std::stoi(argv[i]);
-        } else if (arg == "--grp-attn-w" || arg == "-gaw") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-
-            params.grp_attn_w = std::stoi(argv[i]);
        } else if (arg == "--rope-freq-base") {
            if (++i >= argc) {
                invalid_param = true;
@@ -630,12 +616,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.ppl_stride = std::stoi(argv[i]);
-        } else if (arg == "-ptc" || arg == "--print-token-count") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_print = std::stoi(argv[i]);
        } else if (arg == "--ppl-output-type") {
            if (++i >= argc) {
                invalid_param = true;
@@ -924,10 +904,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("                        Not recommended since this is both slower and uses more VRAM.\n");
 #endif // GGML_USE_CUBLAS
 #endif
-    printf("  -gan N, --grp-attn-n N\n");
-    printf("                        group-attention factor (default: %d)\n", params.grp_attn_n);
-    printf("  -gaw N, --grp-attn-w N\n");
-    printf("                        group-attention width (default: %.1f)\n", (double)params.grp_attn_w);
    printf("  --verbose-prompt      print prompt before generation\n");
    printf("  -dkvc, --dump-kv-cache\n");
    printf("                        verbose print of the KV cache\n");
@@ -950,8 +926,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  --override-kv KEY=TYPE:VALUE\n");
    printf("                        advanced option to override model metadata by key. may be specified multiple times.\n");
    printf("                        types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
-    printf("  -stc N --print-token-count N\n");
-    printf("                        print token count every N tokens (default: %d)\n", params.n_print);
    printf("\n");
 #ifndef LOG_DISABLE_LOGS
    log_print_usage();
--- a/common/common.h
+++ b/common/common.h
@@ -62,9 +62,6 @@ struct gpt_params {
    int32_t main_gpu                        = 0;     // the GPU that is used for scratch and small tensors
    float   tensor_split[LLAMA_MAX_DEVICES] = {0};   // how split tensors should be distributed across GPUs
    int32_t n_beams                         = 0;     // if non-zero then use beam search of given width.
-    int32_t grp_attn_n                      = 1;     // group-attention factor
-    int32_t grp_attn_w                      = 512;   // group-attention width
-    int32_t n_print                         = -1;    // print token count every n tokens (-1 = disabled)
    float   rope_freq_base                  = 0.0f;  // RoPE base frequency
    float   rope_freq_scale                 = 0.0f;  // RoPE frequency scaling factor
    float   yarn_ext_factor                 = -1.0f; // YaRN extrapolation mix factor
@@ -243,3 +240,4 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);

 // Dump the KV cache view showing individual sequences in each cell (long output).
 void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
+
--- a/common/train.cpp
+++ b/common/train.cpp
@@ -1107,7 +1107,7 @@ void print_common_train_usage(int /*argc*/, char ** /*argv*/, const struct train
    fprintf(stderr, "  --sample-start STR         Sets the starting point for samples after the specified pattern. If empty use every token position as sample start. (default '%s')\n", params->sample_start.c_str());
    fprintf(stderr, "  --include-sample-start     Include the sample start in the samples. (default off)\n");
    fprintf(stderr, "  --escape                   process sample start escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
-    fprintf(stderr, "  --overlapping-samples      Samples may overlap, will include sample-start of second and following samples. When off, samples will end at begin of next sample. (default off)\n");
+    fprintf(stderr, "  --overlapping-samples      Samples my overlap, will include sample-start of second and following samples. When off, samples will end at begin of next sample. (default off)\n");
    fprintf(stderr, "  --fill-with-next-samples   Samples shorter than context length will be followed by the next (shuffled) samples. (default off)\n");
    fprintf(stderr, "  --separate-with-eos        When fill-with-next-samples, insert end-of-sequence token between samples.%s\n", params->separate_with_eos ? " (default)" : "");
    fprintf(stderr, "  --separate-with-bos        When fill-with-next-samples, insert begin-of-sequence token between samples.%s\n", params->separate_with_bos ? " (default)" : "");
--- a/convert.py
+++ b/convert.py
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -31,12 +31,10 @@ else()
    add_subdirectory(quantize-stats)
    add_subdirectory(save-load-state)
    add_subdirectory(simple)
-    add_subdirectory(passkey)
    add_subdirectory(speculative)
    add_subdirectory(lookahead)
    add_subdirectory(lookup)
    add_subdirectory(train-text-from-scratch)
-    add_subdirectory(imatrix)
    if (LLAMA_METAL)
        add_subdirectory(metal)
    endif()
--- a/examples/base-translate.sh
+++ b/examples/base-translate.sh
@@ -1,61 +0,0 @@
-#!/bin/bash
-#
-# Few-shot translation example.
-# Requires a base model (i.e. no fine-tuned or instruct models).
-#
-# Usage:
-#
-#   cd llama.cpp
-#   make -j
-#
-#   ./examples/base-translate.sh <model-base> "<text>" [extra-main-args]
-#
-
-if [ $# -lt 2 ]; then
-  echo "Usage: ./base-translate.sh <model-base> \"<text>\" [extra-main-args]"
-  exit 1
-fi
-
-eargs=""
-if [ $# -gt 2 ]; then
-  eargs="${@:3}"
-fi
-
-ftmp="__llama.cpp_example_tmp__.txt"
-trap "rm -f $ftmp" EXIT
-
-echo "Translate from English to French:
-
-===
-
-sea otter, peppermint, plush girafe:
-
-sea otter => loutre de mer
-peppermint => menthe poivrée
-plush girafe => girafe peluche
-
-===
-
-violin
-
-violin => violon
-
-===
-
-phone, computer, mouse, keyboard:
-
-phone => téléphone
-computer => ordinateur
-mouse => souris
-keyboard => clavier
-
-===
-" > $ftmp
-
-echo "$2
-" >> $ftmp
-
-model=$1
-
-# generate the most likely continuation until the string "===" is found
-./main -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -69,7 +69,6 @@ int main(int argc, char ** argv) {

    std::vector<llama_token> tokens_list;
    tokens_list = ::llama_tokenize(model, params.prompt, true);
-
    const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size())*n_parallel;

    // initialize the context
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -3,9 +3,15 @@
 #include "llama.h"
 #include "common.h"
 #include "train.h"
+#include <unordered_map>
 #include <vector>
+#include <cassert>
+#include <climits>
 #include <cstring>
+#include <cstdarg>
 #include <ctime>
+#include <random>
+#include <stdexcept>
 #include <algorithm>
 #include <string>

--- a/examples/imatrix/CMakeLists.txt
+++ b/examples/imatrix/CMakeLists.txt
@@ -1,5 +0,0 @@
-set(TARGET imatrix)
-add_executable(${TARGET} imatrix.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -1,380 +0,0 @@
-#include "common.h"
-#include "llama.h"
-
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <ctime>
-#include <sstream>
-#include <thread>
-#include <mutex>
-#include <vector>
-#include <fstream>
-#include <unordered_map>
-#include <algorithm>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-struct Stats {
-    std::vector<float> values;
-    int ncall = 0;
-};
-
-struct StatParams {
-    std::string ofile = "imatrix.dat";
-    int         n_output_frequency = 10;
-    int         verbosity = 1;
-    bool        collect_output_weight = false;
-};
-
-class IMatrixCollector {
-public:
-    IMatrixCollector() = default;
-    void set_parameters(StatParams&& params) { m_params = std::move(params); }
-    void collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1);
-    void save_imatrix() const;
-private:
-    std::unordered_map<std::string, Stats> m_stats;
-    StatParams                             m_params;
-    std::mutex                             m_mutex;
-    int                                    m_last_call = 0;
-};
-
-void IMatrixCollector::collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1) {
-    if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return;
-    if (!(strncmp(src0->name, "blk.", 4) == 0 || (m_params.collect_output_weight && strcmp(src0->name, "output.weight") == 0))) return;
-    std::lock_guard<std::mutex> lock(m_mutex);
-    auto& e = m_stats[src0->name];
-    if (e.values.empty()) {
-        e.values.resize(src1->ne[0], 0);
-    }
-    else if (e.values.size() != (size_t)src1->ne[0]) {
-        fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]);
-        exit(1); //GGML_ASSERT(false);
-    }
-    ++e.ncall;
-    if (m_params.verbosity > 1) {
-        printf("%s[%d]: %s, %d x %d, %d\n",__func__,m_last_call,src0->name,(int)src1->ne[0],(int)src1->ne[1],(int)src1->type);
-    }
-    for (int row = 0; row < (int)src1->ne[1]; ++row) {
-        const float * x = (const float *)src1->data + row * src1->ne[0];
-        for (int j = 0; j < (int)src1->ne[0]; ++j) {
-            e.values[j] += x[j]*x[j];
-        }
-    }
-    if (e.ncall > m_last_call) {
-        m_last_call = e.ncall;
-        if (m_last_call % m_params.n_output_frequency == 0) {
-            save_imatrix();
-        }
-    }
-}
-
-void IMatrixCollector::save_imatrix() const {
-    const char * fname = m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str();
-    std::ofstream out(fname, std::ios::binary);
-    int n_entries = m_stats.size();
-    out.write((const char*)&n_entries, sizeof(n_entries));
-    for (auto& p : m_stats) {
-        int len = p.first.size();
-        out.write((const char*)&len, sizeof(len));
-        out.write(p.first.c_str(), len);
-        out.write((const char*)&p.second.ncall, sizeof(p.second.ncall));
-        int nval = p.second.values.size();
-        out.write((const char*)&nval, sizeof(nval));
-        if (nval > 0) out.write((const char*)p.second.values.data(), nval*sizeof(float));
-    }
-    if (m_params.verbosity > 0) {
-        fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n",__func__,m_last_call,fname);
-    }
-}
-
-static IMatrixCollector g_collector;
-
-static void ik_collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1) {
-    g_collector.collect_imatrix(src0, src1);
-}
-
-
-struct results_log_softmax {
-    double log_softmax;
-    float  logit;
-    float  prob;
-};
-
-static std::vector<float> softmax(const std::vector<float>& logits) {
-    std::vector<float> probs(logits.size());
-    float max_logit = logits[0];
-    for (float v : logits) {
-        max_logit = std::max(max_logit, v);
-    }
-    double sum_exp = 0.0;
-    for (size_t i = 0; i < logits.size(); i++) {
-        // Subtract the maximum logit value from the current logit value for numerical stability
-        const float logit = logits[i] - max_logit;
-        const float exp_logit = expf(logit);
-        sum_exp += exp_logit;
-        probs[i] = exp_logit;
-    }
-    for (size_t i = 0; i < probs.size(); i++) {
-        probs[i] /= sum_exp;
-    }
-    return probs;
-}
-
-static results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
-    float max_logit = logits[0];
-    for (int i = 1; i < n_vocab; ++i) {
-        max_logit = std::max(max_logit, logits[i]);
-    }
-    double sum_exp = 0.0;
-    for (int i = 0; i < n_vocab; ++i) {
-        sum_exp += expf(logits[i] - max_logit);
-    }
-    return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp};
-}
-
-static void process_logits(
-    int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
-    double & nll, double & nll2, float * logit_history, float * prob_history
-) {
-    std::mutex mutex;
-    int counter = 0;
-    auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () {
-        double local_nll  = 0;
-        double local_nll2 = 0;
-        while (true) {
-            std::unique_lock<std::mutex> lock(mutex);
-            int i = counter++;
-            if (i >= n_token) {
-                nll += local_nll; nll2 += local_nll2;
-                break;
-            }
-            lock.unlock();
-            const results_log_softmax results = log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]);
-            const double v = -results.log_softmax;
-            local_nll += v;
-            local_nll2 += v*v;
-
-            logit_history[i] = results.logit;
-            prob_history[i]  = results.prob;
-        }
-    };
-    for (auto & w : workers) {
-        w = std::thread(compute);
-    }
-    compute();
-    for (auto & w : workers) {
-        w.join();
-    }
-}
-
-static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
-
-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
-    const int n_ctx = llama_n_ctx(ctx);
-
-    auto tim1 = std::chrono::high_resolution_clock::now();
-    fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
-
-    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
-
-    auto tim2 = std::chrono::high_resolution_clock::now();
-    fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
-
-    if (int(tokens.size()) < 2*n_ctx) {
-        fprintf(stderr, "%s: you need at least %d tokens for a context of %d tokens\n",__func__,2*n_ctx,
-                n_ctx);
-        fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
-        return false;
-    }
-
-    std::vector<float> logit_history;
-    logit_history.resize(tokens.size());
-
-    std::vector<float> prob_history;
-    prob_history.resize(tokens.size());
-
-    const int n_chunk_max = tokens.size() / n_ctx;
-
-    const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
-    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
-    const int n_batch = params.n_batch;
-
-    int count = 0;
-    double nll = 0.0;
-    double nll2 = 0.0;
-
-    fprintf(stderr, "%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch);
-
-    std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
-
-    for (int i = 0; i < n_chunk; ++i) {
-        const int start =     i * n_ctx;
-        const int end   = start + n_ctx;
-
-        const int num_batches = (n_ctx + n_batch - 1) / n_batch;
-
-        std::vector<float> logits;
-
-        const auto t_start = std::chrono::high_resolution_clock::now();
-
-        // clear the KV cache
-        llama_kv_cache_clear(ctx);
-
-        for (int j = 0; j < num_batches; ++j) {
-            const int batch_start = start + j * n_batch;
-            const int batch_size  = std::min(end - batch_start, n_batch);
-
-            // save original token and restore it after eval
-            const auto token_org = tokens[batch_start];
-
-            // add BOS token for the first batch of each chunk
-            if (add_bos && j == 0) {
-                tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
-            }
-
-            if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
-                fprintf(stderr, "%s : failed to eval\n", __func__);
-                return false;
-            }
-
-            // restore the original token in case it was set to BOS
-            tokens[batch_start] = token_org;
-
-            const auto * batch_logits = llama_get_logits(ctx);
-            logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
-        }
-
-        const auto t_end = std::chrono::high_resolution_clock::now();
-
-        if (i == 0) {
-            const float t_total = std::chrono::duration<float>(t_end - t_start).count();
-            fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
-            int total_seconds = (int)(t_total * n_chunk);
-            if (total_seconds >= 60*60) {
-                fprintf(stderr, "%d hours ", total_seconds / (60*60));
-                total_seconds = total_seconds % (60*60);
-            }
-            fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
-        }
-
-        const int first = n_ctx/2;
-        process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
-                       workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
-        count += n_ctx - first - 1;
-
-        printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
-        fflush(stdout);
-    }
-    printf("\n");
-
-    nll2 /= count;
-    nll /= count;
-    const double ppl = exp(nll);
-    nll2 -= nll * nll;
-    if (nll2 > 0) {
-        nll2 = sqrt(nll2/(count-1));
-        printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
-    } else {
-        printf("Unexpected negative standard deviation of log(prob)\n");
-    }
-
-    return true;
-}
-
-int main(int argc, char ** argv) {
-
-    StatParams sparams;
-    std::vector<char*> args;
-    args.push_back(argv[0]);
-    int iarg = 1;
-    for (; iarg < argc-1; ++iarg) {
-        std::string arg{argv[iarg]};
-        if (arg == "-o" || arg == "--output-file") {
-            sparams.ofile = argv[++iarg];
-        }
-        else if (arg == "-ofreq" || arg == "--output-frequency") {
-            sparams.n_output_frequency = std::stoi(argv[++iarg]);
-        }
-        else if (arg == "-ow" || arg == "--output-weight") {
-            sparams.collect_output_weight = std::stoi(argv[++iarg]);
-        }
-        else if (arg == "--verbosity") {
-            sparams.verbosity = std::stoi(argv[++iarg]);
-        } else {
-            args.push_back(argv[iarg]);
-        }
-    }
-    if (iarg < argc) {
-        args.push_back(argv[iarg]);
-    }
-
-    gpt_params params;
-    params.n_batch = 512;
-    if (!gpt_params_parse(args.size(), args.data(), params)) {
-        return 1;
-    }
-
-    g_collector.set_parameters(std::move(sparams));
-
-    ggml_set_imatrix_collection(ik_collect_imatrix);
-
-    params.logits_all = true;
-    params.n_batch = std::min(params.n_batch, params.n_ctx);
-
-    print_build_info();
-
-    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = time(NULL);
-    }
-
-    fprintf(stderr, "%s: seed  = %u\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
-    if (params.random_prompt) {
-        params.prompt = gpt_random_prompt(rng);
-    }
-
-    llama_backend_init(params.numa);
-
-    llama_model * model;
-    llama_context * ctx;
-
-    // load the model and apply lora adapter, if any
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
-    if (model == NULL) {
-        fprintf(stderr, "%s: error: unable to load model\n", __func__);
-        return 1;
-    }
-
-    const int n_ctx_train = llama_n_ctx_train(model);
-    if (params.n_ctx > n_ctx_train) {
-        fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
-                __func__, n_ctx_train, params.n_ctx);
-    }
-
-    // print system information
-    {
-        fprintf(stderr, "\n");
-        fprintf(stderr, "%s\n", get_system_info(params).c_str());
-    }
-
-    bool OK = compute_imatrix(ctx, params);
-    if (!OK) {
-        return 1;
-    }
-
-    g_collector.save_imatrix();
-
-    llama_print_timings(ctx);
-
-    llama_free(ctx);
-    llama_free_model(model);
-
-    llama_backend_free();
-
-    return 0;
-}
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -138,7 +138,6 @@ struct cmd_params {
    std::vector<int> n_threads;
    std::vector<int> n_gpu_layers;
    std::vector<int> main_gpu;
-    std::vector<bool> no_kv_offload;
    std::vector<bool> mul_mat_q;
    std::vector<std::array<float, LLAMA_MAX_DEVICES>> tensor_split;
    int reps;
@@ -156,7 +155,6 @@ static const cmd_params cmd_params_defaults = {
    /* n_threads     */ {get_num_physical_cores()},
    /* n_gpu_layers  */ {99},
    /* main_gpu      */ {0},
-    /* no_kv_offload */ {false},
    /* mul_mat_q     */ {true},
    /* tensor_split  */ {{}},
    /* reps          */ 5,
@@ -178,7 +176,6 @@ static void print_usage(int /* argc */, char ** argv) {
    printf("  -t, --threads <n>                 (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
    printf("  -ngl, --n-gpu-layers <n>          (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
    printf("  -mg, --main-gpu <i>               (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
-    printf("  -nkvo, --no-kv-offload <0|1>      (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
    printf("  -mmq, --mul-mat-q <0|1>           (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
    printf("  -ts, --tensor_split <ts0/ts1/..>               \n");
    printf("  -r, --repetitions <n>             (default: %d)\n", cmd_params_defaults.reps);
@@ -312,13 +309,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                break;
            }
            params.main_gpu = split<int>(argv[i], split_delim);
-        } else if (arg == "-nkvo" || arg == "--no-kv-offload") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            auto p = split<bool>(argv[i], split_delim);
-            params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
        } else if (arg == "-mmq" || arg == "--mul-mat-q") {
            if (++i >= argc) {
                invalid_param = true;
@@ -393,7 +383,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
    if (params.type_v.empty())       { params.type_v = cmd_params_defaults.type_v; }
    if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
    if (params.main_gpu.empty())     { params.main_gpu = cmd_params_defaults.main_gpu; }
-    if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
    if (params.mul_mat_q.empty())    { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
    if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
    if (params.n_threads.empty())    { params.n_threads = cmd_params_defaults.n_threads; }
@@ -411,7 +400,6 @@ struct cmd_params_instance {
    int n_threads;
    int n_gpu_layers;
    int main_gpu;
-    bool no_kv_offload;
    bool mul_mat_q;
    std::array<float, LLAMA_MAX_DEVICES> tensor_split;

@@ -440,7 +428,6 @@ struct cmd_params_instance {
        cparams.type_k = type_k;
        cparams.type_v = type_v;
        cparams.mul_mat_q = mul_mat_q;
-        cparams.offload_kqv = !no_kv_offload;

        return cparams;
    }
@@ -457,7 +444,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
    for (const auto & tk : params.type_k)
    for (const auto & tv : params.type_v)
    for (const auto & mmq : params.mul_mat_q)
-    for (const auto & nkvo : params.no_kv_offload)
    for (const auto & nt : params.n_threads) {
        cmd_params_instance instance = {
            /* .model        = */ m,
@@ -469,7 +455,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
            /* .n_threads    = */ nt,
            /* .n_gpu_layers = */ nl,
            /* .main_gpu     = */ mg,
-            /* .no_kv_offload= */ nkvo,
            /* .mul_mat_q    = */ mmq,
            /* .tensor_split = */ ts,
        };
@@ -491,7 +476,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
    for (const auto & tk : params.type_k)
    for (const auto & tv : params.type_v)
    for (const auto & mmq : params.mul_mat_q)
-    for (const auto & nkvo : params.no_kv_offload)
    for (const auto & nt : params.n_threads) {
        for (const auto & n_prompt : params.n_prompt) {
            if (n_prompt == 0) {
@@ -507,7 +491,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .n_threads    = */ nt,
                /* .n_gpu_layers = */ nl,
                /* .main_gpu     = */ mg,
-                /* .no_kv_offload= */ nkvo,
                /* .mul_mat_q    = */ mmq,
                /* .tensor_split = */ ts,
            };
@@ -528,7 +511,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .n_threads    = */ nt,
                /* .n_gpu_layers = */ nl,
                /* .main_gpu     = */ mg,
-                /* .no_kv_offload= */ nkvo,
                /* .mul_mat_q    = */ mmq,
                /* .tensor_split = */ ts,
            };
@@ -577,7 +559,6 @@ struct test {
    ggml_type type_v;
    int n_gpu_layers;
    int main_gpu;
-    bool no_kv_offload;
    bool mul_mat_q;
    std::array<float, LLAMA_MAX_DEVICES> tensor_split;
    int n_prompt;
@@ -598,7 +579,6 @@ struct test {
        type_v = inst.type_v;
        n_gpu_layers = inst.n_gpu_layers;
        main_gpu = inst.main_gpu;
-        no_kv_offload = inst.no_kv_offload;
        mul_mat_q = inst.mul_mat_q;
        tensor_split = inst.tensor_split;
        n_prompt = inst.n_prompt;
@@ -660,8 +640,7 @@ struct test {
            "cpu_info", "gpu_info",
            "model_filename", "model_type", "model_size", "model_n_params",
            "n_batch", "n_threads", "type_k", "type_v",
-            "n_gpu_layers", "main_gpu", "no_kv_offload",
-            "mul_mat_q", "tensor_split",
+            "n_gpu_layers", "main_gpu", "mul_mat_q", "tensor_split",
            "n_prompt", "n_gen", "test_time",
            "avg_ns", "stddev_ns",
            "avg_ts", "stddev_ts"
@@ -680,7 +659,7 @@ struct test {
            return INT;
        }
        if (field == "cuda" || field == "opencl" || field == "metal" || field == "gpu_blas" || field == "blas" ||
-            field == "f16_kv" || field == "no_kv_offload" || field == "mul_mat_q") {
+            field == "f16_kv" || field == "mul_mat_q") {
            return BOOL;
        }
        if (field == "avg_ts" || field == "stddev_ts") {
@@ -711,8 +690,7 @@ struct test {
            cpu_info, gpu_info,
            model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
            std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
-            std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(no_kv_offload),
-            std::to_string(mul_mat_q), tensor_split_str,
+            std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), tensor_split_str,
            std::to_string(n_prompt), std::to_string(n_gen), test_time,
            std::to_string(avg_ns()), std::to_string(stdev_ns()),
            std::to_string(avg_ts()), std::to_string(stdev_ts())
@@ -873,9 +851,6 @@ struct markdown_printer : public printer {
        if (field == "mul_mat_q") {
            return "mmq";
        }
-        if (field == "no_kv_offload") {
-            return "nkvo";
-        }
        if (field == "tensor_split") {
            return "ts";
        }
@@ -910,9 +885,6 @@ struct markdown_printer : public printer {
        if (params.mul_mat_q.size() > 1 || params.mul_mat_q != cmd_params_defaults.mul_mat_q) {
            fields.push_back("mul_mat_q");
        }
-        if (params.no_kv_offload.size() > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload) {
-            fields.push_back("no_kv_offload");
-        }
        if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
            fields.push_back("tensor_split");
        }
--- a/examples/llama.swiftui/README.md
+++ b/examples/llama.swiftui/README.md
@@ -1,12 +1,7 @@
-# llama.cpp/examples/llama.swiftui
+# llama.swiftui

-Local inference of llama.cpp on an iPhone. This is a sample app that can be used as a starting
-point for more advanced projects.
-
-For usage instructions and performance stats, check the following discussion: https://github.com/ggerganov/llama.cpp/discussions/4508
-
-![image](https://github.com/ggerganov/llama.cpp/assets/1991296/2b40284f-8421-47a2-b634-74eece09a299)
-
-Video demonstration:
+Local inference of llama.cpp on an iPhone.
+So far I only tested with starcoder 1B model, but it can most likely handle 7B models as well.

 https://github.com/bachittle/llama.cpp/assets/39804642/e290827a-4edb-4093-9642-2a5e399ec545
+
--- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@@ -1,5 +1,8 @@
 import Foundation
-import llama
+
+// To use this in your own project, add llama.cpp as a swift package dependency
+// and uncomment this import line.
+// import llama

 enum LlamaError: Error {
    case couldNotInitializeContext
@@ -158,7 +161,7 @@ actor LlamaContext {
            new_token_id = llama_sample_token_greedy(context, &candidates_p)
        }

-        if new_token_id == llama_token_eos(model) || n_cur == n_len {
+        if new_token_id == llama_token_eos(context) || n_cur == n_len {
            print("\n")
            let new_token_str = String(cString: temporary_invalid_cchars + [0])
            temporary_invalid_cchars.removeAll()
--- a/examples/llama.swiftui/llama.cpp.swift/bridging-header.h
+++ b/examples/llama.swiftui/llama.cpp.swift/bridging-header.h
@@ -0,0 +1,5 @@
+//
+//  Use this file to import your target's public headers that you would like to expose to Swift.
+//
+
+#import "llama.h"
--- a/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj
+++ b/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj
@@ -7,32 +7,51 @@
 	objects = {

 /* Begin PBXBuildFile section */
+		542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 542376072B0D9BFB008E6A1C /* ggml-quants.c */; settings = {COMPILER_FLAGS = "-O3"; }; };
+		5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */ = {isa = PBXBuildFile; fileRef = 5423760A2B0D9C4B008E6A1C /* ggml-backend.c */; settings = {COMPILER_FLAGS = "-O3"; }; };
+		542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */ = {isa = PBXBuildFile; fileRef = 549479C82AC9E10B00E0F78B /* ggml-metal.metal */; };
+		542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09B2AC8723900A8AEE9 /* ggml.c */; settings = {COMPILER_FLAGS = "-DGGML_USE_ACCELERATE -DGGML_USE_METAL -DGGML_USE_K_QUANTS -O3"; }; };
+		542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */; settings = {COMPILER_FLAGS = "-O3"; }; };
+		542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 542EA0A12AC8729100A8AEE9 /* llama.cpp */; settings = {COMPILER_FLAGS = "-DGGML_USE_K_QUANTS -DGGML_USE_METAL -O3"; }; };
 		549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 549479CA2AC9E16000E0F78B /* Metal.framework */; };
+		549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */ = {isa = PBXBuildFile; fileRef = 549479C52AC9E0F200E0F78B /* ggml-metal.m */; settings = {COMPILER_FLAGS = "-fno-objc-arc -DGGML_SWIFT -DGGML_USE_METAL -O3"; }; };
 		7FA3D2B32B2EA2F600543F92 /* DownloadButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */; };
 		8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */; };
 		8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83782AC328BD0096AF73 /* ContentView.swift */; };
 		8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837A2AC328BE0096AF73 /* Assets.xcassets */; };
+		8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */; };
 		8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 8A39BE092AC7601000BFEB40 /* Accelerate.framework */; };
 		8A3F84242AC4C891005E2EE8 /* models in Resources */ = {isa = PBXBuildFile; fileRef = 8A3F84232AC4C891005E2EE8 /* models */; };
 		8A907F332AC7138A006146EA /* LibLlama.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A907F322AC7134E006146EA /* LibLlama.swift */; };
 		8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */; };
-		DF810E132B4A5BA200301144 /* llama in Frameworks */ = {isa = PBXBuildFile; productRef = DF810E122B4A5BA200301144 /* llama */; };
-		F1FE20E22B465ECA00B45541 /* LoadCustomButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = F1FE20E12B465EC900B45541 /* LoadCustomButton.swift */; };
 /* End PBXBuildFile section */

 /* Begin PBXFileReference section */
+		542376062B0D9BEA008E6A1C /* ggml-quants.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-quants.h"; path = "../../ggml-quants.h"; sourceTree = "<group>"; };
+		542376072B0D9BFB008E6A1C /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../ggml-quants.c"; sourceTree = "<group>"; };
+		542376092B0D9C40008E6A1C /* ggml-backend.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-backend.h"; path = "../../ggml-backend.h"; sourceTree = "<group>"; };
+		5423760A2B0D9C4B008E6A1C /* ggml-backend.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-backend.c"; path = "../../ggml-backend.c"; sourceTree = "<group>"; };
+		542EA09B2AC8723900A8AEE9 /* ggml.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = ggml.c; path = ../../ggml.c; sourceTree = "<group>"; };
+		542EA09C2AC8723900A8AEE9 /* ggml.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ggml.h; path = ../../ggml.h; sourceTree = "<group>"; };
+		542EA09E2AC8725700A8AEE9 /* ggml-alloc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-alloc.h"; path = "../../ggml-alloc.h"; sourceTree = "<group>"; };
+		542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-alloc.c"; path = "../../ggml-alloc.c"; sourceTree = "<group>"; };
+		542EA0A12AC8729100A8AEE9 /* llama.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = llama.cpp; path = ../../llama.cpp; sourceTree = "<group>"; };
+		542EA0A22AC8729100A8AEE9 /* llama.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = llama.h; path = ../../llama.h; sourceTree = "<group>"; };
+		549479C52AC9E0F200E0F78B /* ggml-metal.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "ggml-metal.m"; path = "../../ggml-metal.m"; sourceTree = "<group>"; };
+		549479C62AC9E0F200E0F78B /* ggml-metal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-metal.h"; path = "../../ggml-metal.h"; sourceTree = "<group>"; };
+		549479C82AC9E10B00E0F78B /* ggml-metal.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; name = "ggml-metal.metal"; path = "../../ggml-metal.metal"; sourceTree = "<group>"; };
 		549479CA2AC9E16000E0F78B /* Metal.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Metal.framework; path = System/Library/Frameworks/Metal.framework; sourceTree = SDKROOT; };
 		7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = DownloadButton.swift; sourceTree = "<group>"; };
+		8A08D20A2AC73B1500FE6CD4 /* bridging-header.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "bridging-header.h"; sourceTree = "<group>"; };
 		8A1C83732AC328BD0096AF73 /* llama.swiftui.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = llama.swiftui.app; sourceTree = BUILT_PRODUCTS_DIR; };
 		8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = llama_swiftuiApp.swift; sourceTree = "<group>"; };
 		8A1C83782AC328BD0096AF73 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; };
 		8A1C837A2AC328BE0096AF73 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
+		8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = "Preview Assets.xcassets"; sourceTree = "<group>"; };
 		8A39BE092AC7601000BFEB40 /* Accelerate.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Accelerate.framework; path = System/Library/Frameworks/Accelerate.framework; sourceTree = SDKROOT; };
 		8A3F84232AC4C891005E2EE8 /* models */ = {isa = PBXFileReference; lastKnownFileType = folder; name = models; path = llama.swiftui/Resources/models; sourceTree = "<group>"; };
 		8A907F322AC7134E006146EA /* LibLlama.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LibLlama.swift; sourceTree = "<group>"; };
 		8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaState.swift; sourceTree = "<group>"; };
-		DF2D2FE72B4A59BE00FCB72D /* llama.cpp */ = {isa = PBXFileReference; lastKnownFileType = wrapper; name = llama.cpp; path = ../..; sourceTree = "<group>"; };
-		F1FE20E12B465EC900B45541 /* LoadCustomButton.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LoadCustomButton.swift; sourceTree = "<group>"; };
 /* End PBXFileReference section */

 /* Begin PBXFrameworksBuildPhase section */
@@ -40,7 +59,6 @@
 			isa = PBXFrameworksBuildPhase;
 			buildActionMask = 2147483647;
 			files = (
-				DF810E132B4A5BA200301144 /* llama in Frameworks */,
 				549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */,
 				8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */,
 			);
@@ -49,10 +67,30 @@
 /* End PBXFrameworksBuildPhase section */

 /* Begin PBXGroup section */
+		8A08D1F62AC7383900FE6CD4 /* llama.cpp */ = {
+			isa = PBXGroup;
+			children = (
+				5423760A2B0D9C4B008E6A1C /* ggml-backend.c */,
+				542376092B0D9C40008E6A1C /* ggml-backend.h */,
+				542376062B0D9BEA008E6A1C /* ggml-quants.h */,
+				542376072B0D9BFB008E6A1C /* ggml-quants.c */,
+				549479C82AC9E10B00E0F78B /* ggml-metal.metal */,
+				549479C62AC9E0F200E0F78B /* ggml-metal.h */,
+				549479C52AC9E0F200E0F78B /* ggml-metal.m */,
+				542EA09B2AC8723900A8AEE9 /* ggml.c */,
+				542EA09C2AC8723900A8AEE9 /* ggml.h */,
+				542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */,
+				542EA09E2AC8725700A8AEE9 /* ggml-alloc.h */,
+				542EA0A12AC8729100A8AEE9 /* llama.cpp */,
+				542EA0A22AC8729100A8AEE9 /* llama.h */,
+			);
+			name = llama.cpp;
+			sourceTree = "<group>";
+		};
 		8A1C836A2AC328BD0096AF73 = {
 			isa = PBXGroup;
 			children = (
-				DF2D2FE72B4A59BE00FCB72D /* llama.cpp */,
+				8A08D1F62AC7383900FE6CD4 /* llama.cpp */,
 				8A907F312AC7134E006146EA /* llama.cpp.swift */,
 				8A3F84232AC4C891005E2EE8 /* models */,
 				8A1C83752AC328BD0096AF73 /* llama.swiftui */,
@@ -77,10 +115,19 @@
 				8A9F7C4A2AC332BF008AE1EA /* UI */,
 				8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */,
 				8A1C837A2AC328BE0096AF73 /* Assets.xcassets */,
+				8A1C837C2AC328BE0096AF73 /* Preview Content */,
 			);
 			path = llama.swiftui;
 			sourceTree = "<group>";
 		};
+		8A1C837C2AC328BE0096AF73 /* Preview Content */ = {
+			isa = PBXGroup;
+			children = (
+				8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */,
+			);
+			path = "Preview Content";
+			sourceTree = "<group>";
+		};
 		8A39BE082AC7601000BFEB40 /* Frameworks */ = {
 			isa = PBXGroup;
 			children = (
@@ -108,6 +155,7 @@
 		8A907F312AC7134E006146EA /* llama.cpp.swift */ = {
 			isa = PBXGroup;
 			children = (
+				8A08D20A2AC73B1500FE6CD4 /* bridging-header.h */,
 				8A907F322AC7134E006146EA /* LibLlama.swift */,
 			);
 			path = llama.cpp.swift;
@@ -118,7 +166,6 @@
 			children = (
 				7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */,
 				8A1C83782AC328BD0096AF73 /* ContentView.swift */,
-				F1FE20E12B465EC900B45541 /* LoadCustomButton.swift */,
 			);
 			path = UI;
 			sourceTree = "<group>";
@@ -148,7 +195,6 @@
 			);
 			name = llama.swiftui;
 			packageProductDependencies = (
-				DF810E122B4A5BA200301144 /* llama */,
 			);
 			productName = llama.swiftui;
 			productReference = 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */;
@@ -195,7 +241,9 @@
 			isa = PBXResourcesBuildPhase;
 			buildActionMask = 2147483647;
 			files = (
+				542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */,
 				8A3F84242AC4C891005E2EE8 /* models in Resources */,
+				8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */,
 				8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */,
 			);
 			runOnlyForDeploymentPostprocessing = 0;
@@ -207,12 +255,17 @@
 			isa = PBXSourcesBuildPhase;
 			buildActionMask = 2147483647;
 			files = (
-				F1FE20E22B465ECA00B45541 /* LoadCustomButton.swift in Sources */,
+				542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */,
+				549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */,
+				542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */,
 				8A907F332AC7138A006146EA /* LibLlama.swift in Sources */,
+				542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */,
 				8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */,
 				8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */,
 				8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */,
 				7FA3D2B32B2EA2F600543F92 /* DownloadButton.swift in Sources */,
+				542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */,
+				5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */,
 			);
 			runOnlyForDeploymentPostprocessing = 0;
 		};
@@ -342,9 +395,11 @@
 			isa = XCBuildConfiguration;
 			buildSettings = {
 				ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
+				ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor;
 				CLANG_ENABLE_MODULES = YES;
 				CODE_SIGN_STYLE = Automatic;
 				CURRENT_PROJECT_VERSION = 1;
+				DEVELOPMENT_ASSET_PATHS = "\"llama.swiftui/Preview Content\"";
 				DEVELOPMENT_TEAM = STLSG3FG8Q;
 				ENABLE_PREVIEWS = YES;
 				GENERATE_INFOPLIST_FILE = YES;
@@ -361,12 +416,11 @@
 				MARKETING_VERSION = 1.0;
 				PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift";
 				PRODUCT_NAME = "$(TARGET_NAME)";
-				SUPPORTED_PLATFORMS = "iphoneos iphonesimulator xros xrsimulator";
-				SUPPORTS_XR_DESIGNED_FOR_IPHONE_IPAD = NO;
 				SWIFT_EMIT_LOC_STRINGS = YES;
+				SWIFT_OBJC_BRIDGING_HEADER = "llama.cpp.swift/bridging-header.h";
 				SWIFT_OPTIMIZATION_LEVEL = "-Onone";
 				SWIFT_VERSION = 5.0;
-				TARGETED_DEVICE_FAMILY = "1,2,7";
+				TARGETED_DEVICE_FAMILY = "1,2";
 			};
 			name = Debug;
 		};
@@ -374,9 +428,11 @@
 			isa = XCBuildConfiguration;
 			buildSettings = {
 				ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
+				ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor;
 				CLANG_ENABLE_MODULES = YES;
 				CODE_SIGN_STYLE = Automatic;
 				CURRENT_PROJECT_VERSION = 1;
+				DEVELOPMENT_ASSET_PATHS = "\"llama.swiftui/Preview Content\"";
 				DEVELOPMENT_TEAM = STLSG3FG8Q;
 				ENABLE_PREVIEWS = YES;
 				GENERATE_INFOPLIST_FILE = YES;
@@ -393,11 +449,10 @@
 				MARKETING_VERSION = 1.0;
 				PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift";
 				PRODUCT_NAME = "$(TARGET_NAME)";
-				SUPPORTED_PLATFORMS = "iphoneos iphonesimulator xros xrsimulator";
-				SUPPORTS_XR_DESIGNED_FOR_IPHONE_IPAD = NO;
 				SWIFT_EMIT_LOC_STRINGS = YES;
+				SWIFT_OBJC_BRIDGING_HEADER = "llama.cpp.swift/bridging-header.h";
 				SWIFT_VERSION = 5.0;
-				TARGETED_DEVICE_FAMILY = "1,2,7";
+				TARGETED_DEVICE_FAMILY = "1,2";
 			};
 			name = Release;
 		};
@@ -423,13 +478,6 @@
 			defaultConfigurationName = Release;
 		};
 /* End XCConfigurationList section */
-
-/* Begin XCSwiftPackageProductDependency section */
-		DF810E122B4A5BA200301144 /* llama */ = {
-			isa = XCSwiftPackageProductDependency;
-			productName = llama;
-		};
-/* End XCSwiftPackageProductDependency section */
 	};
 	rootObject = 8A1C836B2AC328BD0096AF73 /* Project object */;
 }
--- a/examples/llama.swiftui/llama.swiftui/Assets.xcassets/AccentColor.colorset/Contents.json
+++ b/examples/llama.swiftui/llama.swiftui/Assets.xcassets/AccentColor.colorset/Contents.json
@@ -0,0 +1,11 @@
+{
+  "colors" : [
+    {
+      "idiom" : "universal"
+    }
+  ],
+  "info" : {
+    "author" : "xcode",
+    "version" : 1
+  }
+}
--- a/examples/llama.swiftui/llama.swiftui/Preview
+++ b/examples/llama.swiftui/llama.swiftui/Preview
@@ -0,0 +1,6 @@
+{
+  "info" : {
+    "author" : "xcode",
+    "version" : 1
+  }
+}
--- a/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift
+++ b/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift
@@ -103,8 +103,6 @@ struct ContentView: View {
                    ContentView.cleanupModelCaches()
                    llamaState.cacheCleared = true
                }
-
-                LoadCustomButton(llamaState: llamaState)
            }
            .padding(.top, 4)
            .font(.system(size: 12))
--- a/examples/llama.swiftui/llama.swiftui/UI/LoadCustomButton.swift
+++ b/examples/llama.swiftui/llama.swiftui/UI/LoadCustomButton.swift
@@ -1,44 +0,0 @@
-import SwiftUI
-import UniformTypeIdentifiers
-
-struct LoadCustomButton: View {
-    @ObservedObject private var llamaState: LlamaState
-    @State private var showFileImporter = false
-
-    init(llamaState: LlamaState) {
-        self.llamaState = llamaState
-    }
-
-    var body: some View {
-        VStack {
-            Button(action: {
-                showFileImporter = true
-            }) {
-                Text("Load Custom Model")
-            }
-        }
-        .fileImporter(
-            isPresented: $showFileImporter,
-            allowedContentTypes: [UTType(filenameExtension: "gguf", conformingTo: .data)!],
-            allowsMultipleSelection: false
-        ) { result in
-            switch result {
-            case .success(let files):
-                files.forEach { file in
-                    let gotAccess = file.startAccessingSecurityScopedResource()
-                    if !gotAccess { return }
-
-                    do {
-                        try llamaState.loadModel(modelUrl: file.absoluteURL)
-                    } catch let err {
-                        print("Error: \(err.localizedDescription)")
-                    }
-
-                    file.stopAccessingSecurityScopedResource()
-                }
-            case .failure(let error):
-                print(error)
-            }
-        }
-    }
-}
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -126,7 +126,24 @@ static struct ggml_tensor * get_tensor(struct ggml_context * ctx, const std::str
 }

 static std::string get_ftype(int ftype) {
-    return ggml_type_name(static_cast<ggml_type>(ftype));
+    switch (ftype) {
+    case 0:
+        return "f32";
+    case 1:
+        return "f16";
+    case 2:
+        return "q4_0";
+    case 3:
+        return "q4_1";
+    case 6:
+        return "q5_0";
+    case 7:
+        return "q5_1";
+    case 8:
+        return "q8_0";
+    default:
+        throw std::runtime_error(format("%s: Unrecognized file type: %d\n", __func__, ftype));
+    }
 }

 //
@@ -516,7 +533,6 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
    buffer_size += n_tensors * 128 /* CLIP PADDING */;

    clip_ctx * new_clip = new clip_ctx;
-
 #ifdef GGML_USE_CUBLAS
    new_clip->backend = ggml_backend_cuda_init(0);
    printf("%s: CLIP using CUDA backend\n", __func__);
@@ -527,7 +543,6 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
    printf("%s: CLIP using Metal backend\n", __func__);
 #endif

-
    if (!new_clip->backend) {
        new_clip->backend = ggml_backend_cpu_init();
        printf("%s: CLIP using CPU backend\n", __func__);
@@ -916,8 +931,26 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i

    ggml_type type = GGML_TYPE_Q4_1;

-    assert(itype < GGML_TYPE_COUNT);
-    type = static_cast<ggml_type>(itype);
+    switch (itype) {
+        case 2:
+            type = GGML_TYPE_Q4_0;
+            break;
+        case 3:
+            type = GGML_TYPE_Q4_1;
+            break;
+        case 6:
+            type = GGML_TYPE_Q5_0;
+            break;
+        case 7:
+            type = GGML_TYPE_Q5_1;
+            break;
+        case 8:
+            type = GGML_TYPE_Q8_0;
+            break;
+        default:
+            fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype);
+            return false;
+    };

    auto * ctx_clip = clip_model_load(fname_inp, 2);

@@ -977,10 +1010,6 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i

        if (quantize) {
            new_type = type;
-            if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) {
-                new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type
-                // fprintf(stderr, "%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
-            }
            const size_t n_elms = ggml_nelements(cur);
            float * f32_data;

@@ -1025,21 +1054,6 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
                case GGML_TYPE_Q8_0: {
                    new_size = ggml_quantize_q8_0(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
                } break;
-                case GGML_TYPE_Q2_K: {
-                    new_size = ggml_quantize_q2_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
-                } break;
-                case GGML_TYPE_Q3_K: {
-                    new_size = ggml_quantize_q3_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
-                } break;
-                case GGML_TYPE_Q4_K: {
-                    new_size = ggml_quantize_q4_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
-                } break;
-                case GGML_TYPE_Q5_K: {
-                    new_size = ggml_quantize_q5_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
-                } break;
-                case GGML_TYPE_Q6_K: {
-                    new_size = ggml_quantize_q6_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
-                } break;
                default: {
                    fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, new_type);
                    return false;
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -243,9 +243,6 @@ int main(int argc, char ** argv) {
    }

    auto image_embed = load_image(ctx_llava, &params);
-    if (!image_embed) {
-        return 1;
-    }

    // process the prompt
    process_prompt(ctx_llava, image_embed, &params, params.prompt);
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -439,21 +439,6 @@ int main(int argc, char ** argv) {
    LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
    LOG_TEE("sampling order: \n%s\n", llama_sampling_order_print(sparams).c_str());
    LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
-
-    // group-attention state
-    // number of grouped KV tokens so far (used only if params.grp_attn_n > 1)
-    int ga_i = 0;
-
-    const int ga_n = params.grp_attn_n;
-    const int ga_w = params.grp_attn_w;
-
-    if (ga_n != 1) {
-        GGML_ASSERT(ga_n > 0                    && "grp_attn_n must be positive");                     // NOLINT
-        GGML_ASSERT(ga_w % ga_n == 0            && "grp_attn_w must be a multiple of grp_attn_n");     // NOLINT
-      //GGML_ASSERT(n_ctx_train % ga_w == 0     && "n_ctx_train must be a multiple of grp_attn_w");    // NOLINT
-      //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT
-        LOG_TEE("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w);
-    }
    LOG_TEE("\n\n");

    if (params.interactive) {
@@ -500,7 +485,7 @@ int main(int argc, char ** argv) {
    while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
        // predict
        if (!embd.empty()) {
-            // Note: (n_ctx - 4) here is to match the logic for commandline prompt handling via
+            // Note: n_ctx - 4 here is to match the logic for commandline prompt handling via
            // --prompt or --file which uses the same value.
            int max_embd_size = n_ctx - 4;

@@ -515,61 +500,37 @@ int main(int argc, char ** argv) {
                fflush(stdout);
            }

-            if (ga_n == 1) {
-                // infinite text generation via context shifting
-                // if we run out of context:
-                // - take the n_keep first tokens from the original prompt (via n_past)
-                // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
-                if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
-                    if (params.n_predict == -2) {
-                        LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
-                        break;
-                    }
-
-                    const int n_left    = n_past - params.n_keep - 1;
-                    const int n_discard = n_left/2;
-
-                    LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
-                            n_past, n_left, n_ctx, params.n_keep, n_discard);
-
-                    llama_kv_cache_seq_rm   (ctx, 0, params.n_keep + 1            , params.n_keep + n_discard + 1);
-                    llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
-
-                    n_past -= n_discard;
-
-                    if (ctx_guidance) {
-                        n_past_guidance -= n_discard;
-                    }
-
-                    LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);
-
-                    LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
-
-                    LOG("clear session path\n");
-                    path_session.clear();
+            // infinite text generation via context swapping
+            // if we run out of context:
+            // - take the n_keep first tokens from the original prompt (via n_past)
+            // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
+            if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
+                if (params.n_predict == -2) {
+                    LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
+                    break;
                }
-            } else {
-                // context extension via Self-Extend
-                while (n_past >= ga_i + ga_w) {
-                    const int ib = (ga_n*ga_i)/ga_w;
-                    const int bd = (ga_w/ga_n)*(ga_n - 1);
-                    const int dd = (ga_w/ga_n) - ib*bd - ga_w;

-                    LOG("\n");
-                    LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd);
-                    LOG("div:   [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
-                    LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);
+                const int n_left    = n_past - params.n_keep - 1;
+                const int n_discard = n_left/2;

-                    llama_kv_cache_seq_shift(ctx, 0, ga_i,                n_past,              ib*bd);
-                    llama_kv_cache_seq_div  (ctx, 0, ga_i + ib*bd,        ga_i + ib*bd + ga_w, ga_n);
-                    llama_kv_cache_seq_shift(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd,      dd);
+                LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
+                    n_past, n_left, n_ctx, params.n_keep, n_discard);

-                    n_past -= bd;
+                llama_kv_cache_seq_rm   (ctx, 0, params.n_keep + 1            , params.n_keep + n_discard + 1);
+                llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);

-                    ga_i += ga_w/ga_n;
+                n_past -= n_discard;

-                    LOG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i);
+                if (ctx_guidance) {
+                    n_past_guidance -= n_discard;
                }
+
+                LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);
+
+                LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
+
+                LOG("clear session path\n");
+                path_session.clear();
            }

            // try to reuse a matching prefix from the loaded session instead of re-eval (via n_past)
@@ -650,10 +611,6 @@ int main(int argc, char ** argv) {
                n_past += n_eval;

                LOG("n_past = %d\n", n_past);
-                // Display total tokens alongside total time
-                if (params.n_print > 0 && n_past % params.n_print == 0) {
-                    LOG_TEE("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx);
-                }
            }

            if (!embd.empty() && !path_session.empty()) {
--- a/examples/passkey/CMakeLists.txt
+++ b/examples/passkey/CMakeLists.txt
@@ -1,5 +0,0 @@
-set(TARGET passkey)
-add_executable(${TARGET} passkey.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/passkey/README.md
+++ b/examples/passkey/README.md
@@ -1,12 +0,0 @@
-# llama.cpp/example/passkey
-
-See the following PRs for more info:
-
- https://github.com/ggerganov/llama.cpp/pull/3856
- https://github.com/ggerganov/llama.cpp/pull/4810
-
-### Usage
-
-```bash
-make -j && ./passkey ./models/llama-7b-v2/ggml-model-f16.gguf 250
-```
--- a/examples/passkey/passkey.cpp
+++ b/examples/passkey/passkey.cpp
@@ -1,296 +0,0 @@
-#include "common.h"
-#include "llama.h"
-
-#include <cmath>
-#include <cstdio>
-#include <string>
-#include <vector>
-
-int main(int argc, char ** argv) {
-    gpt_params params;
-
-    if (argc == 1 || argv[1][0] == '-') {
-        printf("usage: %s MODEL_PATH N_JUNK N_GRP I_POS SEED\n" , argv[0]);
-        return 1 ;
-    }
-
-    int seed = -1;
-
-    int n_junk = 250; // number of times to repeat the junk text
-    int n_keep = 32;  // number of tokens in the prompt prefix
-    int n_grp  = 1;   // if more than 1 - perform LongLM SelfExtend
-    int i_pos  = -1;  // position of the passkey in the junk text
-
-    if (argc >= 2) {
-        params.model = argv[1];
-    }
-
-    if (argc >= 3) {
-        n_junk = std::stoi(argv[2]);
-    }
-
-    if (argc >= 4) {
-        n_grp = std::stoi(argv[3]);
-    }
-
-    if (argc >= 5) {
-        i_pos = std::stoi(argv[4]);
-    }
-
-    if (argc >= 6) {
-        seed = std::stoi(argv[5]);
-    }
-
-    if (seed == -1) {
-        seed = time(NULL);
-    }
-
-    srand(seed);
-
-    if (i_pos == -1) {
-        i_pos = rand() % n_junk;
-    }
-
-    const std::string prompt_prefix = "There is an important info hidden inside a lot of irrelevant text. Find it and memorize them. I will quiz you about the important information there.";
-    const std::string prompt_suffix = " What is the pass key? The pass key is";
-
-    // generate junk text
-    params.prompt = prompt_prefix;
-
-    const int passkey = rand() % 50000 + 1;
-
-    for (int i = 0; i < n_junk; i++) {
-        if (i % n_junk == i_pos) {
-            params.prompt += " The pass key is " + std::to_string(passkey) + ". Remember it. " + std::to_string(passkey) + " is the pass key.";
-        }
-
-        params.prompt += " The grass is green. The sky is blue. The sun is yellow. Here we go. There and back again.";
-    }
-
-    params.prompt += prompt_suffix;
-
-    // init LLM
-
-    llama_backend_init(params.numa);
-
-    // initialize the model
-
-    llama_model_params model_params = llama_model_default_params();
-
-    model_params.n_gpu_layers = 99; // offload all layers to the GPU
-
-    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
-
-    if (model == NULL) {
-        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
-        return 1;
-    }
-
-    // initialize the context
-
-    llama_context_params ctx_params = llama_context_default_params();
-
-    ctx_params.seed    = seed;
-    ctx_params.n_ctx   = llama_n_ctx_train(model)*n_grp + n_keep;
-    ctx_params.n_batch = 512;
-    ctx_params.n_threads       = params.n_threads;
-    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
-
-    GGML_ASSERT(ctx_params.n_batch % n_grp == 0 && "n_batch must be divisible by n_grp");
-
-    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
-
-    if (ctx == NULL) {
-        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
-        return 1;
-    }
-
-    // tokenize the prompt
-    std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize(ctx, params.prompt, true);
-
-    // tokenize the prefix and use it as a sink
-    const int n_tokens_prefix = ::llama_tokenize(ctx, prompt_prefix, true).size();
-
-    const int n_tokens_all = tokens_list.size();
-
-    // we leave a margin of 16 tokens for the generated text - it should contain just the passkey
-    const int n_predict = 16;
-
-    // total length of the sequences including the prompt
-    const int n_len = n_tokens_all + n_predict;
-
-    const int n_ctx       = llama_n_ctx(ctx) - n_keep;
-    const int n_kv_req    = llama_n_ctx(ctx);
-    const int n_batch     = ctx_params.n_batch;
-    const int n_batch_grp = ctx_params.n_batch/n_grp;
-
-    LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch);
-
-    // print the prompt token-by-token
-
-    LOG_TEE("\n");
-    LOG_TEE("prefix tokens: %d\n", n_tokens_prefix);
-    LOG_TEE("prompt tokens: %d\n", n_tokens_all);
-    //LOG_TEE("prompt: %s\n", params.prompt.c_str());
-
-    llama_batch batch = llama_batch_init(512, 0, 1);
-
-    int n_past = 0;
-
-    // fill the KV cache
-    for (int i = 0; i < n_ctx; i += n_batch) {
-        if (i > 0 && n_grp > 1) {
-            // if SelfExtend is enabled, we compress the position from the last batch by a factor of n_grp
-            const int ib = i/n_batch - 1;
-            const int bd = n_batch_grp*(n_grp - 1);
-
-            llama_kv_cache_seq_shift(ctx, 0, n_past - n_batch,         n_past,         ib*bd);
-            llama_kv_cache_seq_div  (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);
-
-            n_past -= bd;
-        }
-
-        llama_batch_clear(batch);
-
-        for (int j = 0; j < n_batch && i + j < n_tokens_all; j++) {
-            llama_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false);
-        }
-
-        if (i + n_batch >= n_tokens_all) {
-            batch.logits[batch.n_tokens - 1] = true;
-        }
-
-        if (llama_decode(ctx, batch) != 0) {
-            LOG_TEE("%s: llama_decode() failed\n", __func__);
-            return 1;
-        }
-
-        LOG_TEE("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
-
-        if (i + n_batch >= n_tokens_all) {
-            break;
-        }
-    }
-
-    for (int i = n_ctx; i < n_tokens_all; i += n_batch) {
-        const int n_discard = n_batch;
-
-        LOG_TEE("%s: shifting KV cache with %d\n", __func__, n_discard);
-
-        llama_kv_cache_seq_rm   (ctx, 0, n_keep            , n_keep + n_discard);
-        llama_kv_cache_seq_shift(ctx, 0, n_keep + n_discard, n_ctx,  -n_discard);
-
-        n_past -= n_discard;
-
-        llama_batch_clear(batch);
-
-        for (int j = 0; j < n_batch && i + j < n_tokens_all; j++) {
-            llama_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false);
-        }
-
-        if (i + n_batch >= n_tokens_all) {
-            batch.logits[batch.n_tokens - 1] = true;
-        }
-
-        if (llama_decode(ctx, batch) != 0) {
-            LOG_TEE("%s: llama_decode() failed\n", __func__);
-            return 1;
-        }
-
-        LOG_TEE("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
-    }
-
-    {
-        const int n_discard = n_past - n_ctx + n_predict;
-
-        if (n_discard > 0) {
-            LOG_TEE("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard);
-
-            llama_kv_cache_seq_rm   (ctx, 0, n_keep            , n_keep + n_discard);
-            llama_kv_cache_seq_shift(ctx, 0, n_keep + n_discard, n_ctx,  -n_discard);
-
-            n_past -= n_discard;
-        }
-    }
-
-    LOG_TEE("\n");
-    LOG_TEE("%s: passkey = %d, inserted at position %d / %d (token pos: ~%d)\n", __func__, passkey, i_pos, n_junk, (i_pos * n_tokens_all) / n_junk);
-    LOG_TEE("\n");
-
-    // main loop
-
-    int n_cur    = n_tokens_all;
-    int n_decode = 0;
-
-    LOG_TEE("%s", prompt_suffix.c_str());
-    fflush(stdout);
-
-    const auto t_main_start = ggml_time_us();
-
-    while (n_cur <= n_len) {
-        // sample the next token
-        {
-            auto   n_vocab = llama_n_vocab(model);
-            auto * logits  = llama_get_logits_ith(ctx, batch.n_tokens - 1);
-
-            std::vector<llama_token_data> candidates;
-            candidates.reserve(n_vocab);
-
-            for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-                candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
-            }
-
-            llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-
-            // sample the most likely token
-            const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
-
-            // is it an end of stream?
-            if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
-                LOG_TEE("\n");
-
-                break;
-            }
-
-            LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str());
-            fflush(stdout);
-
-            n_decode += 1;
-
-            // prepare the next batch
-            llama_batch_clear(batch);
-
-            // push this new token for next evaluation
-            llama_batch_add(batch, new_token_id, n_past++, { 0 }, true);
-        }
-
-        n_cur += 1;
-
-        // evaluate the current batch with the transformer model
-        if (llama_decode(ctx, batch)) {
-            fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
-            return 1;
-        }
-    }
-
-    LOG_TEE("\n");
-
-    const auto t_main_end = ggml_time_us();
-
-    LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
-            __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
-
-    llama_print_timings(ctx);
-
-    fprintf(stderr, "\n");
-
-    llama_batch_free(batch);
-
-    llama_free(ctx);
-    llama_free_model(model);
-
-    llama_backend_free();
-
-    return 0;
-}
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -18,7 +18,6 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
    { "Q5_0",   LLAMA_FTYPE_MOSTLY_Q5_0,   " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", },
    { "Q5_1",   LLAMA_FTYPE_MOSTLY_Q5_1,   " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
    { "Q2_K",   LLAMA_FTYPE_MOSTLY_Q2_K,   " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
-    { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
    { "Q3_K",   LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
    { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
    { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", },
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -23,8 +23,6 @@ Command line options:
 -   `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`.
 -   `--port`: Set the port to listen. Default: `8080`.
 -   `--path`: path from which to serve static files (default examples/server/public)
-   `--api-key`: Set an api key for request authorization. By default the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys.
-   `--api-key-file`: path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`'s.
 -   `--embedding`: Enable embedding extraction, Default: disabled.
 -   `-np N`, `--parallel N`: Set the number of slots for process requests (default: 1)
 -   `-cb`, `--cont-batching`: enable continuous batching (a.k.a dynamic batching) (default: disabled)
@@ -111,10 +109,6 @@ node index.js
 ```

 ## API Endpoints
- **GET** `/health`: Returns the current state of the server:
-    - `{"status": "loading model"}` if the model is still being loaded.
-    - `{"status": "error"}` if the model failed to load.
-    - `{"status": "ok"}` if the model is successfully loaded and the server is ready for further requests mentioned below.

 -   **POST** `/completion`: Given a `prompt`, it returns the predicted completion.

@@ -174,51 +168,42 @@ node index.js

    `image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:`. In this case, `[img-12]` will be replaced by the embeddings of the image with id `12` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.

+    *Result JSON:*
+
+    Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion.
+
+    `content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string.
+
+    `stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options)
+
+    `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`
+
+    `model`: The path to the model loaded with `-m`
+
+    `prompt`: The provided `prompt`
+
+    `stopped_eos`: Indicating whether the completion has stopped because it encountered the EOS token
+
+    `stopped_limit`: Indicating whether the completion stopped because `n_predict` tokens were generated before stop words or EOS was encountered
+
+    `stopped_word`: Indicating whether the completion stopped due to encountering a stopping word from `stop` JSON array provided
+
+    `stopping_word`: The stopping word encountered which stopped the generation (or "" if not stopped due to a stopping word)
+
+    `timings`: Hash of timing information about the completion such as the number of tokens `predicted_per_second`
+
+    `tokens_cached`: Number of tokens from the prompt which could be re-used from previous completion (`n_past`)
+
+    `tokens_evaluated`: Number of tokens evaluated in total from the prompt
+
+    `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the context size (`n_ctx`)
+
    `slot_id`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot (default: -1)

    `cache_prompt`: Save the prompt and generation for avoid reprocess entire prompt if a part of this isn't change (default: false)

    `system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)

-### Result JSON:
-
-* Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion.
-
-
- `completion_probabilities`: An array of token probabilities for each completion. The array's length is `n_predict`. Each item in the array has the following structure:
-
-```
-{
-  "content": "<the token selected by the model>",
-  "probs": [
-    {
-      "prob": float,
-      "tok_str": "<most likely token>"
-    },
-    {
-      "prob": float,
-      "tok_str": "<second most likely tonen>"
-    },
-    ...
-  ]
-},
-```
-Notice that each `probs` is an array of length `n_probs`.
-
- `content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string.
- `stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options)
- `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`
- `model`: The path to the model loaded with `-m`
- `prompt`: The provided `prompt`
- `stopped_eos`: Indicating whether the completion has stopped because it encountered the EOS token
- `stopped_limit`: Indicating whether the completion stopped because `n_predict` tokens were generated before stop words or EOS was encountered
- `stopped_word`: Indicating whether the completion stopped due to encountering a stopping word from `stop` JSON array provided
- `stopping_word`: The stopping word encountered which stopped the generation (or "" if not stopped due to a stopping word)
- `timings`: Hash of timing information about the completion such as the number of tokens `predicted_per_second`
- `tokens_cached`: Number of tokens from the prompt which could be re-used from previous completion (`n_past`)
- `tokens_evaluated`: Number of tokens evaluated in total from the prompt
- `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the context size (`n_ctx`)
-
 -   **POST** `/tokenize`: Tokenize a given text.

    *Options:*
--- a/examples/server/completion.js.hpp
+++ b/examples/server/completion.js.hpp
@@ -74,376 +74,355 @@ unsigned char completion_js[] = {
  0x6f, 0x6e, 0x2f, 0x6a, 0x73, 0x6f, 0x6e, 0x27, 0x2c, 0x0a, 0x20, 0x20,
  0x20, 0x20, 0x20, 0x20, 0x27, 0x41, 0x63, 0x63, 0x65, 0x70, 0x74, 0x27,
  0x3a, 0x20, 0x27, 0x74, 0x65, 0x78, 0x74, 0x2f, 0x65, 0x76, 0x65, 0x6e,
-  0x74, 0x2d, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x27, 0x2c, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x28, 0x70, 0x61, 0x72,
-  0x61, 0x6d, 0x73, 0x2e, 0x61, 0x70, 0x69, 0x5f, 0x6b, 0x65, 0x79, 0x20,
-  0x3f, 0x20, 0x7b, 0x27, 0x41, 0x75, 0x74, 0x68, 0x6f, 0x72, 0x69, 0x7a,
-  0x61, 0x74, 0x69, 0x6f, 0x6e, 0x27, 0x3a, 0x20, 0x60, 0x42, 0x65, 0x61,
-  0x72, 0x65, 0x72, 0x20, 0x24, 0x7b, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73,
-  0x2e, 0x61, 0x70, 0x69, 0x5f, 0x6b, 0x65, 0x79, 0x7d, 0x60, 0x7d, 0x20,
-  0x3a, 0x20, 0x7b, 0x7d, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x2c,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x3a,
-  0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e,
-  0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x2c, 0x0a, 0x20, 0x20, 0x7d, 0x29,
-  0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72,
-  0x65, 0x61, 0x64, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x72, 0x65, 0x73, 0x70,
-  0x6f, 0x6e, 0x73, 0x65, 0x2e, 0x62, 0x6f, 0x64, 0x79, 0x2e, 0x67, 0x65,
-  0x74, 0x52, 0x65, 0x61, 0x64, 0x65, 0x72, 0x28, 0x29, 0x3b, 0x0a, 0x20,
-  0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x64, 0x65, 0x63, 0x6f, 0x64,
-  0x65, 0x72, 0x20, 0x3d, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x54, 0x65, 0x78,
-  0x74, 0x44, 0x65, 0x63, 0x6f, 0x64, 0x65, 0x72, 0x28, 0x29, 0x3b, 0x0a,
-  0x0a, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65,
-  0x6e, 0x74, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20, 0x20, 0x6c,
-  0x65, 0x74, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72, 0x20,
-  0x3d, 0x20, 0x22, 0x22, 0x3b, 0x20, 0x2f, 0x2f, 0x20, 0x42, 0x75, 0x66,
-  0x66, 0x65, 0x72, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x70, 0x61, 0x72, 0x74,
-  0x69, 0x61, 0x6c, 0x6c, 0x79, 0x20, 0x72, 0x65, 0x61, 0x64, 0x20, 0x6c,
-  0x69, 0x6e, 0x65, 0x73, 0x0a, 0x0a, 0x20, 0x20, 0x74, 0x72, 0x79, 0x20,
-  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f,
-  0x6e, 0x74, 0x20, 0x3d, 0x20, 0x74, 0x72, 0x75, 0x65, 0x3b, 0x0a, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x63,
-  0x6f, 0x6e, 0x74, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c,
-  0x74, 0x20, 0x3d, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x72, 0x65,
-  0x61, 0x64, 0x65, 0x72, 0x2e, 0x72, 0x65, 0x61, 0x64, 0x28, 0x29, 0x3b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72,
-  0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x6f, 0x6e, 0x65, 0x29, 0x20,
-  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x72,
-  0x65, 0x61, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d,
-  0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x41,
-  0x64, 0x64, 0x20, 0x61, 0x6e, 0x79, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f,
-  0x76, 0x65, 0x72, 0x20, 0x64, 0x61, 0x74, 0x61, 0x20, 0x74, 0x6f, 0x20,
-  0x74, 0x68, 0x65, 0x20, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x20,
-  0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x64, 0x61, 0x74,
-  0x61, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73,
-  0x74, 0x20, 0x74, 0x65, 0x78, 0x74, 0x20, 0x3d, 0x20, 0x6c, 0x65, 0x66,
-  0x74, 0x6f, 0x76, 0x65, 0x72, 0x20, 0x2b, 0x20, 0x64, 0x65, 0x63, 0x6f,
-  0x64, 0x65, 0x72, 0x2e, 0x64, 0x65, 0x63, 0x6f, 0x64, 0x65, 0x28, 0x72,
-  0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29,
-  0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20,
-  0x43, 0x68, 0x65, 0x63, 0x6b, 0x20, 0x69, 0x66, 0x20, 0x74, 0x68, 0x65,
-  0x20, 0x6c, 0x61, 0x73, 0x74, 0x20, 0x63, 0x68, 0x61, 0x72, 0x61, 0x63,
-  0x74, 0x65, 0x72, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x6c, 0x69, 0x6e,
-  0x65, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, 0x6e, 0x64, 0x73,
-  0x57, 0x69, 0x74, 0x68, 0x4c, 0x69, 0x6e, 0x65, 0x42, 0x72, 0x65, 0x61,
-  0x6b, 0x20, 0x3d, 0x20, 0x74, 0x65, 0x78, 0x74, 0x2e, 0x65, 0x6e, 0x64,
-  0x73, 0x57, 0x69, 0x74, 0x68, 0x28, 0x27, 0x5c, 0x6e, 0x27, 0x29, 0x3b,
-  0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x53,
-  0x70, 0x6c, 0x69, 0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x74, 0x65, 0x78,
-  0x74, 0x20, 0x69, 0x6e, 0x74, 0x6f, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x6c,
-  0x69, 0x6e, 0x65, 0x73, 0x20, 0x3d, 0x20, 0x74, 0x65, 0x78, 0x74, 0x2e,
-  0x73, 0x70, 0x6c, 0x69, 0x74, 0x28, 0x27, 0x5c, 0x6e, 0x27, 0x29, 0x3b,
-  0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x49,
-  0x66, 0x20, 0x74, 0x68, 0x65, 0x20, 0x74, 0x65, 0x78, 0x74, 0x20, 0x64,
-  0x6f, 0x65, 0x73, 0x6e, 0x27, 0x74, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77,
-  0x69, 0x74, 0x68, 0x20, 0x61, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x20, 0x62,
-  0x72, 0x65, 0x61, 0x6b, 0x2c, 0x20, 0x74, 0x68, 0x65, 0x6e, 0x20, 0x74,
-  0x68, 0x65, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x20, 0x6c, 0x69, 0x6e, 0x65,
-  0x20, 0x69, 0x73, 0x20, 0x69, 0x6e, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65,
-  0x74, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20,
-  0x53, 0x74, 0x6f, 0x72, 0x65, 0x20, 0x69, 0x74, 0x20, 0x69, 0x6e, 0x20,
-  0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72, 0x20, 0x74, 0x6f, 0x20,
-  0x62, 0x65, 0x20, 0x61, 0x64, 0x64, 0x65, 0x64, 0x20, 0x74, 0x6f, 0x20,
-  0x74, 0x68, 0x65, 0x20, 0x6e, 0x65, 0x78, 0x74, 0x20, 0x63, 0x68, 0x75,
-  0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x64, 0x61, 0x74, 0x61, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x65, 0x6e,
-  0x64, 0x73, 0x57, 0x69, 0x74, 0x68, 0x4c, 0x69, 0x6e, 0x65, 0x42, 0x72,
-  0x65, 0x61, 0x6b, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72, 0x20,
-  0x3d, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73, 0x2e, 0x70, 0x6f, 0x70, 0x28,
-  0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x65,
-  0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72, 0x20, 0x3d,
-  0x20, 0x22, 0x22, 0x3b, 0x20, 0x2f, 0x2f, 0x20, 0x52, 0x65, 0x73, 0x65,
-  0x74, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72, 0x20, 0x69,
-  0x66, 0x20, 0x77, 0x65, 0x20, 0x68, 0x61, 0x76, 0x65, 0x20, 0x61, 0x20,
-  0x6c, 0x69, 0x6e, 0x65, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x20, 0x61,
-  0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x65, 0x6e, 0x64, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x2f, 0x2f, 0x20, 0x50, 0x61, 0x72, 0x73, 0x65, 0x20, 0x61, 0x6c,
-  0x6c, 0x20, 0x73, 0x73, 0x65, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x73,
-  0x20, 0x61, 0x6e, 0x64, 0x20, 0x61, 0x64, 0x64, 0x20, 0x74, 0x68, 0x65,
-  0x6d, 0x20, 0x74, 0x6f, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
-  0x72, 0x65, 0x67, 0x65, 0x78, 0x20, 0x3d, 0x20, 0x2f, 0x5e, 0x28, 0x5c,
-  0x53, 0x2b, 0x29, 0x3a, 0x5c, 0x73, 0x28, 0x2e, 0x2a, 0x29, 0x24, 0x2f,
-  0x67, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f,
-  0x72, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x69, 0x6e,
-  0x65, 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73, 0x29, 0x20,
-  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f,
-  0x6e, 0x73, 0x74, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x20, 0x3d, 0x20,
-  0x72, 0x65, 0x67, 0x65, 0x78, 0x2e, 0x65, 0x78, 0x65, 0x63, 0x28, 0x6c,
-  0x69, 0x6e, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x29,
-  0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x5b, 0x6d, 0x61, 0x74, 0x63,
-  0x68, 0x5b, 0x31, 0x5d, 0x5d, 0x20, 0x3d, 0x20, 0x6d, 0x61, 0x74, 0x63,
-  0x68, 0x5b, 0x32, 0x5d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x73, 0x69, 0x6e, 0x63, 0x65, 0x20,
-  0x77, 0x65, 0x20, 0x6b, 0x6e, 0x6f, 0x77, 0x20, 0x74, 0x68, 0x69, 0x73,
-  0x20, 0x69, 0x73, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70,
-  0x70, 0x2c, 0x20, 0x6c, 0x65, 0x74, 0x27, 0x73, 0x20, 0x6a, 0x75, 0x73,
-  0x74, 0x20, 0x64, 0x65, 0x63, 0x6f, 0x64, 0x65, 0x20, 0x74, 0x68, 0x65,
-  0x20, 0x6a, 0x73, 0x6f, 0x6e, 0x20, 0x69, 0x6e, 0x20, 0x64, 0x61, 0x74,
-  0x61, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64,
-  0x61, 0x74, 0x61, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c,
-  0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x4a, 0x53, 0x4f,
-  0x4e, 0x2e, 0x70, 0x61, 0x72, 0x73, 0x65, 0x28, 0x72, 0x65, 0x73, 0x75,
-  0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x3b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f,
-  0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x2b, 0x3d, 0x20, 0x72, 0x65, 0x73,
-  0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e,
-  0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x79, 0x69,
-  0x65, 0x6c, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x79, 0x69, 0x65, 0x6c, 0x64, 0x20, 0x72, 0x65,
-  0x73, 0x75, 0x6c, 0x74, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x69, 0x66,
-  0x20, 0x77, 0x65, 0x20, 0x67, 0x6f, 0x74, 0x20, 0x61, 0x20, 0x73, 0x74,
-  0x6f, 0x70, 0x20, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x20, 0x66, 0x72, 0x6f,
-  0x6d, 0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2c, 0x20, 0x77, 0x65,
-  0x20, 0x77, 0x69, 0x6c, 0x6c, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x20,
-  0x68, 0x65, 0x72, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73,
-  0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x73, 0x74, 0x6f,
-  0x70, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72,
-  0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x67,
-  0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65,
-  0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e,
-  0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x3d, 0x20,
-  0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e,
-  0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73,
-  0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x3b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x66, 0x61,
-  0x6c, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b,
-  0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74,
-  0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65,
-  0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x20, 0x3d,
-  0x20, 0x4a, 0x53, 0x4f, 0x4e, 0x2e, 0x70, 0x61, 0x72, 0x73, 0x65, 0x28,
-  0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72,
-  0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x65,
-  0x72, 0x72, 0x6f, 0x72, 0x28, 0x60, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e,
-  0x63, 0x70, 0x70, 0x20, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x3a, 0x20, 0x24,
-  0x7b, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f,
-  0x72, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x7d, 0x60, 0x29,
-  0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x7d, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x63, 0x61, 0x74, 0x63, 0x68, 0x20,
-  0x28, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66,
-  0x20, 0x28, 0x65, 0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x20, 0x21, 0x3d, 0x3d,
-  0x20, 0x27, 0x41, 0x62, 0x6f, 0x72, 0x74, 0x45, 0x72, 0x72, 0x6f, 0x72,
-  0x27, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63,
-  0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72,
-  0x28, 0x22, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x20, 0x65, 0x72, 0x72, 0x6f,
-  0x72, 0x3a, 0x20, 0x22, 0x2c, 0x20, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x72, 0x6f,
-  0x77, 0x20, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x66,
-  0x69, 0x6e, 0x61, 0x6c, 0x6c, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e,
-  0x61, 0x62, 0x6f, 0x72, 0x74, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d,
-  0x0a, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x63,
-  0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f,
-  0x2f, 0x20, 0x43, 0x61, 0x6c, 0x6c, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61,
-  0x2c, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x61, 0x6e, 0x20,
-  0x65, 0x76, 0x65, 0x6e, 0x74, 0x20, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74,
-  0x20, 0x74, 0x68, 0x61, 0x74, 0x20, 0x79, 0x6f, 0x75, 0x20, 0x63, 0x61,
-  0x6e, 0x20, 0x73, 0x75, 0x62, 0x73, 0x63, 0x72, 0x69, 0x62, 0x65, 0x20,
-  0x74, 0x6f, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x45, 0x78, 0x61,
-  0x6d, 0x70, 0x6c, 0x65, 0x3a, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20,
-  0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x7b, 0x20,
-  0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61,
-  0x72, 0x67, 0x65, 0x74, 0x20, 0x7d, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20,
-  0x27, 0x2f, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e,
-  0x2e, 0x6a, 0x73, 0x27, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20,
-  0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x6e,
-  0x20, 0x3d, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x45, 0x76, 0x65, 0x6e,
-  0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x28, 0x70, 0x72, 0x6f, 0x6d,
-  0x70, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f,
-  0x6e, 0x6e, 0x2e, 0x61, 0x64, 0x64, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x4c,
-  0x69, 0x73, 0x74, 0x65, 0x6e, 0x65, 0x72, 0x28, 0x22, 0x6d, 0x65, 0x73,
-  0x73, 0x61, 0x67, 0x65, 0x22, 0x2c, 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e,
-  0x6b, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x2f, 0x2f, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74,
-  0x2e, 0x77, 0x72, 0x69, 0x74, 0x65, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b,
-  0x2e, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x2e, 0x63, 0x6f, 0x6e, 0x74,
-  0x65, 0x6e, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x7d,
-  0x29, 0x0a, 0x2f, 0x2f, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20,
-  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x45,
-  0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x3d,
-  0x20, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61,
-  0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x7b, 0x7d, 0x2c, 0x20, 0x63,
-  0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20, 0x3d, 0x20, 0x7b, 0x7d, 0x29, 0x20,
-  0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
-  0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74,
-  0x20, 0x3d, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x45, 0x76, 0x65, 0x6e, 0x74,
-  0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20,
-  0x28, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e,
-  0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63,
+  0x74, 0x2d, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x27, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x7d, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x69, 0x67,
+  0x6e, 0x61, 0x6c, 0x3a, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c,
+  0x6c, 0x65, 0x72, 0x2e, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x2c, 0x0a,
+  0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x63, 0x6f, 0x6e,
+  0x73, 0x74, 0x20, 0x72, 0x65, 0x61, 0x64, 0x65, 0x72, 0x20, 0x3d, 0x20,
+  0x72, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x2e, 0x62, 0x6f, 0x64,
+  0x79, 0x2e, 0x67, 0x65, 0x74, 0x52, 0x65, 0x61, 0x64, 0x65, 0x72, 0x28,
+  0x29, 0x3b, 0x0a, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x64,
+  0x65, 0x63, 0x6f, 0x64, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x6e, 0x65, 0x77,
+  0x20, 0x54, 0x65, 0x78, 0x74, 0x44, 0x65, 0x63, 0x6f, 0x64, 0x65, 0x72,
+  0x28, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63,
  0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61,
-  0x69, 0x74, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68,
-  0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61,
-  0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72,
-  0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x29,
-  0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66,
-  0x20, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61,
-  0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x2b, 0x3d, 0x20, 0x63,
-  0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f,
-  0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67,
-  0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68, 0x45,
-  0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, 0x73,
-  0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, 0x6d, 0x65,
-  0x73, 0x73, 0x61, 0x67, 0x65, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65,
-  0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e,
-  0x64, 0x61, 0x74, 0x61, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x69, 0x66, 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61,
+  0x0a, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f,
+  0x76, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x20, 0x2f, 0x2f,
+  0x20, 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x20, 0x66, 0x6f, 0x72, 0x20,
+  0x70, 0x61, 0x72, 0x74, 0x69, 0x61, 0x6c, 0x6c, 0x79, 0x20, 0x72, 0x65,
+  0x61, 0x64, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73, 0x0a, 0x0a, 0x20, 0x20,
+  0x74, 0x72, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65,
+  0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x74, 0x72, 0x75,
+  0x65, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x77, 0x68, 0x69, 0x6c,
+  0x65, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x29, 0x20, 0x7b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72,
+  0x65, 0x73, 0x75, 0x6c, 0x74, 0x20, 0x3d, 0x20, 0x61, 0x77, 0x61, 0x69,
+  0x74, 0x20, 0x72, 0x65, 0x61, 0x64, 0x65, 0x72, 0x2e, 0x72, 0x65, 0x61,
+  0x64, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69,
+  0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x6f,
+  0x6e, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x2f, 0x2f, 0x20, 0x41, 0x64, 0x64, 0x20, 0x61, 0x6e, 0x79, 0x20, 0x6c,
+  0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72, 0x20, 0x64, 0x61, 0x74, 0x61,
+  0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x75, 0x72, 0x72,
+  0x65, 0x6e, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66,
+  0x20, 0x64, 0x61, 0x74, 0x61, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x74, 0x65, 0x78, 0x74, 0x20, 0x3d,
+  0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72, 0x20, 0x2b, 0x20,
+  0x64, 0x65, 0x63, 0x6f, 0x64, 0x65, 0x72, 0x2e, 0x64, 0x65, 0x63, 0x6f,
+  0x64, 0x65, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x76, 0x61,
+  0x6c, 0x75, 0x65, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x2f, 0x2f, 0x20, 0x43, 0x68, 0x65, 0x63, 0x6b, 0x20, 0x69, 0x66,
+  0x20, 0x74, 0x68, 0x65, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x20, 0x63, 0x68,
+  0x61, 0x72, 0x61, 0x63, 0x74, 0x65, 0x72, 0x20, 0x69, 0x73, 0x20, 0x61,
+  0x20, 0x6c, 0x69, 0x6e, 0x65, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x0a,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
+  0x65, 0x6e, 0x64, 0x73, 0x57, 0x69, 0x74, 0x68, 0x4c, 0x69, 0x6e, 0x65,
+  0x42, 0x72, 0x65, 0x61, 0x6b, 0x20, 0x3d, 0x20, 0x74, 0x65, 0x78, 0x74,
+  0x2e, 0x65, 0x6e, 0x64, 0x73, 0x57, 0x69, 0x74, 0x68, 0x28, 0x27, 0x5c,
+  0x6e, 0x27, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x2f, 0x2f, 0x20, 0x53, 0x70, 0x6c, 0x69, 0x74, 0x20, 0x74, 0x68, 0x65,
+  0x20, 0x74, 0x65, 0x78, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x6f, 0x20, 0x6c,
+  0x69, 0x6e, 0x65, 0x73, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c,
+  0x65, 0x74, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73, 0x20, 0x3d, 0x20, 0x74,
+  0x65, 0x78, 0x74, 0x2e, 0x73, 0x70, 0x6c, 0x69, 0x74, 0x28, 0x27, 0x5c,
+  0x6e, 0x27, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x2f, 0x2f, 0x20, 0x49, 0x66, 0x20, 0x74, 0x68, 0x65, 0x20, 0x74, 0x65,
+  0x78, 0x74, 0x20, 0x64, 0x6f, 0x65, 0x73, 0x6e, 0x27, 0x74, 0x20, 0x65,
+  0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x61, 0x20, 0x6c, 0x69,
+  0x6e, 0x65, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x2c, 0x20, 0x74, 0x68,
+  0x65, 0x6e, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x20,
+  0x6c, 0x69, 0x6e, 0x65, 0x20, 0x69, 0x73, 0x20, 0x69, 0x6e, 0x63, 0x6f,
+  0x6d, 0x70, 0x6c, 0x65, 0x74, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x2f, 0x2f, 0x20, 0x53, 0x74, 0x6f, 0x72, 0x65, 0x20, 0x69, 0x74,
+  0x20, 0x69, 0x6e, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72,
+  0x20, 0x74, 0x6f, 0x20, 0x62, 0x65, 0x20, 0x61, 0x64, 0x64, 0x65, 0x64,
+  0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6e, 0x65, 0x78, 0x74,
+  0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x64, 0x61,
+  0x74, 0x61, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20,
+  0x28, 0x21, 0x65, 0x6e, 0x64, 0x73, 0x57, 0x69, 0x74, 0x68, 0x4c, 0x69,
+  0x6e, 0x65, 0x42, 0x72, 0x65, 0x61, 0x6b, 0x29, 0x20, 0x7b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f,
+  0x76, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73, 0x2e,
+  0x70, 0x6f, 0x70, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76,
+  0x65, 0x72, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x20, 0x2f, 0x2f, 0x20,
+  0x52, 0x65, 0x73, 0x65, 0x74, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76,
+  0x65, 0x72, 0x20, 0x69, 0x66, 0x20, 0x77, 0x65, 0x20, 0x68, 0x61, 0x76,
+  0x65, 0x20, 0x61, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x20, 0x62, 0x72, 0x65,
+  0x61, 0x6b, 0x20, 0x61, 0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x65, 0x6e,
+  0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x50, 0x61, 0x72, 0x73,
+  0x65, 0x20, 0x61, 0x6c, 0x6c, 0x20, 0x73, 0x73, 0x65, 0x20, 0x65, 0x76,
+  0x65, 0x6e, 0x74, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x61, 0x64, 0x64,
+  0x20, 0x74, 0x68, 0x65, 0x6d, 0x20, 0x74, 0x6f, 0x20, 0x72, 0x65, 0x73,
+  0x75, 0x6c, 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f,
+  0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x67, 0x65, 0x78, 0x20, 0x3d, 0x20,
+  0x2f, 0x5e, 0x28, 0x5c, 0x53, 0x2b, 0x29, 0x3a, 0x5c, 0x73, 0x28, 0x2e,
+  0x2a, 0x29, 0x24, 0x2f, 0x67, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74,
+  0x20, 0x6c, 0x69, 0x6e, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x69, 0x6e,
+  0x65, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6d, 0x61, 0x74, 0x63,
+  0x68, 0x20, 0x3d, 0x20, 0x72, 0x65, 0x67, 0x65, 0x78, 0x2e, 0x65, 0x78,
+  0x65, 0x63, 0x28, 0x6c, 0x69, 0x6e, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x6d, 0x61,
+  0x74, 0x63, 0x68, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x5b,
+  0x6d, 0x61, 0x74, 0x63, 0x68, 0x5b, 0x31, 0x5d, 0x5d, 0x20, 0x3d, 0x20,
+  0x6d, 0x61, 0x74, 0x63, 0x68, 0x5b, 0x32, 0x5d, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x73, 0x69,
+  0x6e, 0x63, 0x65, 0x20, 0x77, 0x65, 0x20, 0x6b, 0x6e, 0x6f, 0x77, 0x20,
+  0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x6c, 0x6c, 0x61, 0x6d,
+  0x61, 0x2e, 0x63, 0x70, 0x70, 0x2c, 0x20, 0x6c, 0x65, 0x74, 0x27, 0x73,
+  0x20, 0x6a, 0x75, 0x73, 0x74, 0x20, 0x64, 0x65, 0x63, 0x6f, 0x64, 0x65,
+  0x20, 0x74, 0x68, 0x65, 0x20, 0x6a, 0x73, 0x6f, 0x6e, 0x20, 0x69, 0x6e,
+  0x20, 0x64, 0x61, 0x74, 0x61, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75,
+  0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x20, 0x7b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72,
+  0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d,
+  0x20, 0x4a, 0x53, 0x4f, 0x4e, 0x2e, 0x70, 0x61, 0x72, 0x73, 0x65, 0x28,
+  0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29,
+  0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x2b, 0x3d,
+  0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61,
+  0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f,
+  0x2f, 0x20, 0x79, 0x69, 0x65, 0x6c, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x79, 0x69, 0x65, 0x6c,
+  0x64, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x3b, 0x0a, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f,
+  0x2f, 0x20, 0x69, 0x66, 0x20, 0x77, 0x65, 0x20, 0x67, 0x6f, 0x74, 0x20,
+  0x61, 0x20, 0x73, 0x74, 0x6f, 0x70, 0x20, 0x74, 0x6f, 0x6b, 0x65, 0x6e,
+  0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72,
+  0x2c, 0x20, 0x77, 0x65, 0x20, 0x77, 0x69, 0x6c, 0x6c, 0x20, 0x62, 0x72,
+  0x65, 0x61, 0x6b, 0x20, 0x68, 0x65, 0x72, 0x65, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20,
+  0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61,
+  0x2e, 0x73, 0x74, 0x6f, 0x70, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69,
+  0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61,
  0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f,
  0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20,
-  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76,
-  0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69,
-  0x73, 0x70, 0x61, 0x74, 0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28,
-  0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76,
-  0x65, 0x6e, 0x74, 0x28, 0x22, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74,
-  0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73,
-  0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a,
-  0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e,
-  0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73,
-  0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x7d, 0x29, 0x29, 0x3b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b,
-  0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74, 0x69, 0x6d, 0x69, 0x6e, 0x67,
-  0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74,
-  0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68, 0x45, 0x76, 0x65,
-  0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, 0x73, 0x74, 0x6f,
-  0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, 0x74, 0x69, 0x6d, 0x69,
-  0x6e, 0x67, 0x73, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74, 0x61,
-  0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61,
-  0x74, 0x61, 0x2e, 0x74, 0x69, 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x7d,
-  0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76,
-  0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69,
-  0x73, 0x70, 0x61, 0x74, 0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28,
-  0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76,
-  0x65, 0x6e, 0x74, 0x28, 0x22, 0x64, 0x6f, 0x6e, 0x65, 0x22, 0x2c, 0x20,
-  0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x7b, 0x20,
-  0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x7d, 0x20, 0x7d, 0x29,
-  0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x29, 0x28, 0x29, 0x3b, 0x0a, 0x20,
-  0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x65, 0x76, 0x65, 0x6e,
-  0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x3b, 0x0a, 0x7d, 0x0a, 0x0a,
-  0x2f, 0x2f, 0x20, 0x43, 0x61, 0x6c, 0x6c, 0x20, 0x6c, 0x6c, 0x61, 0x6d,
-  0x61, 0x2c, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x61, 0x20,
-  0x70, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x20, 0x74, 0x68, 0x61, 0x74,
-  0x20, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, 0x65, 0x73, 0x20, 0x74, 0x6f,
-  0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74,
-  0x65, 0x64, 0x20, 0x74, 0x65, 0x78, 0x74, 0x2e, 0x20, 0x54, 0x68, 0x69,
-  0x73, 0x20, 0x64, 0x6f, 0x65, 0x73, 0x20, 0x6e, 0x6f, 0x74, 0x20, 0x73,
-  0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x73, 0x74, 0x72, 0x65, 0x61,
-  0x6d, 0x69, 0x6e, 0x67, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x45,
-  0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x3a, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f,
-  0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x50,
-  0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70,
-  0x74, 0x29, 0x2e, 0x74, 0x68, 0x65, 0x6e, 0x28, 0x28, 0x63, 0x6f, 0x6e,
-  0x74, 0x65, 0x6e, 0x74, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x2f,
-  0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75,
-  0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69, 0x74, 0x65, 0x28, 0x63,
-  0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x6f, 0x72, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63,
-  0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x61, 0x77, 0x61,
-  0x69, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x6d,
-  0x69, 0x73, 0x65, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x0a,
-  0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d,
-  0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69, 0x74, 0x65, 0x28, 0x63, 0x6f,
-  0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x0a, 0x65, 0x78,
+  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61,
+  0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67,
+  0x73, 0x20, 0x3d, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64,
+  0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69,
+  0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x3b,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x20,
+  0x3d, 0x20, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62,
+  0x72, 0x65, 0x61, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x7d, 0x20,
+  0x63, 0x61, 0x74, 0x63, 0x68, 0x20, 0x28, 0x65, 0x29, 0x20, 0x7b, 0x0a,
+  0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x65, 0x2e, 0x6e, 0x61,
+  0x6d, 0x65, 0x20, 0x21, 0x3d, 0x3d, 0x20, 0x27, 0x41, 0x62, 0x6f, 0x72,
+  0x74, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x27, 0x29, 0x20, 0x7b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65,
+  0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x22, 0x6c, 0x6c, 0x61, 0x6d,
+  0x61, 0x20, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x3a, 0x20, 0x22, 0x2c, 0x20,
+  0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x65, 0x3b, 0x0a, 0x20,
+  0x20, 0x7d, 0x0a, 0x20, 0x20, 0x66, 0x69, 0x6e, 0x61, 0x6c, 0x6c, 0x79,
+  0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72,
+  0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x61, 0x62, 0x6f, 0x72, 0x74, 0x28,
+  0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x72, 0x65,
+  0x74, 0x75, 0x72, 0x6e, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74,
+  0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x43, 0x61, 0x6c, 0x6c,
+  0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x72, 0x65, 0x74, 0x75,
+  0x72, 0x6e, 0x20, 0x61, 0x6e, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x20,
+  0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x74, 0x68, 0x61, 0x74, 0x20,
+  0x79, 0x6f, 0x75, 0x20, 0x63, 0x61, 0x6e, 0x20, 0x73, 0x75, 0x62, 0x63,
+  0x72, 0x69, 0x62, 0x65, 0x20, 0x74, 0x6f, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f,
+  0x2f, 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x3a, 0x0a, 0x2f,
+  0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6f,
+  0x72, 0x74, 0x20, 0x7b, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x45, 0x76,
+  0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x7d, 0x20,
+  0x66, 0x72, 0x6f, 0x6d, 0x20, 0x27, 0x2f, 0x63, 0x6f, 0x6d, 0x70, 0x6c,
+  0x65, 0x74, 0x69, 0x6f, 0x6e, 0x2e, 0x6a, 0x73, 0x27, 0x0a, 0x2f, 0x2f,
+  0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
+  0x20, 0x63, 0x6f, 0x6e, 0x6e, 0x20, 0x3d, 0x20, 0x6c, 0x6c, 0x61, 0x6d,
+  0x61, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74,
+  0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20,
+  0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x6e, 0x2e, 0x61, 0x64, 0x64, 0x45,
+  0x76, 0x65, 0x6e, 0x74, 0x4c, 0x69, 0x73, 0x74, 0x65, 0x6e, 0x65, 0x72,
+  0x28, 0x22, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x22, 0x2c, 0x20,
+  0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b,
+  0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63,
+  0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69, 0x74, 0x65, 0x28,
+  0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c,
+  0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a, 0x2f, 0x2f,
+  0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x2f, 0x2f, 0x0a, 0x65, 0x78,
  0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c,
-  0x6c, 0x61, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x20,
-  0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70,
-  0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x7b, 0x7d, 0x2c, 0x20,
-  0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20, 0x3d, 0x20, 0x7b, 0x7d, 0x29,
-  0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75,
-  0x72, 0x6e, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x50, 0x72, 0x6f, 0x6d, 0x69,
-  0x73, 0x65, 0x28, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x72, 0x65,
-  0x73, 0x6f, 0x6c, 0x76, 0x65, 0x2c, 0x20, 0x72, 0x65, 0x6a, 0x65, 0x63,
-  0x74, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x6c, 0x61, 0x6d, 0x61, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72,
+  0x67, 0x65, 0x74, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70,
+  0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20,
+  0x7b, 0x7d, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20, 0x3d,
+  0x20, 0x7b, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20,
+  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54,
+  0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x3d, 0x20, 0x6e, 0x65, 0x77, 0x20,
+  0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x28,
+  0x29, 0x3b, 0x0a, 0x20, 0x20, 0x28, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20,
+  0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
  0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20,
-  0x3d, 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x72,
-  0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f,
+  0x3d, 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f,
  0x72, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x28, 0x63, 0x6f, 0x6e,
  0x73, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20,
  0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74,
  0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63, 0x6f,
  0x6e, 0x66, 0x69, 0x67, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b,
+  0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
  0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74,
  0x20, 0x2b, 0x3d, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61,
  0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, 0x65, 0x28, 0x63, 0x6f,
-  0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x7d, 0x20, 0x63, 0x61, 0x74, 0x63, 0x68, 0x20, 0x28, 0x65, 0x72, 0x72,
-  0x6f, 0x72, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x72, 0x65, 0x6a, 0x65, 0x63, 0x74, 0x28, 0x65, 0x72, 0x72, 0x6f, 0x72,
-  0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x7d,
-  0x29, 0x3b, 0x0a, 0x7d, 0x3b, 0x0a, 0x0a, 0x2f, 0x2a, 0x2a, 0x0a, 0x20,
-  0x2a, 0x20, 0x28, 0x64, 0x65, 0x70, 0x72, 0x65, 0x63, 0x61, 0x74, 0x65,
-  0x64, 0x29, 0x0a, 0x20, 0x2a, 0x2f, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72,
-  0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d,
-  0x61, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x65, 0x20, 0x3d, 0x20,
-  0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x70, 0x61, 0x72, 0x61, 0x6d,
-  0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65,
-  0x72, 0x2c, 0x20, 0x63, 0x61, 0x6c, 0x6c, 0x62, 0x61, 0x63, 0x6b, 0x29,
-  0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20,
-  0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74,
-  0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x6c,
-  0x61, 0x6d, 0x61, 0x28, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x70,
-  0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d,
-  0x73, 0x2c, 0x20, 0x7b, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c,
-  0x6c, 0x65, 0x72, 0x20, 0x7d, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x63, 0x61, 0x6c, 0x6c, 0x62, 0x61, 0x63, 0x6b, 0x28, 0x63,
-  0x68, 0x75, 0x6e, 0x6b, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x7d,
-  0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x47, 0x65, 0x74, 0x20, 0x74, 0x68, 0x65,
-  0x20, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x20, 0x69, 0x6e, 0x66, 0x6f, 0x20,
-  0x66, 0x72, 0x6f, 0x6d, 0x20, 0x74, 0x68, 0x65, 0x20, 0x73, 0x65, 0x72,
-  0x76, 0x65, 0x72, 0x2e, 0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73,
-  0x20, 0x75, 0x73, 0x65, 0x66, 0x75, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x20,
-  0x67, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x20, 0x74, 0x68, 0x65, 0x20,
-  0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x20, 0x77, 0x69, 0x6e, 0x64,
-  0x6f, 0x77, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x73, 0x6f, 0x20, 0x6f, 0x6e,
-  0x2e, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e,
-  0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x4d, 0x6f, 0x64, 0x65,
-  0x6c, 0x49, 0x6e, 0x66, 0x6f, 0x20, 0x3d, 0x20, 0x61, 0x73, 0x79, 0x6e,
-  0x63, 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20,
-  0x69, 0x66, 0x20, 0x28, 0x21, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74,
-  0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73,
-  0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x67, 0x65, 0x6e, 0x65,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e,
+  0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70,
+  0x61, 0x74, 0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65,
+  0x77, 0x20, 0x43, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e,
+  0x74, 0x28, 0x22, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x22, 0x2c,
+  0x20, 0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63,
+  0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x7d, 0x29,
+  0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x68, 0x75,
+  0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65,
  0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69,
-  0x6e, 0x67, 0x73, 0x20, 0x3d, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20,
-  0x66, 0x65, 0x74, 0x63, 0x68, 0x28, 0x22, 0x2f, 0x6d, 0x6f, 0x64, 0x65,
-  0x6c, 0x2e, 0x6a, 0x73, 0x6f, 0x6e, 0x22, 0x29, 0x2e, 0x74, 0x68, 0x65,
-  0x6e, 0x28, 0x72, 0x20, 0x3d, 0x3e, 0x20, 0x72, 0x2e, 0x6a, 0x73, 0x6f,
-  0x6e, 0x28, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20,
-  0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72,
-  0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e,
-  0x67, 0x73, 0x3b, 0x0a, 0x7d, 0x0a
+  0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67,
+  0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68, 0x45,
+  0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, 0x73,
+  0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, 0x67, 0x65,
+  0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74,
+  0x74, 0x69, 0x6e, 0x67, 0x73, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65,
+  0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e,
+  0x64, 0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74,
+  0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73,
+  0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28,
+  0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74,
+  0x69, 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54,
+  0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74,
+  0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20,
+  0x43, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28,
+  0x22, 0x74, 0x69, 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x22, 0x2c, 0x20, 0x7b,
+  0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, 0x75,
+  0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74, 0x69, 0x6d, 0x69,
+  0x6e, 0x67, 0x73, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67,
+  0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68, 0x45,
+  0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, 0x73,
+  0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, 0x64, 0x6f,
+  0x6e, 0x65, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69,
+  0x6c, 0x3a, 0x20, 0x7b, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74,
+  0x20, 0x7d, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x29,
+  0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e,
+  0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74,
+  0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x43, 0x61, 0x6c, 0x6c,
+  0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x72, 0x65, 0x74, 0x75,
+  0x72, 0x6e, 0x20, 0x61, 0x20, 0x70, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65,
+  0x20, 0x74, 0x68, 0x61, 0x74, 0x20, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76,
+  0x65, 0x73, 0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x6f,
+  0x6d, 0x70, 0x6c, 0x65, 0x74, 0x65, 0x64, 0x20, 0x74, 0x65, 0x78, 0x74,
+  0x2e, 0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x64, 0x6f, 0x65, 0x73, 0x20,
+  0x6e, 0x6f, 0x74, 0x20, 0x73, 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, 0x20,
+  0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x69, 0x6e, 0x67, 0x0a, 0x2f, 0x2f,
+  0x0a, 0x2f, 0x2f, 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x3a,
+  0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c,
+  0x6c, 0x61, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28,
+  0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x2e, 0x74, 0x68, 0x65, 0x6e,
+  0x28, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x20, 0x3d,
+  0x3e, 0x20, 0x7b, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72,
+  0x69, 0x74, 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29,
+  0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x2f,
+  0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x72, 0x0a,
+  0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f,
+  0x6e, 0x73, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20,
+  0x3d, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d,
+  0x61, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x70, 0x72, 0x6f,
+  0x6d, 0x70, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69,
+  0x74, 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a,
+  0x2f, 0x2f, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f,
+  0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x50, 0x72, 0x6f,
+  0x6d, 0x69, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x6d,
+  0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d,
+  0x20, 0x7b, 0x7d, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20,
+  0x3d, 0x20, 0x7b, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20,
+  0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x65, 0x77, 0x20,
+  0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x61, 0x73, 0x79, 0x6e,
+  0x63, 0x20, 0x28, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, 0x65, 0x2c, 0x20,
+  0x72, 0x65, 0x6a, 0x65, 0x63, 0x74, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e,
+  0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x74, 0x72, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74,
+  0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e,
+  0x6b, 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70,
+  0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d,
+  0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x29, 0x29, 0x20,
+  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f,
+  0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x2b, 0x3d, 0x20, 0x63, 0x68, 0x75,
+  0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74,
+  0x65, 0x6e, 0x74, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x6f, 0x6c,
+  0x76, 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x3b,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x63, 0x61, 0x74, 0x63, 0x68,
+  0x20, 0x28, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x20, 0x7b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x6a, 0x65, 0x63, 0x74, 0x28,
+  0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x7d, 0x0a, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x7d, 0x3b, 0x0a, 0x0a,
+  0x2f, 0x2a, 0x2a, 0x0a, 0x20, 0x2a, 0x20, 0x28, 0x64, 0x65, 0x70, 0x72,
+  0x65, 0x63, 0x61, 0x74, 0x65, 0x64, 0x29, 0x0a, 0x20, 0x2a, 0x2f, 0x0a,
+  0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
+  0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65,
+  0x74, 0x65, 0x20, 0x3d, 0x20, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28,
+  0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x74,
+  0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2c, 0x20, 0x63, 0x61, 0x6c, 0x6c,
+  0x62, 0x61, 0x63, 0x6b, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20,
+  0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x28,
+  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20,
+  0x6f, 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x61, 0x72,
+  0x61, 0x6d, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20,
+  0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x7b, 0x20, 0x63, 0x6f,
+  0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x20, 0x7d, 0x29, 0x29,
+  0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x61, 0x6c, 0x6c, 0x62,
+  0x61, 0x63, 0x6b, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x29, 0x3b, 0x0a,
+  0x20, 0x20, 0x7d, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x47, 0x65,
+  0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x20,
+  0x69, 0x6e, 0x66, 0x6f, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x74, 0x68,
+  0x65, 0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2e, 0x20, 0x54, 0x68,
+  0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x75, 0x73, 0x65, 0x66, 0x75, 0x6c,
+  0x20, 0x66, 0x6f, 0x72, 0x20, 0x67, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67,
+  0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74,
+  0x20, 0x77, 0x69, 0x6e, 0x64, 0x6f, 0x77, 0x20, 0x61, 0x6e, 0x64, 0x20,
+  0x73, 0x6f, 0x20, 0x6f, 0x6e, 0x2e, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72,
+  0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d,
+  0x61, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x49, 0x6e, 0x66, 0x6f, 0x20, 0x3d,
+  0x20, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e,
+  0x20, 0x7b, 0x0a, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x67, 0x65,
+  0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74,
+  0x74, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f,
+  0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x3d, 0x20, 0x61,
+  0x77, 0x61, 0x69, 0x74, 0x20, 0x66, 0x65, 0x74, 0x63, 0x68, 0x28, 0x22,
+  0x2f, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x2e, 0x6a, 0x73, 0x6f, 0x6e, 0x22,
+  0x29, 0x2e, 0x74, 0x68, 0x65, 0x6e, 0x28, 0x72, 0x20, 0x3d, 0x3e, 0x20,
+  0x72, 0x2e, 0x6a, 0x73, 0x6f, 0x6e, 0x28, 0x29, 0x29, 0x3b, 0x0a, 0x20,
+  0x20, 0x7d, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20,
+  0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73,
+  0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x3b, 0x0a, 0x7d, 0x0a
 };
-unsigned int completion_js_len = 5346;
+unsigned int completion_js_len = 5099;
--- a/examples/server/index.html.hpp
+++ b/examples/server/index.html.hpp
--- a/examples/server/index.js.hpp
+++ b/examples/server/index.js.hpp
--- a/examples/server/public/completion.js
+++ b/examples/server/public/completion.js
@@ -95,15 +95,6 @@ export async function* llama(prompt, params = {}, config = {}) {
              break;
            }
          }
-          if (result.error) {
-            result.error = JSON.parse(result.error);
-            if (result.error.content.includes('slot unavailable')) {
-              // Throw an error to be caught by upstream callers
-              throw new Error('slot unavailable');
-            } else {
-              console.error(`llama.cpp error: ${result.error.content}`);
-            }
-          }
          if (result.error) {
            result.error = JSON.parse(result.error);
            console.error(`llama.cpp error: ${result.error.content}`);
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@@ -427,7 +427,7 @@
        }

        if (data.timings) {
-          llamaStats.value = data;
+          llamaStats.value = data.timings;
        }
      }

@@ -880,7 +880,7 @@
      }
      return html`
        <span>
-          ${llamaStats.value.tokens_predicted} predicted, ${llamaStats.value.tokens_cached} cached, ${llamaStats.value.timings.predicted_per_token_ms.toFixed()}ms per token, ${llamaStats.value.timings.predicted_per_second.toFixed(2)} tokens per second
+          ${llamaStats.value.predicted_per_token_ms.toFixed()}ms per token, ${llamaStats.value.predicted_per_second.toFixed(2)} tokens per second
        </span>
      `
    }
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -26,7 +26,6 @@
 #include <mutex>
 #include <chrono>
 #include <condition_variable>
-#include <atomic>

 #ifndef SERVER_VERBOSE
 #define SERVER_VERBOSE 1
@@ -39,7 +38,7 @@ using json = nlohmann::json;
 struct server_params
 {
    std::string hostname = "127.0.0.1";
-    std::vector<std::string> api_keys;
+    std::string api_key;
    std::string public_path = "examples/server/public";
    int32_t port = 8080;
    int32_t read_timeout = 600;
@@ -147,15 +146,9 @@ static std::vector<uint8_t> base64_decode(const std::string & encoded_string)
 // parallel
 //

-enum server_state {
-    SERVER_STATE_LOADING_MODEL,  // Server is starting up, model not fully loaded yet
-    SERVER_STATE_READY,          // Server is ready and model is loaded
-    SERVER_STATE_ERROR           // An error occurred, load_model failed
-};
-
 enum task_type {
-    TASK_TYPE_COMPLETION,
-    TASK_TYPE_CANCEL,
+    COMPLETION_TASK,
+    CANCEL_TASK
 };

 struct task_server {
@@ -454,14 +447,8 @@ struct llama_client_slot
    }

    bool has_budget(gpt_params &global_params) {
-        if (params.n_predict == -1 && global_params.n_predict == -1)
-        {
-            return true; // limitless
-        }
-
        n_remaining = -1;
-
-        if (params.n_predict != -1)
+        if(params.n_predict != -1)
        {
            n_remaining = params.n_predict - n_decoded;
        }
@@ -469,8 +456,7 @@ struct llama_client_slot
        {
            n_remaining = global_params.n_predict - n_decoded;
        }
-
-        return n_remaining > 0; // no budget
+        return n_remaining > 0 || n_remaining == -1; // no budget || limitless
    }

    bool available() const {
@@ -1116,7 +1102,7 @@ struct llama_server_context
        }

        // check the limits
-        if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params))
+        if (slot.n_decoded > 2 && slot.has_next_token && !slot.has_budget(params))
        {
            slot.stopped_limit = true;
            slot.has_next_token = false;
@@ -1279,7 +1265,7 @@ struct llama_server_context
        {
            std::vector<completion_token_output> probs_output = {};
            const std::vector<llama_token> to_send_toks = llama_tokenize(ctx, tkn.text_to_send, false);
-            size_t probs_pos      = std::min(slot.sent_token_probs_index,                       slot.generated_token_probs.size());
+            size_t probs_pos = std::min(slot.sent_token_probs_index, slot.generated_token_probs.size());
            size_t probs_stop_pos = std::min(slot.sent_token_probs_index + to_send_toks.size(), slot.generated_token_probs.size());
            if (probs_pos < probs_stop_pos)
            {
@@ -1339,7 +1325,7 @@ struct llama_server_context
            {
                probs = std::vector<completion_token_output>(
                                    slot.generated_token_probs.begin(),
-                                    slot.generated_token_probs.end());
+                                    slot.generated_token_probs.begin() + slot.sent_token_probs_index);
            }
            res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs);
        }
@@ -1402,11 +1388,11 @@ struct llama_server_context
        task.data = std::move(data);
        task.infill_mode = infill;
        task.embedding_mode = embedding;
-        task.type = TASK_TYPE_COMPLETION;
+        task.type = COMPLETION_TASK;
        task.multitask_id = multitask_id;

        // when a completion task's prompt array is not a singleton, we split it into multiple requests
-        if (task.data.count("prompt") && task.data.at("prompt").size() > 1)
+        if (task.data.at("prompt").size() > 1)
        {
            lock.unlock(); // entering new func scope
            return split_multiprompt_task(task);
@@ -1524,7 +1510,7 @@ struct llama_server_context
        std::unique_lock<std::mutex> lock(mutex_tasks);
        task_server task;
        task.id = id_gen++;
-        task.type = TASK_TYPE_CANCEL;
+        task.type = CANCEL_TASK;
        task.target_id = task_id;
        queue_tasks.push_back(task);
        condition_tasks.notify_one();
@@ -1560,7 +1546,7 @@ struct llama_server_context
            queue_tasks.erase(queue_tasks.begin());
            switch (task.type)
            {
-                case TASK_TYPE_COMPLETION: {
+                case COMPLETION_TASK: {
                    llama_client_slot *slot = get_slot(json_value(task.data, "slot_id", -1));
                    if (slot == nullptr)
                    {
@@ -1577,9 +1563,9 @@ struct llama_server_context

                    slot->reset();

-                    slot->infill       = task.infill_mode;
-                    slot->embedding    = task.embedding_mode;
-                    slot->task_id      = task.id;
+                    slot->infill = task.infill_mode;
+                    slot->embedding = task.embedding_mode;
+                    slot->task_id = task.id;
                    slot->multitask_id = task.multitask_id;

                    if (!launch_slot_with_data(slot, task.data))
@@ -1589,7 +1575,7 @@ struct llama_server_context
                        break;
                    }
                } break;
-                case TASK_TYPE_CANCEL: { // release slot linked with the task id
+                case CANCEL_TASK: { // release slot linked with the task id
                    for (auto & slot : slots)
                    {
                        if (slot.task_id == task.target_id)
@@ -1717,6 +1703,7 @@ struct llama_server_context

            llama_batch_add(batch, slot.sampled, system_tokens.size() + slot.n_past, { slot.id }, true);

+            slot.n_decoded += 1;
            slot.n_past += 1;
        }

@@ -1731,8 +1718,7 @@ struct llama_server_context
                const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get<std::string>().empty()) || !slot.images.empty();

                // empty prompt passed -> release the slot and send empty response
-                // note: infill mode allows empty prompt
-                if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt && !slot.infill)
+                if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt)
                {
                    slot.release();
                    slot.print_timings();
@@ -1935,7 +1921,6 @@ struct llama_server_context

                llama_sampling_accept(slot.ctx_sampling, ctx, id, true);

-                slot.n_decoded += 1;
                if (slot.n_decoded == 1)
                {
                    slot.t_start_genereration = ggml_time_us();
@@ -2022,7 +2007,6 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
    printf("  --port PORT           port to listen (default  (default: %d)\n", sparams.port);
    printf("  --path PUBLIC_PATH    path from which to serve static files (default %s)\n", sparams.public_path.c_str());
    printf("  --api-key API_KEY     optional api key to enhance server security. If set, requests must include this key for access.\n");
-    printf("  --api-key-file FNAME  path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access.\n");
    printf("  -to N, --timeout N    server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
    printf("  --embedding           enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
    printf("  -np N, --parallel N   number of slots for process requests (default: %d)\n", params.n_parallel);
@@ -2083,28 +2067,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
                invalid_param = true;
                break;
            }
-            sparams.api_keys.push_back(argv[i]);
-        }
-        else if (arg == "--api-key-file")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            std::ifstream key_file(argv[i]);
-            if (!key_file) {
-                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
-                invalid_param = true;
-                break;
-            }
-            std::string key;
-            while (std::getline(key_file, key)) {
-               if (key.size() > 0) {
-                   sparams.api_keys.push_back(key);
-               }
-            }
-            key_file.close();
+            sparams.api_key = argv[i];
        }
        else if (arg == "--timeout" || arg == "-to")
        {
@@ -2483,6 +2446,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
    }
 }

+
 static std::string random_string()
 {
    static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
@@ -2538,7 +2502,7 @@ json oaicompat_completion_params_parse(
    //
    // https://platform.openai.com/docs/api-reference/chat/create
    llama_sampling_params default_sparams;
-    llama_params["model"]             = json_value(body, "model", std::string("unknown"));
+    llama_params["model"]             = json_value(body, "model", std::string("uknown"));
    llama_params["prompt"]            = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt'
    llama_params["cache_prompt"]      = json_value(body, "cache_prompt", false);
    llama_params["temperature"]       = json_value(body, "temperature", 0.0);
@@ -2610,8 +2574,8 @@ static json format_final_response_oaicompat(const json &request, const task_resu
            {"object", streaming ? "chat.completion.chunk" : "chat.completion"},
            {"usage",
                json{{"completion_tokens", num_tokens_predicted},
-                     {"prompt_tokens",     num_prompt_tokens},
-                     {"total_tokens",      num_tokens_predicted + num_prompt_tokens}}},
+                    {"prompt_tokens", num_prompt_tokens},
+                    {"total_tokens", num_tokens_predicted + num_prompt_tokens}}},
            {"id", gen_chatcmplid()}};

    if (server_verbose) {
@@ -2819,131 +2783,20 @@ int main(int argc, char **argv)
                                {"system_info", llama_print_system_info()},
                            });

-    httplib::Server svr;
-
-    std::atomic<server_state> state{SERVER_STATE_LOADING_MODEL};
-
-    svr.set_default_headers({{"Server", "llama.cpp"}});
-
-    // CORS preflight
-    svr.Options(R"(.*)", [](const httplib::Request &req, httplib::Response &res) {
-        res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
-        res.set_header("Access-Control-Allow-Credentials", "true");
-        res.set_header("Access-Control-Allow-Methods", "POST");
-        res.set_header("Access-Control-Allow-Headers", "*");
-    });
-
-    svr.Get("/health", [&](const httplib::Request&, httplib::Response& res) {
-        server_state current_state = state.load();
-        switch(current_state) {
-            case SERVER_STATE_READY:
-                res.set_content(R"({"status": "ok"})", "application/json");
-                res.status = 200; // HTTP OK
-                break;
-            case SERVER_STATE_LOADING_MODEL:
-                res.set_content(R"({"status": "loading model"})", "application/json");
-                res.status = 503; // HTTP Service Unavailable
-                break;
-            case SERVER_STATE_ERROR:
-                res.set_content(R"({"status": "error", "error": "Model failed to load"})", "application/json");
-                res.status = 500; // HTTP Internal Server Error
-                break;
-        }
-    });
-
-    svr.set_logger(log_server_request);
-
-    svr.set_exception_handler([](const httplib::Request &, httplib::Response &res, std::exception_ptr ep)
-            {
-                const char fmt[] = "500 Internal Server Error\n%s";
-                char buf[BUFSIZ];
-                try
-                {
-                    std::rethrow_exception(std::move(ep));
-                }
-                catch (std::exception &e)
-                {
-                    snprintf(buf, sizeof(buf), fmt, e.what());
-                }
-                catch (...)
-                {
-                    snprintf(buf, sizeof(buf), fmt, "Unknown Exception");
-                }
-                res.set_content(buf, "text/plain; charset=utf-8");
-                res.status = 500;
-            });
-
-    svr.set_error_handler([](const httplib::Request &, httplib::Response &res)
-            {
-                if (res.status == 401)
-                {
-                    res.set_content("Unauthorized", "text/plain; charset=utf-8");
-                }
-                if (res.status == 400)
-                {
-                    res.set_content("Invalid request", "text/plain; charset=utf-8");
-                }
-                else if (res.status == 404)
-                {
-                    res.set_content("File Not Found", "text/plain; charset=utf-8");
-                    res.status = 404;
-                }
-            });
-
-    // set timeouts and change hostname and port
-    svr.set_read_timeout (sparams.read_timeout);
-    svr.set_write_timeout(sparams.write_timeout);
-
-    if (!svr.bind_to_port(sparams.hostname, sparams.port))
-    {
-        fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n", sparams.hostname.c_str(), sparams.port);
-        return 1;
-    }
-
-    // Set the base directory for serving static files
-    svr.set_base_dir(sparams.public_path);
-
-    // to make it ctrl+clickable:
-    LOG_TEE("\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port);
-
-    std::unordered_map<std::string, std::string> log_data;
-    log_data["hostname"] = sparams.hostname;
-    log_data["port"] = std::to_string(sparams.port);
-
-    if (sparams.api_keys.size() == 1) {
-        log_data["api_key"] = "api_key: ****" + sparams.api_keys[0].substr(sparams.api_keys[0].length() - 4);
-    } else if (sparams.api_keys.size() > 1) {
-        log_data["api_key"] = "api_key: " + std::to_string(sparams.api_keys.size()) + " keys loaded";
-    }
-
-    LOG_INFO("HTTP server listening", log_data);
-    // run the HTTP server in a thread - see comment below
-    std::thread t([&]()
-            {
-                if (!svr.listen_after_bind())
-                {
-                    state.store(SERVER_STATE_ERROR);
-                    return 1;
-                }
-
-                return 0;
-            });
-
    // load the model
    if (!llama.load_model(params))
    {
-        state.store(SERVER_STATE_ERROR);
        return 1;
-    } else {
-        llama.initialize();
-        state.store(SERVER_STATE_READY);
-        LOG_INFO("model loaded", {});
    }

+    llama.initialize();
+
+    httplib::Server svr;
+
    // Middleware for API key validation
    auto validate_api_key = [&sparams](const httplib::Request &req, httplib::Response &res) -> bool {
        // If API key is not set, skip validation
-        if (sparams.api_keys.empty()) {
+        if (sparams.api_key.empty()) {
            return true;
        }

@@ -2952,7 +2805,7 @@ int main(int argc, char **argv)
        std::string prefix = "Bearer ";
        if (auth_header.substr(0, prefix.size()) == prefix) {
            std::string received_api_key = auth_header.substr(prefix.size());
-            if (std::find(sparams.api_keys.begin(), sparams.api_keys.end(), received_api_key) != sparams.api_keys.end()) {
+            if (received_api_key == sparams.api_key) {
                return true; // API key is valid
            }
        }
@@ -2966,6 +2819,10 @@ int main(int argc, char **argv)
        return false;
    };

+    svr.set_default_headers({{"Server", "llama.cpp"},
+                             {"Access-Control-Allow-Origin", "*"},
+                             {"Access-Control-Allow-Headers", "content-type"}});
+
    // this is only called if no index.html is found in the public --path
    svr.Get("/", [](const httplib::Request &, httplib::Response &res)
            {
@@ -2994,9 +2851,9 @@ int main(int argc, char **argv)
                return false;
            });

-    svr.Get("/props", [&llama](const httplib::Request & req, httplib::Response &res)
+    svr.Get("/props", [&llama](const httplib::Request & /*req*/, httplib::Response &res)
            {
-                res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
+                res.set_header("Access-Control-Allow-Origin", "*");
                json data = {
                    { "user_name",      llama.name_user.c_str() },
                    { "assistant_name", llama.name_assistant.c_str() }
@@ -3006,7 +2863,6 @@ int main(int argc, char **argv)

    svr.Post("/completion", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res)
            {
-                res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
                if (!validate_api_key(req, res)) {
                    return;
                }
@@ -3074,9 +2930,10 @@ int main(int argc, char **argv)
                }
            });

-    svr.Get("/v1/models", [&params](const httplib::Request& req, httplib::Response& res)
+
+
+    svr.Get("/v1/models", [&params](const httplib::Request&, httplib::Response& res)
            {
-                res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
                std::time_t t = std::time(0);

                json models = {
@@ -3094,11 +2951,9 @@ int main(int argc, char **argv)
                res.set_content(models.dump(), "application/json; charset=utf-8");
            });

-
    // TODO: add mount point without "/v1" prefix -- how?
    svr.Post("/v1/chat/completions", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res)
            {
-                res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
                if (!validate_api_key(req, res)) {
                    return;
                }
@@ -3172,7 +3027,6 @@ int main(int argc, char **argv)

    svr.Post("/infill", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res)
            {
-                res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
                if (!validate_api_key(req, res)) {
                    return;
                }
@@ -3245,7 +3099,6 @@ int main(int argc, char **argv)

    svr.Post("/tokenize", [&llama](const httplib::Request &req, httplib::Response &res)
            {
-                res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
                const json body = json::parse(req.body);
                std::vector<llama_token> tokens;
                if (body.count("content") != 0)
@@ -3258,7 +3111,6 @@ int main(int argc, char **argv)

    svr.Post("/detokenize", [&llama](const httplib::Request &req, httplib::Response &res)
            {
-                res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
                const json body = json::parse(req.body);
                std::string content;
                if (body.count("tokens") != 0)
@@ -3273,7 +3125,6 @@ int main(int argc, char **argv)

    svr.Post("/embedding", [&llama](const httplib::Request &req, httplib::Response &res)
            {
-                res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
                const json body = json::parse(req.body);
                json prompt;
                if (body.count("content") != 0)
@@ -3299,6 +3150,81 @@ int main(int argc, char **argv)
                return res.set_content(result.result_json.dump(), "application/json; charset=utf-8");
            });

+    svr.set_logger(log_server_request);
+
+    svr.set_exception_handler([](const httplib::Request &, httplib::Response &res, std::exception_ptr ep)
+            {
+                const char fmt[] = "500 Internal Server Error\n%s";
+                char buf[BUFSIZ];
+                try
+                {
+                    std::rethrow_exception(std::move(ep));
+                }
+                catch (std::exception &e)
+                {
+                    snprintf(buf, sizeof(buf), fmt, e.what());
+                }
+                catch (...)
+                {
+                    snprintf(buf, sizeof(buf), fmt, "Unknown Exception");
+                }
+                res.set_content(buf, "text/plain; charset=utf-8");
+                res.status = 500;
+            });
+
+    svr.set_error_handler([](const httplib::Request &, httplib::Response &res)
+            {
+                if (res.status == 401)
+                {
+                    res.set_content("Unauthorized", "text/plain; charset=utf-8");
+                }
+                if (res.status == 400)
+                {
+                    res.set_content("Invalid request", "text/plain; charset=utf-8");
+                }
+                else if (res.status == 404)
+                {
+                    res.set_content("File Not Found", "text/plain; charset=utf-8");
+                    res.status = 404;
+                }
+            });
+
+    // set timeouts and change hostname and port
+    svr.set_read_timeout (sparams.read_timeout);
+    svr.set_write_timeout(sparams.write_timeout);
+
+    if (!svr.bind_to_port(sparams.hostname, sparams.port))
+    {
+        fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n", sparams.hostname.c_str(), sparams.port);
+        return 1;
+    }
+
+    // Set the base directory for serving static files
+    svr.set_base_dir(sparams.public_path);
+
+    // to make it ctrl+clickable:
+    LOG_TEE("\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port);
+
+    std::unordered_map<std::string, std::string> log_data;
+    log_data["hostname"] = sparams.hostname;
+    log_data["port"] = std::to_string(sparams.port);
+
+    if (!sparams.api_key.empty()) {
+        log_data["api_key"] = "api_key: ****" + sparams.api_key.substr(sparams.api_key.length() - 4);
+    }
+
+    LOG_INFO("HTTP server listening", log_data);
+    // run the HTTP server in a thread - see comment below
+    std::thread t([&]()
+            {
+                if (!svr.listen_after_bind())
+                {
+                    return 1;
+                }
+
+                return 0;
+            });
+
    // GG: if I put the main loop inside a thread, it crashes on the first request when build in Debug!?
    //     "Bus error: 10" - this is on macOS, it does not crash on Linux
    //std::thread t2([&]()
--- a/ggml-backend-impl.h
+++ b/ggml-backend-impl.h
@@ -90,7 +90,7 @@ extern "C" {
        void                      (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);

        // compute graph without a plan
-        bool (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+        void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);

        // check if the backend supports an operation
        bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -195,14 +195,11 @@ void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_
    ggml_backend_synchronize(backend);
 }

-bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
-    if (!backend->iface.graph_compute(backend, cgraph)) {
-        return false;
-    }
+void ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    backend->iface.graph_compute(backend, cgraph);

    // TODO: optional sync
    ggml_backend_synchronize(backend);
-    return true;
 }

 bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
@@ -600,7 +597,7 @@ static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_bac
    GGML_UNUSED(backend);
 }

-static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;

    struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
@@ -614,7 +611,6 @@ static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_c
    cplan.work_data = cpu_ctx->work_data;

    ggml_graph_compute(cgraph, &cplan);
-    return true;
 }

 static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
--- a/ggml-backend.h
+++ b/ggml-backend.h
@@ -58,7 +58,7 @@ extern "C" {

    GGML_API void ggml_backend_graph_plan_free   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
    GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-    GGML_API bool ggml_backend_graph_compute     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+    GGML_API void ggml_backend_graph_compute     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
    GGML_API bool ggml_backend_supports_op       (ggml_backend_t backend, const struct ggml_tensor * op);

    // tensor copy between different backends
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
--- a/ggml-impl.h
+++ b/ggml-impl.h
@@ -5,7 +5,6 @@
 // GGML internal header

 #include <assert.h>
-#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
 #include <stddef.h>
 #include <stdbool.h>
 #include <string.h> // memcpy
--- a/ggml-metal.h
+++ b/ggml-metal.h
@@ -87,7 +87,7 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);

 // same as ggml_graph_compute but uses Metal
 // creates gf->n_threads command buffers in parallel
-bool ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
+void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);

 //
 // backend API
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -87,9 +87,6 @@ struct ggml_metal_context {
    GGML_METAL_DECL_KERNEL(get_rows_q4_K);
    GGML_METAL_DECL_KERNEL(get_rows_q5_K);
    GGML_METAL_DECL_KERNEL(get_rows_q6_K);
-    GGML_METAL_DECL_KERNEL(get_rows_i32);
-    GGML_METAL_DECL_KERNEL(get_rows_iq2_xxs);
-    GGML_METAL_DECL_KERNEL(get_rows_iq2_xs);
    GGML_METAL_DECL_KERNEL(rms_norm);
    GGML_METAL_DECL_KERNEL(group_norm);
    GGML_METAL_DECL_KERNEL(norm);
@@ -108,8 +105,6 @@ struct ggml_metal_context {
    GGML_METAL_DECL_KERNEL(mul_mv_q4_K_f32);
    GGML_METAL_DECL_KERNEL(mul_mv_q5_K_f32);
    GGML_METAL_DECL_KERNEL(mul_mv_q6_K_f32);
-    GGML_METAL_DECL_KERNEL(mul_mv_iq2_xxs_f32);
-    GGML_METAL_DECL_KERNEL(mul_mv_iq2_xs_f32);
    GGML_METAL_DECL_KERNEL(mul_mv_id_f32_f32);
    //GGML_METAL_DECL_KERNEL(mul_mv_id_f16_f16);
    GGML_METAL_DECL_KERNEL(mul_mv_id_f16_f32);
@@ -125,8 +120,6 @@ struct ggml_metal_context {
    GGML_METAL_DECL_KERNEL(mul_mv_id_q4_K_f32);
    GGML_METAL_DECL_KERNEL(mul_mv_id_q5_K_f32);
    GGML_METAL_DECL_KERNEL(mul_mv_id_q6_K_f32);
-    GGML_METAL_DECL_KERNEL(mul_mv_id_iq2_xxs_f32);
-    GGML_METAL_DECL_KERNEL(mul_mv_id_iq2_xs_f32);
    GGML_METAL_DECL_KERNEL(mul_mm_f32_f32);
    GGML_METAL_DECL_KERNEL(mul_mm_f16_f32);
    GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32);
@@ -139,8 +132,6 @@ struct ggml_metal_context {
    GGML_METAL_DECL_KERNEL(mul_mm_q4_K_f32);
    GGML_METAL_DECL_KERNEL(mul_mm_q5_K_f32);
    GGML_METAL_DECL_KERNEL(mul_mm_q6_K_f32);
-    GGML_METAL_DECL_KERNEL(mul_mm_iq2_xxs_f32);
-    GGML_METAL_DECL_KERNEL(mul_mm_iq2_xs_f32);
    GGML_METAL_DECL_KERNEL(mul_mm_id_f32_f32);
    GGML_METAL_DECL_KERNEL(mul_mm_id_f16_f32);
    GGML_METAL_DECL_KERNEL(mul_mm_id_q4_0_f32);
@@ -153,8 +144,6 @@ struct ggml_metal_context {
    GGML_METAL_DECL_KERNEL(mul_mm_id_q4_K_f32);
    GGML_METAL_DECL_KERNEL(mul_mm_id_q5_K_f32);
    GGML_METAL_DECL_KERNEL(mul_mm_id_q6_K_f32);
-    GGML_METAL_DECL_KERNEL(mul_mm_id_iq2_xxs_f32);
-    GGML_METAL_DECL_KERNEL(mul_mm_id_iq2_xs_f32);
    GGML_METAL_DECL_KERNEL(rope_f32);
    GGML_METAL_DECL_KERNEL(rope_f16);
    GGML_METAL_DECL_KERNEL(alibi_f32);
@@ -268,14 +257,14 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
        bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
 #endif
        NSError * error = nil;
-        NSString * libPath = [bundle pathForResource:@"default" ofType:@"metallib"];
+        NSString * libPath = [bundle pathForResource:@"ggml" ofType:@"metallib"];
        if (libPath != nil) {
            // pre-compiled library found
            NSURL * libURL = [NSURL fileURLWithPath:libPath];
            GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]);
            ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
        } else {
-            GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
+            GGML_METAL_LOG_INFO("%s: ggml.metallib not found, loading from source\n", __func__);

            NSString * sourcePath;
            NSString * ggmlMetalPathResources = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
@@ -305,7 +294,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
 #endif
            // try to disable fast-math
            // NOTE: this seems to have no effect whatsoever
-            //       instead, in order to disable fast-math, we have to build default.metallib from the command line
+            //       instead, in order to disable fast-math, we have to build ggml.metallib from the command line
            //       using xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
            //       and go through the "pre-compiled library found" path above
            //[options setFastMathEnabled:false];
@@ -388,9 +377,6 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
        GGML_METAL_ADD_KERNEL(get_rows_q4_K);
        GGML_METAL_ADD_KERNEL(get_rows_q5_K);
        GGML_METAL_ADD_KERNEL(get_rows_q6_K);
-        GGML_METAL_ADD_KERNEL(get_rows_i32);
-        GGML_METAL_ADD_KERNEL(get_rows_iq2_xxs);
-        GGML_METAL_ADD_KERNEL(get_rows_iq2_xs);
        GGML_METAL_ADD_KERNEL(rms_norm);
        GGML_METAL_ADD_KERNEL(group_norm);
        GGML_METAL_ADD_KERNEL(norm);
@@ -409,8 +395,6 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
        GGML_METAL_ADD_KERNEL(mul_mv_q4_K_f32);
        GGML_METAL_ADD_KERNEL(mul_mv_q5_K_f32);
        GGML_METAL_ADD_KERNEL(mul_mv_q6_K_f32);
-        GGML_METAL_ADD_KERNEL(mul_mv_iq2_xxs_f32);
-        GGML_METAL_ADD_KERNEL(mul_mv_iq2_xs_f32);
        GGML_METAL_ADD_KERNEL(mul_mv_id_f32_f32);
        //GGML_METAL_ADD_KERNEL(mul_mv_id_f16_f16);
        GGML_METAL_ADD_KERNEL(mul_mv_id_f16_f32);
@@ -426,8 +410,6 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
        GGML_METAL_ADD_KERNEL(mul_mv_id_q4_K_f32);
        GGML_METAL_ADD_KERNEL(mul_mv_id_q5_K_f32);
        GGML_METAL_ADD_KERNEL(mul_mv_id_q6_K_f32);
-        GGML_METAL_ADD_KERNEL(mul_mv_id_iq2_xxs_f32);
-        GGML_METAL_ADD_KERNEL(mul_mv_id_iq2_xs_f32);
        if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) {
            GGML_METAL_ADD_KERNEL(mul_mm_f32_f32);
            GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
@@ -441,8 +423,6 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
            GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32);
            GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32);
            GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32);
-            GGML_METAL_ADD_KERNEL(mul_mm_iq2_xxs_f32);
-            GGML_METAL_ADD_KERNEL(mul_mm_iq2_xs_f32);
            GGML_METAL_ADD_KERNEL(mul_mm_id_f32_f32);
            GGML_METAL_ADD_KERNEL(mul_mm_id_f16_f32);
            GGML_METAL_ADD_KERNEL(mul_mm_id_q4_0_f32);
@@ -455,8 +435,6 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
            GGML_METAL_ADD_KERNEL(mul_mm_id_q4_K_f32);
            GGML_METAL_ADD_KERNEL(mul_mm_id_q5_K_f32);
            GGML_METAL_ADD_KERNEL(mul_mm_id_q6_K_f32);
-            GGML_METAL_ADD_KERNEL(mul_mm_id_iq2_xxs_f32);
-            GGML_METAL_ADD_KERNEL(mul_mm_id_iq2_xs_f32);
        }
        GGML_METAL_ADD_KERNEL(rope_f32);
        GGML_METAL_ADD_KERNEL(rope_f16);
@@ -521,9 +499,6 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
    GGML_METAL_DEL_KERNEL(get_rows_q4_K);
    GGML_METAL_DEL_KERNEL(get_rows_q5_K);
    GGML_METAL_DEL_KERNEL(get_rows_q6_K);
-    GGML_METAL_DEL_KERNEL(get_rows_i32);
-    GGML_METAL_DEL_KERNEL(get_rows_iq2_xxs);
-    GGML_METAL_DEL_KERNEL(get_rows_iq2_xs);
    GGML_METAL_DEL_KERNEL(rms_norm);
    GGML_METAL_DEL_KERNEL(group_norm);
    GGML_METAL_DEL_KERNEL(norm);
@@ -542,8 +517,6 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
    GGML_METAL_DEL_KERNEL(mul_mv_q4_K_f32);
    GGML_METAL_DEL_KERNEL(mul_mv_q5_K_f32);
    GGML_METAL_DEL_KERNEL(mul_mv_q6_K_f32);
-    GGML_METAL_DEL_KERNEL(mul_mv_iq2_xxs_f32);
-    GGML_METAL_DEL_KERNEL(mul_mv_iq2_xs_f32);
    GGML_METAL_DEL_KERNEL(mul_mv_id_f32_f32);
    //GGML_METAL_DEL_KERNEL(mul_mv_id_f16_f16);
    GGML_METAL_DEL_KERNEL(mul_mv_id_f16_f32);
@@ -559,8 +532,6 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
    GGML_METAL_DEL_KERNEL(mul_mv_id_q4_K_f32);
    GGML_METAL_DEL_KERNEL(mul_mv_id_q5_K_f32);
    GGML_METAL_DEL_KERNEL(mul_mv_id_q6_K_f32);
-    GGML_METAL_DEL_KERNEL(mul_mv_id_iq2_xxs_f32);
-    GGML_METAL_DEL_KERNEL(mul_mv_id_iq2_xs_f32);
    if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) {
        GGML_METAL_DEL_KERNEL(mul_mm_f32_f32);
        GGML_METAL_DEL_KERNEL(mul_mm_f16_f32);
@@ -574,8 +545,6 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
        GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32);
        GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32);
        GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32);
-        GGML_METAL_DEL_KERNEL(mul_mm_iq2_xxs_f32);
-        GGML_METAL_DEL_KERNEL(mul_mm_iq2_xs_f32);
        GGML_METAL_DEL_KERNEL(mul_mm_id_f32_f32);
        GGML_METAL_DEL_KERNEL(mul_mm_id_f16_f32);
        GGML_METAL_DEL_KERNEL(mul_mm_id_q4_0_f32);
@@ -588,8 +557,6 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
        GGML_METAL_DEL_KERNEL(mul_mm_id_q4_K_f32);
        GGML_METAL_DEL_KERNEL(mul_mm_id_q5_K_f32);
        GGML_METAL_DEL_KERNEL(mul_mm_id_q6_K_f32);
-        GGML_METAL_DEL_KERNEL(mul_mm_id_iq2_xxs_f32);
-        GGML_METAL_DEL_KERNEL(mul_mm_id_iq2_xs_f32);
    }
    GGML_METAL_DEL_KERNEL(rope_f32);
    GGML_METAL_DEL_KERNEL(rope_f16);
@@ -1007,7 +974,7 @@ static bool ggml_metal_supports_op(const struct ggml_tensor * op) {
            return false;
    }
 }
-bool ggml_metal_graph_compute(
+void ggml_metal_graph_compute(
        struct ggml_metal_context * ctx,
               struct ggml_cgraph * gf) {
    @autoreleasepool {
@@ -1082,10 +1049,6 @@ bool ggml_metal_graph_compute(
                    GGML_ASSERT(!"unsupported op");
                }

-#ifndef GGML_METAL_NDEBUG
-                [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(dst) encoding:NSUTF8StringEncoding]];
-#endif
-
                const int64_t  ne00 = src0 ? src0->ne[0] : 0;
                const int64_t  ne01 = src0 ? src0->ne[1] : 0;
                const int64_t  ne02 = src0 ? src0->ne[2] : 0;
@@ -1575,8 +1538,6 @@ bool ggml_metal_graph_compute(
                                    case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_K_f32]; break;
                                    case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q5_K_f32]; break;
                                    case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q6_K_f32]; break;
-                                    case GGML_TYPE_IQ2_XXS: [encoder setComputePipelineState:ctx->pipeline_mul_mm_iq2_xxs_f32]; break;
-                                    case GGML_TYPE_IQ2_XS : [encoder setComputePipelineState:ctx->pipeline_mul_mm_iq2_xs_f32]; break;
                                    default: GGML_ASSERT(false && "MUL MAT-MAT not implemented");
                                }
                                [encoder setBuffer:id_src0 offset:offs_src0    atIndex:0];
@@ -1689,18 +1650,6 @@ bool ggml_metal_graph_compute(
                                            nth1 = 32;
                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_q6_K_f32];
                                        } break;
-                                    case GGML_TYPE_IQ2_XXS:
-                                        {
-                                            nth0 = 4;
-                                            nth1 = 16;
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_iq2_xxs_f32];
-                                        } break;
-                                    case GGML_TYPE_IQ2_XS:
-                                        {
-                                            nth0 = 4;
-                                            nth1 = 16;
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_iq2_xs_f32];
-                                        } break;
                                    default:
                                        {
                                            GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t);
@@ -1708,10 +1657,6 @@ bool ggml_metal_graph_compute(
                                        }
                                };

-                                if (ggml_is_quantized(src0t)) {
-                                    GGML_ASSERT(ne00 >= nth0*nth1);
-                                }
-
                                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                                [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
                                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
@@ -1737,11 +1682,6 @@ bool ggml_metal_graph_compute(
                                    src0t == GGML_TYPE_Q2_K) { // || src0t == GGML_TYPE_Q4_K) {
                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                                }
-                                else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) {
-                                    const int mem_size = src0t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128;
-                                    [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                                }
                                else if (src0t == GGML_TYPE_Q4_K) {
                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                                }
@@ -1775,9 +1715,6 @@ bool ggml_metal_graph_compute(
                            // TODO: make this more general
                            GGML_ASSERT(n_as <= 8);

-                            // max size of the src1ids array in the kernel stack
-                            GGML_ASSERT(ne11 <= 512);
-
                            struct ggml_tensor * src2 = gf->nodes[i]->src[2];

                            const int64_t  ne20 = src2 ? src2->ne[0] : 0;
@@ -1795,6 +1732,9 @@ bool ggml_metal_graph_compute(
                            GGML_ASSERT(!ggml_is_transposed(src2));
                            GGML_ASSERT(!ggml_is_transposed(src1));

+                            GGML_ASSERT(ne20 % 32 == 0);
+                            // !!!!!!!!! TODO: this assert is probably required but not sure!
+                            //GGML_ASSERT(ne20 >= 64);
                            GGML_ASSERT(src1t == GGML_TYPE_F32);

                            const uint r2 = ne12/ne22;
@@ -1802,22 +1742,22 @@ bool ggml_metal_graph_compute(

                            // find the break-even point where the matrix-matrix kernel becomes more efficient compared
                            // to the matrix-vector kernel
-                            int ne11_mm_min = n_as;
+                            int ne11_mm_min = 1;

                            const int idx = ((int32_t *) dst->op_params)[0];

                            // batch size
                            GGML_ASSERT(ne01 == ne11);

+                            const int64_t _ne1 = 1; // kernel_mul_mm_impl needs a reference in constant memory
+
                            // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
                            // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
                            // !!!
                            // TODO: for now, always use mat-vec kernels until we figure out how to improve the
                            //       indirect matrix multiplication
                            // !!!
-                            if ([ctx->device supportsFamily:MTLGPUFamilyApple7] &&
-                                ne20 % 32 == 0 && ne20 >= 64 &&
-                                ne11 > ne11_mm_min) {
+                            if ([ctx->device supportsFamily:MTLGPUFamilyApple7] && _ne1 > ne11_mm_min) {
                                switch (src2->type) {
                                    case GGML_TYPE_F32:  [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_f32_f32];  break;
                                    case GGML_TYPE_F16:  [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_f16_f32];  break;
@@ -1831,8 +1771,6 @@ bool ggml_metal_graph_compute(
                                    case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q4_K_f32]; break;
                                    case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q5_K_f32]; break;
                                    case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q6_K_f32]; break;
-                                    case GGML_TYPE_IQ2_XXS: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_iq2_xxs_f32]; break;
-                                    case GGML_TYPE_IQ2_XS : [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_iq2_xs_f32]; break;
                                    default: GGML_ASSERT(false && "MUL_MAT_ID not implemented");
                                }
                                [encoder setBuffer:id_src0 offset:offs_src0    atIndex:0];
@@ -1849,7 +1787,7 @@ bool ggml_metal_graph_compute(
                                [encoder setBytes:&nb11    length:sizeof(nb11) atIndex:11];
                                [encoder setBytes:&nb12    length:sizeof(nb12) atIndex:12];
                                [encoder setBytes:&ne0     length:sizeof(ne0)  atIndex:13];
-                                [encoder setBytes:&ne1     length:sizeof(ne1)  atIndex:14];
+                                [encoder setBytes:&_ne1    length:sizeof(_ne1) atIndex:14];
                                [encoder setBytes:&nb1     length:sizeof(nb1)  atIndex:15];
                                [encoder setBytes:&r2      length:sizeof(r2)   atIndex:16];
                                [encoder setBytes:&r3      length:sizeof(r3)   atIndex:17];
@@ -1867,7 +1805,8 @@ bool ggml_metal_graph_compute(

                                [encoder setThreadgroupMemoryLength:8192 atIndex:0];

-                                [encoder dispatchThreadgroups:MTLSizeMake((ne11 + 31)/32, (ne21 + 63)/64, n_as*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
+                                // TODO: processing one row at a time (ne11 -> 1) is not efficient
+                                [encoder dispatchThreadgroups:MTLSizeMake( (_ne1 + 31)/32, (ne21 + 63)/64, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
                            } else {
                                int nth0 = 32;
                                int nth1 = 1;
@@ -1948,31 +1887,13 @@ bool ggml_metal_graph_compute(
                                            nth1 = 32;
                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q6_K_f32];
                                        } break;
-                                    case GGML_TYPE_IQ2_XXS:
-                                        {
-                                            nth0 = 4;
-                                            nth1 = 16;
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_iq2_xxs_f32];
-                                        } break;
-                                    case GGML_TYPE_IQ2_XS:
-                                        {
-                                            nth0 = 4;
-                                            nth1 = 16;
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_iq2_xs_f32];
-                                        } break;
                                    default:
                                        {
-                                            GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t);
+                                            GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t);
                                            GGML_ASSERT(false && "not implemented");
                                        }
                                };

-                                if (ggml_is_quantized(src2t)) {
-                                    GGML_ASSERT(ne20 >= nth0*nth1);
-                                }
-
-                                const int64_t _ne1 = 1; // kernels needs a reference in constant memory
-
                                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                                [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
                                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
@@ -2012,11 +1933,6 @@ bool ggml_metal_graph_compute(
                                    src2t == GGML_TYPE_Q2_K) { // || src2t == GGML_TYPE_Q4_K) {
                                    [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                                }
-                                else if (src2t == GGML_TYPE_IQ2_XXS || src2t == GGML_TYPE_IQ2_XS) {
-                                    const int mem_size = src2t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128;
-                                    [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                                }
                                else if (src2t == GGML_TYPE_Q4_K) {
                                    [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                                }
@@ -2053,9 +1969,6 @@ bool ggml_metal_graph_compute(
                                case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_K]; break;
                                case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_K]; break;
                                case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_K]; break;
-                                case GGML_TYPE_I32:  [encoder setComputePipelineState:ctx->pipeline_get_rows_i32];  break;
-                                case GGML_TYPE_IQ2_XXS: [encoder setComputePipelineState:ctx->pipeline_get_rows_iq2_xxs]; break;
-                                case GGML_TYPE_IQ2_XS : [encoder setComputePipelineState:ctx->pipeline_get_rows_iq2_xs]; break;
                                default: GGML_ASSERT(false && "not implemented");
                            }

@@ -2457,10 +2370,6 @@ bool ggml_metal_graph_compute(
                            GGML_ASSERT(false);
                        }
                }
-
-#ifndef GGML_METAL_NDEBUG
-                [encoder popDebugGroup];
-#endif
            }

            if (encoder != nil) {
@@ -2483,11 +2392,10 @@ bool ggml_metal_graph_compute(
        MTLCommandBufferStatus status = (MTLCommandBufferStatus) [ctx->command_buffers[i] status];
        if (status != MTLCommandBufferStatusCompleted) {
            GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
-            return false;
+            GGML_ASSERT(false);
        }
    }

-    return true;
    }
 }

@@ -2767,10 +2675,10 @@ static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggm
    UNUSED(backend);
 }

-static bool ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+static void ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;

-    return ggml_metal_graph_compute(metal_ctx, cgraph);
+    ggml_metal_graph_compute(metal_ctx, cgraph);
 }

 static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -2340,322 +2340,6 @@ size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t *
    return (n/QK_K*sizeof(block_q6_K));
 }

-// ====================== "True" 2-bit (de)-quantization
-
-static const  uint64_t iq2xxs_grid[256] = {
-    0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
-    0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x08080808082b0808,
-    0x08080808082b082b, 0x08080808082b2b08, 0x08080808082b2b2b, 0x0808080819080819,
-    0x0808080819081908, 0x0808080819190808, 0x0808080819192b08, 0x08080808192b0819,
-    0x08080808192b1908, 0x080808082b080808, 0x080808082b08082b, 0x080808082b082b2b,
-    0x080808082b2b082b, 0x0808081908080819, 0x0808081908081908, 0x0808081908190808,
-    0x0808081908191919, 0x0808081919080808, 0x080808192b081908, 0x080808192b192b08,
-    0x0808082b08080808, 0x0808082b0808082b, 0x0808082b082b082b, 0x0808082b2b08082b,
-    0x0808190808080819, 0x0808190808081908, 0x0808190808190808, 0x08081908082b0819,
-    0x08081908082b1908, 0x0808190819080808, 0x080819081908082b, 0x0808190819082b08,
-    0x08081908192b0808, 0x080819082b080819, 0x080819082b081908, 0x080819082b190808,
-    0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b, 0x0808191908082b08,
-    0x08081919082b0808, 0x080819191908192b, 0x08081919192b2b19, 0x080819192b080808,
-    0x080819192b190819, 0x0808192b08082b19, 0x0808192b08190808, 0x0808192b19080808,
-    0x0808192b2b081908, 0x0808192b2b2b1908, 0x08082b0808080808, 0x08082b0808081919,
-    0x08082b0808082b08, 0x08082b0808191908, 0x08082b08082b2b08, 0x08082b0819080819,
-    0x08082b0819081908, 0x08082b0819190808, 0x08082b081919082b, 0x08082b082b082b08,
-    0x08082b1908081908, 0x08082b1919080808, 0x08082b2b0808082b, 0x08082b2b08191908,
-    0x0819080808080819, 0x0819080808081908, 0x0819080808190808, 0x08190808082b0819,
-    0x0819080819080808, 0x08190808192b0808, 0x081908082b081908, 0x081908082b190808,
-    0x081908082b191919, 0x0819081908080808, 0x0819081908082b08, 0x08190819082b0808,
-    0x0819081919190808, 0x0819081919192b2b, 0x081908192b080808, 0x0819082b082b1908,
-    0x0819082b19081919, 0x0819190808080808, 0x0819190808082b08, 0x08191908082b0808,
-    0x08191908082b1919, 0x0819190819082b19, 0x081919082b080808, 0x0819191908192b08,
-    0x08191919192b082b, 0x0819192b08080808, 0x0819192b0819192b, 0x08192b0808080819,
-    0x08192b0808081908, 0x08192b0808190808, 0x08192b0819080808, 0x08192b082b080819,
-    0x08192b1908080808, 0x08192b1908081919, 0x08192b192b2b0808, 0x08192b2b19190819,
-    0x082b080808080808, 0x082b08080808082b, 0x082b080808082b2b, 0x082b080819081908,
-    0x082b0808192b0819, 0x082b08082b080808, 0x082b08082b08082b, 0x082b0819082b2b19,
-    0x082b081919082b08, 0x082b082b08080808, 0x082b082b0808082b, 0x082b190808080819,
-    0x082b190808081908, 0x082b190808190808, 0x082b190819080808, 0x082b19081919192b,
-    0x082b191908080808, 0x082b191919080819, 0x082b1919192b1908, 0x082b192b2b190808,
-    0x082b2b0808082b08, 0x082b2b08082b0808, 0x082b2b082b191908, 0x082b2b2b19081908,
-    0x1908080808080819, 0x1908080808081908, 0x1908080808190808, 0x1908080808192b08,
-    0x19080808082b0819, 0x19080808082b1908, 0x1908080819080808, 0x1908080819082b08,
-    0x190808081919192b, 0x19080808192b0808, 0x190808082b080819, 0x190808082b081908,
-    0x190808082b190808, 0x1908081908080808, 0x19080819082b0808, 0x19080819192b0819,
-    0x190808192b080808, 0x190808192b081919, 0x1908082b08080819, 0x1908082b08190808,
-    0x1908082b19082b08, 0x1908082b1919192b, 0x1908082b192b2b08, 0x1908190808080808,
-    0x1908190808082b08, 0x19081908082b0808, 0x190819082b080808, 0x190819082b192b19,
-    0x190819190819082b, 0x19081919082b1908, 0x1908192b08080808, 0x19082b0808080819,
-    0x19082b0808081908, 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919,
-    0x19082b1908080808, 0x19082b1919192b08, 0x19082b19192b0819, 0x19082b192b08082b,
-    0x19082b2b19081919, 0x19082b2b2b190808, 0x1919080808080808, 0x1919080808082b08,
-    0x1919080808190819, 0x1919080808192b19, 0x19190808082b0808, 0x191908082b080808,
-    0x191908082b082b08, 0x1919081908081908, 0x191908191908082b, 0x191908192b2b1908,
-    0x1919082b2b190819, 0x191919082b190808, 0x191919082b19082b, 0x1919191908082b2b,
-    0x1919192b08080819, 0x1919192b19191908, 0x19192b0808080808, 0x19192b0808190819,
-    0x19192b0808192b19, 0x19192b08192b1908, 0x19192b1919080808, 0x19192b2b08082b08,
-    0x192b080808081908, 0x192b080808190808, 0x192b080819080808, 0x192b0808192b2b08,
-    0x192b081908080808, 0x192b081919191919, 0x192b082b08192b08, 0x192b082b192b0808,
-    0x192b190808080808, 0x192b190808081919, 0x192b191908190808, 0x192b19190819082b,
-    0x192b19192b081908, 0x192b2b081908082b, 0x2b08080808080808, 0x2b0808080808082b,
-    0x2b08080808082b2b, 0x2b08080819080819, 0x2b0808082b08082b, 0x2b08081908081908,
-    0x2b08081908192b08, 0x2b08081919080808, 0x2b08082b08190819, 0x2b08190808080819,
-    0x2b08190808081908, 0x2b08190808190808, 0x2b08190808191919, 0x2b08190819080808,
-    0x2b081908192b0808, 0x2b08191908080808, 0x2b0819191908192b, 0x2b0819192b191908,
-    0x2b08192b08082b19, 0x2b08192b19080808, 0x2b08192b192b0808, 0x2b082b080808082b,
-    0x2b082b1908081908, 0x2b082b2b08190819, 0x2b19080808081908, 0x2b19080808190808,
-    0x2b190808082b1908, 0x2b19080819080808, 0x2b1908082b2b0819, 0x2b1908190819192b,
-    0x2b1908192b080808, 0x2b19082b19081919, 0x2b19190808080808, 0x2b191908082b082b,
-    0x2b19190819081908, 0x2b19191919190819, 0x2b192b082b080819, 0x2b192b19082b0808,
-    0x2b2b08080808082b, 0x2b2b080819190808, 0x2b2b08082b081919, 0x2b2b081908082b19,
-    0x2b2b082b08080808, 0x2b2b190808192b08, 0x2b2b2b0819190808, 0x2b2b2b1908081908,
-};
-
-static const uint64_t iq2xs_grid[512] = {
-    0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
-    0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
-    0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
-    0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
-    0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
-    0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808,
-    0x080808082b08082b, 0x080808082b081919, 0x080808082b082b08, 0x080808082b190819,
-    0x080808082b191908, 0x080808082b192b19, 0x080808082b2b0808, 0x0808081908080819,
-    0x0808081908081908, 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808,
-    0x080808190819082b, 0x0808081908191919, 0x0808081908192b08, 0x0808081908192b2b,
-    0x08080819082b0819, 0x08080819082b1908, 0x0808081919080808, 0x080808191908082b,
-    0x0808081919081919, 0x0808081919082b08, 0x0808081919190819, 0x0808081919191908,
-    0x08080819192b0808, 0x08080819192b2b08, 0x080808192b080819, 0x080808192b081908,
-    0x080808192b190808, 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b08081919,
-    0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908, 0x0808082b082b0808,
-    0x0808082b19080819, 0x0808082b19081908, 0x0808082b19190808, 0x0808082b19191919,
-    0x0808082b2b080808, 0x0808082b2b082b2b, 0x0808190808080819, 0x0808190808081908,
-    0x080819080808192b, 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b,
-    0x0808190808191919, 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908,
-    0x0808190819080808, 0x080819081908082b, 0x0808190819081919, 0x0808190819082b08,
-    0x0808190819190819, 0x0808190819191908, 0x080819081919192b, 0x08081908192b0808,
-    0x080819082b080819, 0x080819082b081908, 0x080819082b190808, 0x0808191908080808,
-    0x080819190808082b, 0x0808191908081919, 0x0808191908082b08, 0x0808191908190819,
-    0x0808191908191908, 0x08081919082b0808, 0x0808191919080819, 0x0808191919081908,
-    0x0808191919190808, 0x08081919192b0819, 0x080819192b080808, 0x0808192b08080819,
-    0x0808192b08081908, 0x0808192b08190808, 0x0808192b082b192b, 0x0808192b19080808,
-    0x0808192b1908082b, 0x0808192b2b081908, 0x08082b0808080808, 0x08082b080808082b,
-    0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808082b2b, 0x08082b0808190819,
-    0x08082b0808191908, 0x08082b08082b0808, 0x08082b08082b1919, 0x08082b0819080819,
-    0x08082b0819081908, 0x08082b0819190808, 0x08082b0819192b08, 0x08082b082b080808,
-    0x08082b082b2b0808, 0x08082b082b2b2b2b, 0x08082b1908080819, 0x08082b1908081908,
-    0x08082b1908190808, 0x08082b1919080808, 0x08082b192b080819, 0x08082b192b082b19,
-    0x08082b2b08080808, 0x08082b2b082b0808, 0x08082b2b082b2b08, 0x08082b2b2b19192b,
-    0x08082b2b2b2b0808, 0x0819080808080819, 0x0819080808081908, 0x081908080808192b,
-    0x0819080808082b19, 0x0819080808190808, 0x081908080819082b, 0x0819080808191919,
-    0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908, 0x0819080819080808,
-    0x081908081908082b, 0x0819080819081919, 0x0819080819082b08, 0x0819080819190819,
-    0x0819080819191908, 0x08190808192b0808, 0x08190808192b2b2b, 0x081908082b080819,
-    0x081908082b081908, 0x081908082b190808, 0x0819081908080808, 0x081908190808082b,
-    0x0819081908081919, 0x0819081908082b08, 0x0819081908190819, 0x0819081908191908,
-    0x08190819082b0808, 0x0819081919080819, 0x0819081919081908, 0x0819081919190808,
-    0x081908192b080808, 0x081908192b191908, 0x081908192b19192b, 0x0819082b08080819,
-    0x0819082b08081908, 0x0819082b0808192b, 0x0819082b08190808, 0x0819082b19080808,
-    0x0819082b192b0808, 0x0819190808080808, 0x081919080808082b, 0x0819190808081919,
-    0x0819190808082b08, 0x0819190808190819, 0x0819190808191908, 0x08191908082b0808,
-    0x0819190819080819, 0x0819190819081908, 0x0819190819082b19, 0x0819190819190808,
-    0x08191908192b1908, 0x081919082b080808, 0x0819191908080819, 0x0819191908081908,
-    0x0819191908190808, 0x0819191919080808, 0x0819192b08080808, 0x0819192b08191908,
-    0x0819192b19082b19, 0x08192b0808080819, 0x08192b0808081908, 0x08192b0808190808,
-    0x08192b080819082b, 0x08192b0819080808, 0x08192b0819191908, 0x08192b082b08192b,
-    0x08192b1908080808, 0x08192b1908081919, 0x08192b19192b192b, 0x08192b2b19190819,
-    0x08192b2b2b2b2b19, 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919,
-    0x082b080808082b08, 0x082b080808082b2b, 0x082b080808190819, 0x082b080808191908,
-    0x082b0808082b0808, 0x082b080819080819, 0x082b080819081908, 0x082b080819190808,
-    0x082b08082b080808, 0x082b08082b2b0808, 0x082b081908080819, 0x082b081908081908,
-    0x082b081908190808, 0x082b081919080808, 0x082b081919082b08, 0x082b0819192b1919,
-    0x082b082b08080808, 0x082b082b082b082b, 0x082b082b2b080808, 0x082b082b2b2b2b08,
-    0x082b190808080819, 0x082b190808081908, 0x082b190808190808, 0x082b1908082b2b19,
-    0x082b190819080808, 0x082b191908080808, 0x082b191919080819, 0x082b19191919082b,
-    0x082b19192b192b19, 0x082b192b08080819, 0x082b192b08192b2b, 0x082b192b2b2b192b,
-    0x082b2b0808080808, 0x082b2b0808082b08, 0x082b2b0808082b2b, 0x082b2b08082b0808,
-    0x082b2b0819191919, 0x082b2b082b082b08, 0x082b2b082b2b082b, 0x082b2b19192b2b08,
-    0x082b2b192b190808, 0x082b2b2b08082b08, 0x082b2b2b082b0808, 0x082b2b2b2b08082b,
-    0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819, 0x1908080808081908,
-    0x190808080808192b, 0x1908080808082b19, 0x1908080808190808, 0x190808080819082b,
-    0x1908080808191919, 0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908,
-    0x1908080819080808, 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08,
-    0x1908080819082b2b, 0x1908080819190819, 0x1908080819191908, 0x19080808192b0808,
-    0x19080808192b1919, 0x190808082b080819, 0x190808082b081908, 0x190808082b190808,
-    0x1908081908080808, 0x190808190808082b, 0x1908081908081919, 0x1908081908082b08,
-    0x1908081908190819, 0x1908081908191908, 0x19080819082b0808, 0x1908081919080819,
-    0x1908081919081908, 0x1908081919190808, 0x190808192b080808, 0x190808192b081919,
-    0x190808192b2b082b, 0x1908082b08080819, 0x1908082b08081908, 0x1908082b08190808,
-    0x1908082b0819082b, 0x1908082b082b2b19, 0x1908082b19080808, 0x1908190808080808,
-    0x190819080808082b, 0x1908190808081919, 0x1908190808082b08, 0x1908190808190819,
-    0x1908190808191908, 0x1908190808192b19, 0x19081908082b0808, 0x1908190819080819,
-    0x1908190819081908, 0x1908190819190808, 0x190819082b080808, 0x190819082b191908,
-    0x1908191908080819, 0x1908191908081908, 0x1908191908190808, 0x19081919082b1908,
-    0x1908191919080808, 0x190819192b192b2b, 0x1908192b08080808, 0x1908192b08082b2b,
-    0x1908192b19081908, 0x1908192b19190808, 0x19082b0808080819, 0x19082b0808081908,
-    0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, 0x19082b0819191908,
-    0x19082b08192b082b, 0x19082b1908080808, 0x19082b1908190819, 0x19082b1919081908,
-    0x19082b1919190808, 0x19082b19192b2b19, 0x19082b2b08081908, 0x1919080808080808,
-    0x191908080808082b, 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819,
-    0x1919080808191908, 0x19190808082b0808, 0x19190808082b2b08, 0x1919080819080819,
-    0x1919080819081908, 0x1919080819190808, 0x191908082b080808, 0x1919081908080819,
-    0x1919081908081908, 0x1919081908190808, 0x1919081908191919, 0x1919081919080808,
-    0x191908191908082b, 0x1919082b08080808, 0x1919082b19081908, 0x1919082b2b2b2b2b,
-    0x1919190808080819, 0x1919190808081908, 0x1919190808190808, 0x19191908082b0819,
-    0x1919190819080808, 0x19191908192b0808, 0x191919082b080819, 0x191919082b2b0819,
-    0x1919191908080808, 0x1919191908082b08, 0x191919192b080808, 0x191919192b082b08,
-    0x1919192b082b0819, 0x1919192b192b2b08, 0x1919192b2b2b0819, 0x19192b0808080808,
-    0x19192b0808191908, 0x19192b0819080819, 0x19192b0819190808, 0x19192b082b192b19,
-    0x19192b1908192b2b, 0x19192b1919080808, 0x19192b191908082b, 0x19192b2b2b081919,
-    0x192b080808080819, 0x192b080808081908, 0x192b080808190808, 0x192b080819080808,
-    0x192b080819191908, 0x192b0808192b082b, 0x192b08082b08192b, 0x192b08082b2b2b19,
-    0x192b081908080808, 0x192b082b082b1908, 0x192b082b19082b2b, 0x192b082b2b19082b,
-    0x192b190808080808, 0x192b19080819192b, 0x192b191908190808, 0x192b191919080808,
-    0x192b191919081919, 0x192b19192b2b1908, 0x192b2b0808080819, 0x192b2b08192b2b2b,
-    0x192b2b19082b1919, 0x192b2b2b0808192b, 0x192b2b2b19191908, 0x192b2b2b192b082b,
-    0x2b08080808080808, 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08,
-    0x2b08080808190819, 0x2b08080808191908, 0x2b080808082b0808, 0x2b080808082b2b2b,
-    0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808082b080808,
-    0x2b0808082b08082b, 0x2b0808082b2b2b08, 0x2b0808082b2b2b2b, 0x2b08081908080819,
-    0x2b08081908081908, 0x2b0808190808192b, 0x2b08081908190808, 0x2b08081919080808,
-    0x2b08081919190819, 0x2b08081919192b19, 0x2b08082b08080808, 0x2b08082b082b0808,
-    0x2b08082b2b080808, 0x2b08082b2b08082b, 0x2b08082b2b2b0808, 0x2b08082b2b2b2b08,
-    0x2b08190808080819, 0x2b08190808081908, 0x2b08190808190808, 0x2b0819080819082b,
-    0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808, 0x2b0819082b082b19,
-    0x2b08191908080808, 0x2b08191919081908, 0x2b0819192b2b1919, 0x2b08192b08192b08,
-    0x2b08192b192b2b2b, 0x2b082b0808080808, 0x2b082b0808082b08, 0x2b082b08082b1919,
-    0x2b082b0819192b2b, 0x2b082b082b080808, 0x2b082b082b08082b, 0x2b082b082b2b2b08,
-    0x2b082b190808192b, 0x2b082b2b082b082b, 0x2b082b2b2b080808, 0x2b082b2b2b082b08,
-    0x2b082b2b2b19192b, 0x2b082b2b2b2b2b08, 0x2b19080808080819, 0x2b19080808081908,
-    0x2b19080808190808, 0x2b19080819080808, 0x2b1908081919192b, 0x2b1908082b081908,
-    0x2b19081908080808, 0x2b190819082b082b, 0x2b190819192b1908, 0x2b19082b1919192b,
-    0x2b19082b2b082b19, 0x2b19190808080808, 0x2b19190808081919, 0x2b19190819081908,
-    0x2b19190819190808, 0x2b19190819192b08, 0x2b191919082b2b19, 0x2b1919192b190808,
-    0x2b1919192b19082b, 0x2b19192b19080819, 0x2b192b0819190819, 0x2b192b082b2b192b,
-    0x2b192b1919082b19, 0x2b192b2b08191919, 0x2b192b2b192b0808, 0x2b2b080808080808,
-    0x2b2b08080808082b, 0x2b2b080808082b08, 0x2b2b080808082b2b, 0x2b2b0808082b0808,
-    0x2b2b0808082b2b2b, 0x2b2b08082b2b0808, 0x2b2b081919190819, 0x2b2b081919192b19,
-    0x2b2b08192b2b192b, 0x2b2b082b08080808, 0x2b2b082b0808082b, 0x2b2b082b08082b08,
-    0x2b2b082b082b2b2b, 0x2b2b082b2b080808, 0x2b2b082b2b2b0808, 0x2b2b190819080808,
-    0x2b2b19082b191919, 0x2b2b192b192b1919, 0x2b2b192b2b192b08, 0x2b2b2b0808082b2b,
-    0x2b2b2b08082b0808, 0x2b2b2b08082b082b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b0808,
-    0x2b2b2b082b2b2b08, 0x2b2b2b1908081908, 0x2b2b2b192b081908, 0x2b2b2b192b08192b,
-    0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
-};
-
-static const uint8_t ksigns_iq2xs[128] = {
-      0, 129, 130,   3, 132,   5,   6, 135, 136,   9,  10, 139,  12, 141, 142,  15,
-    144,  17,  18, 147,  20, 149, 150,  23,  24, 153, 154,  27, 156,  29,  30, 159,
-    160,  33,  34, 163,  36, 165, 166,  39,  40, 169, 170,  43, 172,  45,  46, 175,
-     48, 177, 178,  51, 180,  53,  54, 183, 184,  57,  58, 187,  60, 189, 190,  63,
-    192,  65,  66, 195,  68, 197, 198,  71,  72, 201, 202,  75, 204,  77,  78, 207,
-     80, 209, 210,  83, 212,  85,  86, 215, 216,  89,  90, 219,  92, 221, 222,  95,
-     96, 225, 226,  99, 228, 101, 102, 231, 232, 105, 106, 235, 108, 237, 238, 111,
-    240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
-};
-
-static const uint8_t kmask_iq2xs[8] = {1, 2, 4, 8, 16, 32, 64, 128};
-
-void quantize_row_iq2_xxs_reference(const float * restrict x, block_iq2_xxs * restrict y, int k) {
-    (void)x;
-    (void)y;
-    (void)k;
-    assert(k % QK_K == 0);
-    //fprintf(stderr, "=========================== %s: not implemented\n", __func__);
-}
-
-void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k) {
-    assert(k % QK_K == 0);
-    const int nb = k / QK_K;
-
-    uint32_t aux32[2];
-    const uint8_t * aux8 = (const uint8_t *)aux32;
-
-    for (int i = 0; i < nb; i++) {
-
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-
-        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
-            memcpy(aux32, x[i].qs + 4*ib32, 2*sizeof(uint32_t));
-            const float db = d * (0.5f + (aux32[1] >> 28)) * 0.25f;
-            for (int l = 0; l < 4; ++l) {
-                const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
-                const uint8_t  signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127];
-                for (int j = 0; j < 8; ++j) {
-                    y[j] = db * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
-                }
-                y += 8;
-            }
-        }
-    }
-}
-
-void quantize_row_iq2_xxs(const float * restrict x, void * restrict vy, int k) {
-    assert(k % QK_K == 0);
-    block_iq2_xxs * restrict y = vy;
-    quantize_row_iq2_xxs_reference(x, y, k);
-}
-
-size_t ggml_quantize_iq2_xxs(const float * src, void * dst, int n, int k, int64_t * hist) {
-    assert(k % QK_K == 0);
-    (void)hist; // TODO: collect histograms
-
-    for (int j = 0; j < n; j += k) {
-        block_iq2_xxs * restrict y = (block_iq2_xxs *)dst + j/QK_K;
-        quantize_row_iq2_xxs_reference(src + j, y, k);
-    }
-    return (n/QK_K*sizeof(block_iq2_xxs));
-}
-
-// ====================== 2.3125 bpw (de)-quantization
-
-void quantize_row_iq2_xs_reference(const float * restrict x, block_iq2_xs * restrict y, int k) {
-    (void)x;
-    (void)y;
-    (void)k;
-    assert(k % QK_K == 0);
-    //fprintf(stderr, "=========================== %s: not implemented\n", __func__);
-}
-
-void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, int k) {
-    assert(k % QK_K == 0);
-    const int nb = k / QK_K;
-
-    float db[2];
-
-    for (int i = 0; i < nb; i++) {
-
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-
-        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
-            db[0] = d * (0.5f + (x[i].scales[ib32] & 0xf)) * 0.25f;
-            db[1] = d * (0.5f + (x[i].scales[ib32] >>  4)) * 0.25f;
-            for (int l = 0; l < 4; ++l) {
-                const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (x[i].qs[4*ib32 + l] & 511));
-                const uint8_t  signs = ksigns_iq2xs[x[i].qs[4*ib32 + l] >> 9];
-                for (int j = 0; j < 8; ++j) {
-                    y[j] = db[l/2] * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
-                }
-                y += 8;
-            }
-        }
-    }
-}
-
-void quantize_row_iq2_xs(const float * restrict x, void * restrict vy, int k) {
-    assert(k % QK_K == 0);
-    block_iq2_xs * restrict y = vy;
-    quantize_row_iq2_xs_reference(x, y, k);
-}
-
-size_t ggml_quantize_iq2_xs(const float * src, void * dst, int n, int k, int64_t * hist) {
-    assert(k % QK_K == 0);
-    (void)hist; // TODO: collect histograms
-
-    for (int j = 0; j < n; j += k) {
-        block_iq2_xs * restrict y = (block_iq2_xs *)dst + j/QK_K;
-        quantize_row_iq2_xs_reference(src + j, y, k);
-    }
-    return (n/QK_K*sizeof(block_iq2_xs));
-}
-
 //===================================== Q8_K ==============================================

 void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) {
@@ -2678,9 +2362,7 @@ void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict
            x += QK_K;
            continue;
        }
-        //const float iscale = -128.f/max;
-        // We need this change for IQ2_XXS, else the AVX implementation becomes very awkward
-        const float iscale = -127.f/max;
+        const float iscale = -128.f/max;
        for (int j = 0; j < QK_K; ++j) {
            int v = nearest_int(iscale*x[j]);
            y[i].qs[j] = MIN(127, v);
@@ -7383,319 +7065,3 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
 }

 #endif
-
-static const int8_t keven_signs_q2xs[1024] = {
-     1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1, -1,  1, -1,  1,  1,  1,  1,  1, -1, -1, -1,  1,  1,  1,  1,  1,  1,
-     1,  1, -1,  1,  1,  1,  1, -1, -1,  1, -1,  1,  1,  1,  1,  1,  1, -1, -1,  1,  1,  1,  1,  1, -1, -1, -1,  1,  1,  1,  1, -1,
-     1,  1,  1, -1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1,  1,  1,  1, -1,  1, -1,  1,  1,  1,  1, -1, -1,  1, -1,  1,  1,  1, -1,
-     1,  1, -1, -1,  1,  1,  1,  1, -1,  1, -1, -1,  1,  1,  1, -1,  1, -1, -1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1,  1,
-     1,  1,  1,  1, -1,  1,  1, -1, -1,  1,  1,  1, -1,  1,  1,  1,  1, -1,  1,  1, -1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1, -1,
-     1,  1, -1,  1, -1,  1,  1,  1, -1,  1, -1,  1, -1,  1,  1, -1,  1, -1, -1,  1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1,  1,  1,
-     1,  1,  1, -1, -1,  1,  1,  1, -1,  1,  1, -1, -1,  1,  1, -1,  1, -1,  1, -1, -1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1,  1,
-     1,  1, -1, -1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1,  1,  1, -1, -1, -1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1, -1,
-     1,  1,  1,  1,  1, -1,  1, -1, -1,  1,  1,  1,  1, -1,  1,  1,  1, -1,  1,  1,  1, -1,  1,  1, -1, -1,  1,  1,  1, -1,  1, -1,
-     1,  1, -1,  1,  1, -1,  1,  1, -1,  1, -1,  1,  1, -1,  1, -1,  1, -1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1,  1,  1,
-     1,  1,  1, -1,  1, -1,  1,  1, -1,  1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1,  1,  1,
-     1,  1, -1, -1,  1, -1,  1, -1, -1,  1, -1, -1,  1, -1,  1,  1,  1, -1, -1, -1,  1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1, -1,
-     1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1,  1, -1, -1,  1, -1,  1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1, -1, -1,  1,  1,
-     1,  1, -1,  1, -1, -1,  1, -1, -1,  1, -1,  1, -1, -1,  1,  1,  1, -1, -1,  1, -1, -1,  1,  1, -1, -1, -1,  1, -1, -1,  1, -1,
-     1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1, -1, -1, -1,  1,  1,  1, -1,  1, -1, -1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1, -1,
-     1,  1, -1, -1, -1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1, -1,  1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1,  1,
-     1,  1,  1,  1,  1,  1, -1, -1, -1,  1,  1,  1,  1,  1, -1,  1,  1, -1,  1,  1,  1,  1, -1,  1, -1, -1,  1,  1,  1,  1, -1, -1,
-     1,  1, -1,  1,  1,  1, -1,  1, -1,  1, -1,  1,  1,  1, -1, -1,  1, -1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1, -1,  1,
-     1,  1,  1, -1,  1,  1, -1,  1, -1,  1,  1, -1,  1,  1, -1, -1,  1, -1,  1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1,  1, -1,  1,
-     1,  1, -1, -1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1, -1,  1,  1, -1, -1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1, -1,
-     1,  1,  1,  1, -1,  1, -1,  1, -1,  1,  1,  1, -1,  1, -1, -1,  1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1,  1, -1,  1,
-     1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1,  1, -1,  1,  1, -1, -1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1, -1,
-     1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1, -1, -1,  1, -1,  1,  1, -1,  1, -1, -1,  1, -1,  1, -1, -1,  1, -1, -1,  1, -1, -1,
-     1,  1, -1, -1, -1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1, -1,  1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1,  1,
-     1,  1,  1,  1,  1, -1, -1,  1, -1,  1,  1,  1,  1, -1, -1, -1,  1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1, -1, -1,  1,
-     1,  1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1,  1, -1, -1,  1,  1, -1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1, -1, -1, -1,
-     1,  1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1,  1, -1, -1,  1,  1, -1,  1, -1,  1, -1, -1,  1, -1, -1,  1, -1,  1, -1, -1, -1,
-     1,  1, -1, -1,  1, -1, -1,  1, -1,  1, -1, -1,  1, -1, -1, -1,  1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1,  1,
-     1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1, -1, -1, -1,  1,  1, -1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1, -1, -1, -1, -1,
-     1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1,  1, -1, -1, -1, -1,  1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1,  1,
-     1,  1,  1, -1, -1, -1, -1,  1, -1,  1,  1, -1, -1, -1, -1, -1,  1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1,  1,
-     1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1,  1,  1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1,
-};
-
-void ggml_vec_dot_iq2_xxs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
-    assert(n % QK_K == 0);
-
-    const block_iq2_xxs * restrict x = vx;
-    const block_q8_K    * restrict y = vy;
-
-    const int nb = n / QK_K;
-
-#if defined(__ARM_NEON)
-
-    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
-
-    uint32_t aux32[4];
-    const uint8_t * aux8 = (const uint8_t *)aux32;
-
-    ggml_int8x16x4_t q2u;
-    ggml_int8x16x4_t q2s;
-    ggml_int8x16x4_t q8b;
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t * restrict q2 = x[i].qs;
-        const int8_t   * restrict q8 = y[i].qs;
-        float sumf1 = 0, sumf2 = 0;
-        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
-            q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
-            memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8;
-            q2u.val[0] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[ 0])), vld1_s8((const void *)(iq2xxs_grid + aux8[ 1])));
-            q2u.val[1] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[ 2])), vld1_s8((const void *)(iq2xxs_grid + aux8[ 3])));
-            q2u.val[2] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[ 8])), vld1_s8((const void *)(iq2xxs_grid + aux8[ 9])));
-            q2u.val[3] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[10])), vld1_s8((const void *)(iq2xxs_grid + aux8[11])));
-            q2s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >>  0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >>  7) & 127))));
-            q2s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 21) & 127))));
-            q2s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[3] >>  0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[3] >>  7) & 127))));
-            q2s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[3] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[3] >> 21) & 127))));
-            q2u.val[0] = vmulq_s8(q2u.val[0], q2s.val[0]);
-            q2u.val[1] = vmulq_s8(q2u.val[1], q2s.val[1]);
-            q2u.val[2] = vmulq_s8(q2u.val[2], q2s.val[2]);
-            q2u.val[3] = vmulq_s8(q2u.val[3], q2s.val[3]);
-            const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[0], q8b.val[0]), q2u.val[1], q8b.val[1]);
-            const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[2], q8b.val[2]), q2u.val[3], q8b.val[3]);
-            sumf1 += vaddvq_s32(p1) * (0.5f + (aux32[1] >> 28));
-            sumf2 += vaddvq_s32(p2) * (0.5f + (aux32[3] >> 28));
-        }
-        sumf += d*(sumf1 + sumf2);
-    }
-    *s = 0.25f * sumf;
-
-#elif defined(__AVX2__)
-
-    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
-
-    uint32_t aux32[4];
-    const uint8_t * aux8 = (const uint8_t *)aux32;
-
-    __m256 accumf = _mm256_setzero_ps();
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t * restrict q2 = x[i].qs;
-        const int8_t   * restrict q8 = y[i].qs;
-        __m256i sumi1 = _mm256_setzero_si256();
-        __m256i sumi2 = _mm256_setzero_si256();
-        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
-            const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
-            const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
-            memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8;
-            const __m256i q2_1 = _mm256_set_epi64x(iq2xxs_grid[aux8[ 3]], iq2xxs_grid[aux8[ 2]], iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]);
-            const __m256i q2_2 = _mm256_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]], iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]);
-            const __m256i s2_1 = _mm256_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127],
-                                                   signs64[(aux32[1] >>  7) & 127], signs64[(aux32[1] >>  0) & 127]);
-            const __m256i s2_2 = _mm256_set_epi64x(signs64[(aux32[3] >> 21) & 127], signs64[(aux32[3] >> 14) & 127],
-                                                   signs64[(aux32[3] >>  7) & 127], signs64[(aux32[3] >>  0) & 127]);
-            const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1);
-            const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2);
-            const __m256i dot1  = _mm256_maddubs_epi16(q2_1, q8s_1);
-            const __m256i dot2  = _mm256_maddubs_epi16(q2_2, q8s_2);
-            const uint16_t ls1 = aux32[1] >> 28;
-            const uint16_t ls2 = aux32[3] >> 28;
-            const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1));
-            const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1));
-            sumi1 = _mm256_add_epi32(sumi1, p1);
-            sumi2 = _mm256_add_epi32(sumi2, p2);
-        }
-
-        accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
-
-    }
-
-    *s = 0.125f * hsum_float_8(accumf);
-
-#else
-
-    uint32_t aux32[2];
-    const uint8_t * aux8 = (const uint8_t *)aux32;
-
-    float sumf = 0.f;
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t * restrict q2 = x[i].qs;
-        const int8_t   * restrict q8 = y[i].qs;
-        int32_t bsum = 0;
-        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
-            memcpy(aux32, q2, 2*sizeof(uint32_t));
-            q2 += 4;
-            const uint32_t ls = 2*(aux32[1] >> 28) + 1;
-            int32_t sumi = 0;
-            for (int l = 0; l < 4; ++l) {
-                const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
-                const uint8_t  signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127];
-                for (int j = 0; j < 8; ++j) {
-                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            bsum += sumi * ls;
-        }
-        sumf += d * bsum;
-    }
-    *s = 0.125f * sumf;
-#endif
-}
-
-void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
-    assert(n % QK_K == 0);
-
-    const block_iq2_xs * restrict x = vx;
-    const block_q8_K   * restrict y = vy;
-
-    const int nb = n / QK_K;
-
-#if defined(__ARM_NEON)
-
-    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
-
-    int8x16x4_t q2u;
-    int8x16x4_t q2s;
-    int8x16x4_t q8b;
-
-    int32x4x4_t scales32;
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t * restrict q2 = x[i].qs;
-        const int8_t   * restrict q8 = y[i].qs;
-        const uint8x8_t scales8 = vld1_u8(x[i].scales);
-        const uint8x8_t scales_l = vand_u8(scales8, vdup_n_u8(0xf));
-        const uint8x8_t scales_h = vshr_n_u8(scales8, 4);
-        uint8x16_t scales = vcombine_u8(vzip1_u8(scales_l, scales_h), vzip2_u8(scales_l, scales_h));
-        scales = vaddq_u8(vshlq_n_u8(scales, 1), vdupq_n_u8(1));
-        const uint16x8_t scales1 = vmovl_u8(vget_low_u8(scales));
-        const uint16x8_t scales2 = vmovl_u8(vget_high_u8(scales));
-        scales32.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(scales1)));
-        scales32.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales1)));
-        scales32.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(scales2)));
-        scales32.val[3] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales2)));
-        int32x4_t sumi = vdupq_n_s32(0);
-        for (int ib64 = 0; ib64 < QK_K/64; ++ib64) {
-            q8b = vld1q_s8_x4(q8); q8 += 64;
-            q2u.val[0] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[0] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[1] & 511))));
-            q2u.val[1] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[2] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[3] & 511))));
-            q2u.val[2] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[4] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[5] & 511))));
-            q2u.val[3] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[6] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[7] & 511))));
-            q2s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[0] >> 9))), vld1_s8((const void *)(signs64 + (q2[1] >> 9))));
-            q2s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[2] >> 9))), vld1_s8((const void *)(signs64 + (q2[3] >> 9))));
-            q2s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[4] >> 9))), vld1_s8((const void *)(signs64 + (q2[5] >> 9))));
-            q2s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[6] >> 9))), vld1_s8((const void *)(signs64 + (q2[7] >> 9))));
-            q2u.val[0] = vmulq_s8(q2u.val[0], q2s.val[0]);
-            q2u.val[1] = vmulq_s8(q2u.val[1], q2s.val[1]);
-            q2u.val[2] = vmulq_s8(q2u.val[2], q2s.val[2]);
-            q2u.val[3] = vmulq_s8(q2u.val[3], q2s.val[3]);
-            const int32x4_t p1 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[0], q8b.val[0]);
-            const int32x4_t p2 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[1], q8b.val[1]);
-            const int32x4_t p3 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[2], q8b.val[2]);
-            const int32x4_t p4 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[3], q8b.val[3]);
-            const int32x4_t p = vpaddq_s32(vpaddq_s32(p1, p2), vpaddq_s32(p3, p4));
-            sumi = vmlaq_s32(sumi, p, scales32.val[ib64]);
-            q2 += 8;
-        }
-        sumf += d*vaddvq_s32(sumi);
-    }
-    *s = 0.125f * sumf;
-
-#elif defined(__AVX2__)
-
-    const __m128i m4 = _mm_set1_epi8(0xf);
-    const __m128i m1 = _mm_set1_epi8(1);
-    const __m128i m511 = _mm_set1_epi16(511);
-    const __m128i m127 = _mm_set1_epi16(127);
-
-    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
-
-    uint64_t aux64;
-
-    // somewhat hacky, but gives a significant boost in performance
-    __m128i aux_gindex, aux_sindex;
-    const uint16_t * gindex = (const uint16_t *)&aux_gindex;
-    const uint16_t * sindex = (const uint16_t *)&aux_sindex;
-
-    __m256 accumf = _mm256_setzero_ps();
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t * restrict q2 = x[i].qs;
-        const int8_t   * restrict q8 = y[i].qs;
-
-        memcpy(&aux64, x[i].scales, 8);
-        __m128i stmp = _mm_set1_epi64x(aux64);
-        stmp = _mm_unpacklo_epi8(_mm_and_si128(stmp, m4), _mm_and_si128(_mm_srli_epi16(stmp, 4), m4));
-        const __m128i scales = _mm_add_epi8(_mm_slli_epi16(stmp, 1), m1);
-
-        __m256i sumi1 = _mm256_setzero_si256();
-        __m256i sumi2 = _mm256_setzero_si256();
-        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
-            const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
-            const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
-            const __m128i q2_data = _mm_loadu_si128((const __m128i*)q2);  q2 +=  8;
-            aux_gindex = _mm_and_si128(q2_data, m511);
-            aux_sindex = _mm_and_si128(_mm_srli_epi16(q2_data, 9), m127);
-            const __m256i q2_1 = _mm256_set_epi64x(iq2xs_grid[gindex[3]], iq2xs_grid[gindex[2]], iq2xs_grid[gindex[1]], iq2xs_grid[gindex[0]]);
-            const __m256i q2_2 = _mm256_set_epi64x(iq2xs_grid[gindex[7]], iq2xs_grid[gindex[6]], iq2xs_grid[gindex[5]], iq2xs_grid[gindex[4]]);
-            const __m256i s2_1 = _mm256_set_epi64x(signs64[sindex[3]], signs64[sindex[2]], signs64[sindex[1]], signs64[sindex[0]]);
-            const __m256i s2_2 = _mm256_set_epi64x(signs64[sindex[7]], signs64[sindex[6]], signs64[sindex[5]], signs64[sindex[4]]);
-            const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1);
-            const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2);
-            const __m256i dot1  = _mm256_maddubs_epi16(q2_1, q8s_1);
-            const __m256i dot2  = _mm256_maddubs_epi16(q2_2, q8s_2);
-
-            const __m256i sc1 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+0)));
-            const __m256i sc2 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+1)));
-
-            sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot1, sc1));
-            sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot2, sc2));
-        }
-
-        accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
-
-    }
-
-    *s = 0.125f * hsum_float_8(accumf);
-
-#else
-
-    float sumf = 0.f;
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t * restrict q2 = x[i].qs;
-        const uint8_t  * restrict sc = x[i].scales;
-        const int8_t   * restrict q8 = y[i].qs;
-        int32_t bsum = 0;
-        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
-            const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1;
-            const uint16_t ls2 = 2*(sc[ib32] >>  4) + 1;
-            int32_t sumi = 0;
-            for (int l = 0; l < 2; ++l) {
-                const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
-                const uint8_t  signs = ksigns_iq2xs[q2[l] >> 9];
-                for (int j = 0; j < 8; ++j) {
-                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            bsum += sumi * ls1;
-            sumi = 0;
-            for (int l = 2; l < 4; ++l) {
-                const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
-                const uint8_t  signs = ksigns_iq2xs[q2[l] >> 9];
-                for (int j = 0; j < 8; ++j) {
-                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            bsum += sumi * ls2;
-            q2 += 4;
-        }
-        sumf += d * bsum;
-    }
-    *s = 0.125f * sumf;
-#endif
-}
--- a/ggml-quants.h
+++ b/ggml-quants.h
@@ -70,7 +70,7 @@ static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block s
 // 2-bit quantization
 // weight is represented as x = a * q + b
 // 16 blocks of 16 elements each
-// Effectively 2.625 bits per weight
+// Effectively 2.5625 bits per weight
 typedef struct {
    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
    uint8_t qs[QK_K/4];      // quants
@@ -165,22 +165,6 @@ typedef struct {
 } block_q8_K;
 static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding");

-// (Almost) "true" 2-bit quantization.
-// Due to the need to use blocks as per ggml dsign, it ends up using
-// 2.0625 bpw because of the 16-bit scale for each block of 256.
-typedef struct {
-    ggml_fp16_t d;
-    uint16_t qs[QK_K/8];
-} block_iq2_xxs;
-static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding");
-
-// 2.3125 bpw quants
-typedef struct {
-    ggml_fp16_t d;
-    uint16_t qs[QK_K/8];
-    uint8_t  scales[QK_K/32];
-} block_iq2_xs;
-static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");

 // Quantization
 void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k);
@@ -196,8 +180,6 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
 void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
 void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
 void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
-void quantize_row_iq2_xxs_reference(const float * restrict x, block_iq2_xxs * restrict y, int k);
-void quantize_row_iq2_xs_reference (const float * restrict x, block_iq2_xs  * restrict y, int k);

 void quantize_row_q4_0(const float * restrict x, void * restrict y, int k);
 void quantize_row_q4_1(const float * restrict x, void * restrict y, int k);
@@ -212,8 +194,6 @@ void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
 void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
 void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
 void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
-void quantize_row_iq2_xxs(const float * restrict x, void * restrict y, int k);
-void quantize_row_iq2_xs (const float * restrict x, void * restrict y, int k);

 // Dequantization
 void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k);
@@ -229,8 +209,6 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int
 void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k);
 void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k);
 void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);
-void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k);
-void dequantize_row_iq2_xs (const block_iq2_xs  * restrict x, float * restrict y, int k);

 // Dot product
 void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
@@ -244,5 +222,3 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx,
 void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_iq2_xs_q8_K (int n, float * restrict s, const void * restrict vx, const void * restrict vy);
--- a/ggml.c
+++ b/ggml.c
@@ -132,7 +132,7 @@ void ggml_print_backtrace(void) {
            "-ex", "bt -frame-info source-and-location",
            "-ex", "detach",
            "-ex", "quit",
-            (char *) NULL);
+            NULL);
    } else {
        waitpid(pid, NULL, 0);
    }
@@ -394,12 +394,6 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y);
 static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y);

-ggml_collect_imatrix_t g_imatrix_collect = NULL;
-
-void ggml_set_imatrix_collection(ggml_collect_imatrix_t imatrix_collect) {
-    g_imatrix_collect = imatrix_collect;
-}
-
 static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
    [GGML_TYPE_I8] = {
        .type_name                = "i8",
@@ -579,28 +573,6 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .vec_dot                  = ggml_vec_dot_q6_K_q8_K,
        .vec_dot_type             = GGML_TYPE_Q8_K,
    },
-    [GGML_TYPE_IQ2_XXS] = {
-        .type_name                = "iq2_xxs",
-        .blck_size                = QK_K,
-        .type_size                = sizeof(block_iq2_xxs),
-        .is_quantized             = true,
-        .to_float                 = (ggml_to_float_t) dequantize_row_iq2_xxs,
-        .from_float               = quantize_row_iq2_xxs,
-        .from_float_reference     = (ggml_from_float_t) quantize_row_iq2_xxs_reference,
-        .vec_dot                  = ggml_vec_dot_iq2_xxs_q8_K,
-        .vec_dot_type             = GGML_TYPE_Q8_K,
-    },
-    [GGML_TYPE_IQ2_XS] = {
-        .type_name                = "iq2_xs",
-        .blck_size                = QK_K,
-        .type_size                = sizeof(block_iq2_xs),
-        .is_quantized             = true,
-        .to_float                 = (ggml_to_float_t) dequantize_row_iq2_xs,
-        .from_float               = quantize_row_iq2_xs,
-        .from_float_reference     = (ggml_from_float_t) quantize_row_iq2_xs_reference,
-        .vec_dot                  = ggml_vec_dot_iq2_xs_q8_K,
-        .vec_dot_type             = GGML_TYPE_Q8_K,
-    },
    [GGML_TYPE_Q8_K] = {
        .type_name                = "q8_K",
        .blck_size                = QK_K,
@@ -2139,8 +2111,6 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
        case GGML_FTYPE_MOSTLY_Q4_K:          wtype = GGML_TYPE_Q4_K;  break;
        case GGML_FTYPE_MOSTLY_Q5_K:          wtype = GGML_TYPE_Q5_K;  break;
        case GGML_FTYPE_MOSTLY_Q6_K:          wtype = GGML_TYPE_Q6_K;  break;
-        case GGML_FTYPE_MOSTLY_IQ2_XXS:       wtype = GGML_TYPE_IQ2_XXS;  break;
-        case GGML_FTYPE_MOSTLY_IQ2_XS:        wtype = GGML_TYPE_IQ2_XS;   break;
        case GGML_FTYPE_UNKNOWN:              wtype = GGML_TYPE_COUNT; break;
        case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
    }
@@ -4329,13 +4299,13 @@ struct ggml_tensor * ggml_set_2d_inplace(
 static struct ggml_tensor * ggml_cpy_impl(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
-        struct ggml_tensor  * b) {
+        struct ggml_tensor  * b,
+        bool inplace) {
    GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b));

    bool is_node = false;

-    if (a->grad || b->grad) {
-        // inplace is false and either one have a grad
+    if (!inplace && (a->grad || b->grad)) {
        is_node = true;
    }

@@ -4359,21 +4329,29 @@ struct ggml_tensor * ggml_cpy(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        struct ggml_tensor * b) {
-    return ggml_cpy_impl(ctx, a, b);
+    return ggml_cpy_impl(ctx, a, b, false);
+}
+
+struct ggml_tensor * ggml_cpy_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b) {
+    return ggml_cpy_impl(ctx, a, b, true);
 }

 // ggml_cont

 static struct ggml_tensor * ggml_cont_impl(
        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
+        struct ggml_tensor  * a,
+        bool inplace) {
    bool is_node = false;

-    if (a->grad) {
+    if (!inplace && a->grad) {
        is_node = true;
    }

-    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
    ggml_format_name(result, "%s (cont)", a->name);

    result->op   = GGML_OP_CONT;
@@ -4386,7 +4364,13 @@ static struct ggml_tensor * ggml_cont_impl(
 struct ggml_tensor * ggml_cont(
        struct ggml_context * ctx,
        struct ggml_tensor * a) {
-    return ggml_cont_impl(ctx, a);
+    return ggml_cont_impl(ctx, a, false);
+}
+
+struct ggml_tensor * ggml_cont_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a) {
+    return ggml_cont_impl(ctx, a, true);
 }

 // make contiguous, with new shape
@@ -4782,11 +4766,8 @@ struct ggml_tensor * ggml_get_rows(
    }

    // TODO: implement non F32 return
-    enum ggml_type type = GGML_TYPE_F32;
-    if (a->type == GGML_TYPE_I32) {
-        type = a->type;
-    }
-    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, type, a->ne[0], b->ne[0], b->ne[1], b->ne[2]);
+    //struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]);
+    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, a->ne[0], b->ne[0], b->ne[1], b->ne[2]);

    result->op   = GGML_OP_GET_ROWS;
    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6957,165 +6938,14 @@ static void ggml_compute_forward_dup_f32(
    }
 }

-// A simplified version of ggml_compute_forward_dup that doesn't do float upcasting, and just plain old memcpy.
-static void ggml_compute_forward_dup_bytes(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
-    GGML_ASSERT(src0->type == dst->type);
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) {
-        ggml_compute_forward_dup_same_cont(params, src0, dst);
-        return;
-    }
-
-    GGML_TENSOR_UNARY_OP_LOCALS;
-
-    const size_t type_size = ggml_type_size(src0->type);
-    const int ith = params->ith; // thread index
-    const int nth = params->nth; // number of threads
-
-
-    // parallelize by rows
-    const int nr = ne01;
-    // number of rows per thread
-    const int dr = (nr + nth - 1) / nth;
-    // row range for this thread
-    const int ir0 = dr * ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    if (src0->type == dst->type &&
-        ne00 == ne0 &&
-        nb00 == type_size && nb0 == type_size) {
-        // copy by rows
-        const size_t rs = ne00 * type_size;
-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                    memcpy(
-                        ((char *)  dst->data + i01*nb1  + i02*nb2  + i03*nb3),
-                        ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03),
-                        rs);
-                }
-            }
-        }
-        return;
-    }
-
-    if (ggml_is_contiguous(dst)) {
-        size_t id = 0;
-        char * dst_ptr = (char *) dst->data;
-        const size_t rs = ne00 * type_size;
-
-        if (nb00 == type_size) {
-            // src0 is contigous on first dimension, copy by rows
-            for (int64_t i03 = 0; i03 < ne03; i03++) {
-                for (int64_t i02 = 0; i02 < ne02; i02++) {
-                    id += rs * ir0;
-                    for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                        const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
-                        memcpy(dst_ptr + id, src0_ptr, rs);
-                        id += rs;
-                    }
-                    id += rs * (ne01 - ir1);
-                }
-            }
-        } else {
-            //printf("%s: this is not optimal - fix me\n", __func__);
-
-            for (int64_t i03 = 0; i03 < ne03; i03++) {
-                for (int64_t i02 = 0; i02 < ne02; i02++) {
-                    id += rs * ir0;
-                    for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                        for (int64_t i00 = 0; i00 < ne00; i00++) {
-                            const char * src0_ptr = (char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03;
-                            memcpy(dst_ptr + id, src0_ptr, type_size);
-
-                            id += type_size;
-                        }
-                    }
-                    id += rs * (ne01 - ir1);
-                }
-            }
-        }
-
-        return;
-    }
-
-    // dst counters
-
-    int64_t i10 = 0;
-    int64_t i11 = 0;
-    int64_t i12 = 0;
-    int64_t i13 = 0;
-
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
-            i10 += ne00 * ir0;
-            while (i10 >= ne0) {
-                i10 -= ne0;
-                if (++i11 == ne1) {
-                    i11 = 0;
-                    if (++i12 == ne2) {
-                        i12 = 0;
-                        if (++i13 == ne3) {
-                            i13 = 0;
-                        }
-                    }
-                }
-            }
-            for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                for (int64_t i00 = 0; i00 < ne00; i00++) {
-                    const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-                          char * dst_ptr  = ((char *)  dst->data + i10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);
-
-                    memcpy(dst_ptr, src0_ptr, type_size);
-
-                    if (++i10 == ne0) {
-                        i10 = 0;
-                        if (++i11 == ne1) {
-                            i11 = 0;
-                            if (++i12 == ne2) {
-                                i12 = 0;
-                                if (++i13 == ne3) {
-                                    i13 = 0;
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-            i10 += ne00 * (ne01 - ir1);
-            while (i10 >= ne0) {
-                i10 -= ne0;
-                if (++i11 == ne1) {
-                    i11 = 0;
-                    if (++i12 == ne2) {
-                        i12 = 0;
-                        if (++i13 == ne3) {
-                            i13 = 0;
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
 static void ggml_compute_forward_dup(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
        struct ggml_tensor * dst) {
-    if (src0->type == dst->type) {
-        ggml_compute_forward_dup_bytes(params, src0, dst);
+    if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
+        ggml_compute_forward_dup_same_cont(params, src0, dst);
        return;
    }
-
    switch (src0->type) {
        case GGML_TYPE_F16:
            {
@@ -7452,8 +7282,6 @@ static void ggml_compute_forward_add(
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
            {
                ggml_compute_forward_add_q_f32(params, src0, src1, dst);
            } break;
@@ -7718,8 +7546,6 @@ static void ggml_compute_forward_add1(
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
            {
                ggml_compute_forward_add1_q_f32(params, src0, src1, dst);
            } break;
@@ -7834,8 +7660,6 @@ static void ggml_compute_forward_acc(
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
        default:
            {
                GGML_ASSERT(false);
@@ -8580,12 +8404,10 @@ static void ggml_compute_forward_repeat(
        struct ggml_tensor * dst) {
    switch (src0->type) {
        case GGML_TYPE_F16:
-        case GGML_TYPE_I16:
            {
                ggml_compute_forward_repeat_f16(params, src0, dst);
            } break;
        case GGML_TYPE_F32:
-        case GGML_TYPE_I32:
            {
                ggml_compute_forward_repeat_f32(params, src0, dst);
            } break;
@@ -8728,7 +8550,6 @@ static void ggml_compute_forward_concat(
    struct ggml_tensor* dst) {
    switch (src0->type) {
        case GGML_TYPE_F32:
-        case GGML_TYPE_I32:
            {
                ggml_compute_forward_concat_f32(params, src0, src1, dst);
            } break;
@@ -9726,10 +9547,10 @@ static void ggml_compute_forward_group_norm(
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
 // helper function to determine if it is better to use BLAS or not
 // for large matrices, BLAS is faster
-static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
+static bool ggml_compute_forward_mul_mat_use_blas(
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+              struct ggml_tensor * dst) {
    //const int64_t ne00 = src0->ne[0];
    //const int64_t ne01 = src0->ne[1];

@@ -9769,10 +9590,6 @@ static void ggml_compute_forward_mul_mat(
    const int ith = params->ith;
    const int nth = params->nth;

-    if (ith == 1 && g_imatrix_collect) {
-        g_imatrix_collect(src0, src1);
-    }
-
    const enum ggml_type type = src0->type;

    const bool src1_cont = ggml_is_contiguous(src1);
@@ -9813,7 +9630,7 @@ static void ggml_compute_forward_mul_mat(
 #endif

 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-    if (ggml_compute_forward_mul_mat_use_blas(dst)) {
+    if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
        if (params->ith != 0) {
            return;
        }
@@ -10076,10 +9893,6 @@ static void ggml_compute_forward_mul_mat_id(

        const struct ggml_tensor * src0_cur = dst->src[cur_a + 2];

-        if (ith == 1 && g_imatrix_collect) {
-            g_imatrix_collect(src0_cur, src1);
-        }
-
        const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
        const size_t row_size = ggml_row_size(vec_dot_type, ne10);

@@ -10485,8 +10298,6 @@ static void ggml_compute_forward_out_prod(
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
            {
                ggml_compute_forward_out_prod_q_f32(params, src0, src1, dst);
            } break;
@@ -10661,8 +10472,6 @@ static void ggml_compute_forward_set(
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
        default:
            {
                GGML_ASSERT(false);
@@ -10857,8 +10666,6 @@ static void ggml_compute_forward_get_rows(
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
            {
                ggml_compute_forward_get_rows_q(params, src0, src1, dst);
            } break;
@@ -10867,7 +10674,6 @@ static void ggml_compute_forward_get_rows(
                ggml_compute_forward_get_rows_f16(params, src0, src1, dst);
            } break;
        case GGML_TYPE_F32:
-        case GGML_TYPE_I32:
            {
                ggml_compute_forward_get_rows_f32(params, src0, src1, dst);
            } break;
@@ -11495,8 +11301,6 @@ static void ggml_compute_forward_alibi(
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_Q8_K:
        case GGML_TYPE_I8:
        case GGML_TYPE_I16:
@@ -11571,8 +11375,6 @@ static void ggml_compute_forward_clamp(
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_Q8_K:
        case GGML_TYPE_I8:
        case GGML_TYPE_I16:
@@ -16341,6 +16143,24 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {

                //n_tasks = MIN(n_threads, MAX(1, nr0/128));
                //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks);
+
+#if defined(GGML_USE_CUBLAS)
+                if (ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) {
+                    n_tasks = 1; // TODO: this actually is doing nothing
+                                 //       the threads are still spinning
+                }
+#elif defined(GGML_USE_CLBLAST)
+                if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
+                    n_tasks = 1; // TODO: this actually is doing nothing
+                                 //       the threads are still spinning
+                }
+#endif
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+                if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
+                    n_tasks = 1; // TODO: this actually is doing nothing
+                                 //       the threads are still spinning
+                }
+#endif
            } break;
        case GGML_OP_MUL_MAT_ID:
            {
@@ -16513,7 +16333,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
            state->shared->node_n += 1;
            return (thread_ret_t) GGML_EXIT_ABORTED;
        }
-
        if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
            // all other threads are finished and spinning
            // do finalize and init here so we don't have synchronize again
@@ -16579,18 +16398,14 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
        } else {
            // wait for other threads to finish
            const int last = node_n;
-
-            const bool do_yield = last < 0 || cgraph->nodes[last]->op == GGML_OP_MUL_MAT;
-
            while (true) {
                // TODO: this sched_yield can have significant impact on the performance - either positive or negative
                //       depending on the workload and the operating system.
                //       since it is not clear what is the best approach, it should potentially become user-configurable
                //       ref: https://github.com/ggerganov/ggml/issues/291
-                // UPD:  adding the do_yield flag seems to resolve the issue universally
-                if (do_yield) {
-                    sched_yield();
-                }
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+                sched_yield();
+#endif

                node_n = atomic_load(&state->shared->node_n);
                if (node_n != last) break;
@@ -16669,7 +16484,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                    } else
 #endif
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-                    if (ggml_compute_forward_mul_mat_use_blas(node)) {
+                    if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
                        if (node->src[0]->type != GGML_TYPE_F32) {
                            // here we need memory just for single 2D matrix from src0
                            cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
@@ -18688,18 +18503,6 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
                block_q6_K * block = (block_q6_K*)dst + start / QK_K;
                result = ggml_quantize_q6_K(src + start, block, n, n, hist);
            } break;
-        case GGML_TYPE_IQ2_XXS:
-            {
-                GGML_ASSERT(start % QK_K == 0);
-                block_iq2_xxs * block = (block_iq2_xxs*)dst + start / QK_K;
-                result = ggml_quantize_iq2_xxs(src + start, block, n, n, hist);
-            } break;
-        case GGML_TYPE_IQ2_XS:
-            {
-                GGML_ASSERT(start % QK_K == 0);
-                block_iq2_xs * block = (block_iq2_xs*)dst + start / QK_K;
-                result = ggml_quantize_iq2_xs(src + start, block, n, n, hist);
-            } break;
        case GGML_TYPE_F16:
            {
                int elemsize = sizeof(ggml_fp16_t);
@@ -19055,8 +18858,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                (int64_t) info->ne[3];

            if (ne % ggml_blck_size(info->type) != 0) {
-                fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%d)\n",
-                        __func__, info->name.data, (int)info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type));
+                fprintf(stderr, "%s: tensor '%s' number of elements (%" PRId64 ") is not a multiple of block size (%d)\n",
+                        __func__, info->name.data, ne, ggml_blck_size(info->type));
                fclose(file);
                gguf_free(ctx);
                return NULL;
--- a/ggml.h
+++ b/ggml.h
@@ -218,9 +218,7 @@
 #define GGML_MAX_PARAMS         2048
 #define GGML_MAX_CONTEXTS       64
 #define GGML_MAX_SRC            10
-#ifndef GGML_MAX_NAME
 #define GGML_MAX_NAME           64
-#endif
 #define GGML_MAX_OP_PARAMS      64
 #define GGML_DEFAULT_N_THREADS  4
 #define GGML_DEFAULT_GRAPH_SIZE 2048
@@ -341,8 +339,6 @@ extern "C" {
        GGML_TYPE_Q5_K = 13,
        GGML_TYPE_Q6_K = 14,
        GGML_TYPE_Q8_K = 15,
-        GGML_TYPE_IQ2_XXS = 16,
-        GGML_TYPE_IQ2_XS  = 17,
        GGML_TYPE_I8,
        GGML_TYPE_I16,
        GGML_TYPE_I32,
@@ -377,8 +373,6 @@ extern "C" {
        GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
        GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
        GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
-        GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors
-        GGML_FTYPE_MOSTLY_IQ2_XS  = 16, // except 1d tensors
    };

    // available tensor operations:
@@ -1165,11 +1159,22 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);

+    // a -> b, in-place, return view(b)
+    GGML_API struct ggml_tensor * ggml_cpy_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
    // make contiguous
    GGML_API struct ggml_tensor * ggml_cont(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

+    // make contiguous, in-place
+    GGML_API struct ggml_tensor * ggml_cont_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
    // make contiguous, with new shape
    GGML_API struct ggml_tensor * ggml_cont_1d(
            struct ggml_context * ctx,
@@ -2062,17 +2067,9 @@ extern "C" {
    GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
    GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
    GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_iq2_xxs(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_iq2_xs (const float * src, void * dst, int n, int k, int64_t * hist);

    GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);

-    //
-    // Importance matrix
-    //
-    typedef void(*ggml_collect_imatrix_t)(const struct ggml_tensor * src0, const struct ggml_tensor * src1);
-    GGML_API void ggml_set_imatrix_collection(ggml_collect_imatrix_t imatrix_collect);
-
    //
    // gguf
    //
--- a/llama.cpp
+++ b/llama.cpp
@@ -68,7 +68,6 @@
 #include <initializer_list>
 #include <map>
 #include <memory>
-#include <mutex>
 #include <numeric>
 #include <queue>
 #include <random>
@@ -1903,28 +1902,6 @@ static void llama_kv_cache_seq_shift(
    cache.head = new_head != cache.size ? new_head : 0;
 }

-static void llama_kv_cache_seq_div(
-        struct llama_kv_cache & cache,
-                 llama_seq_id   seq_id,
-                    llama_pos   p0,
-                    llama_pos   p1,
-                          int   d) {
-    if (p0 < 0) p0 = 0;
-    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
-
-    for (uint32_t i = 0; i < cache.size; ++i) {
-        if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
-            cache.has_shift = true;
-
-            {
-                llama_pos p_old = cache.cells[i].pos;
-                cache.cells[i].pos   /= d;
-                cache.cells[i].delta += cache.cells[i].pos - p_old;
-            }
-        }
-    }
-}
-
 //
 // model loading and saving
 //
@@ -2202,11 +2179,7 @@ struct llama_model_loader {
                    type_max   = type;
                }

-                // TODO: make runtime configurable
-#if 0
-                struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
-                LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(meta), ggml_type_name(type), llama_format_tensor_shape(meta).c_str());
-#endif
+                // LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, name, ggml_type_name(meta->type), llama_format_tensor_shape(meta).c_str());
            }

            switch (type_max) {
@@ -2222,8 +2195,6 @@ struct llama_model_loader {
                case GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break;
                case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
                case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K;   break;
-                case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
-                case GGML_TYPE_IQ2_XS:  ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS;  break;
                default:
                    {
                        LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@@ -2586,8 +2557,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
        case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";

        // K-quants
-        case LLAMA_FTYPE_MOSTLY_Q2_K:   return "Q2_K - Medium";
-        case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q2_K:   return "Q2_K";
        case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
        case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
        case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
@@ -2596,8 +2566,6 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
        case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
        case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
        case LLAMA_FTYPE_MOSTLY_Q6_K:   return "Q6_K";
-        case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XSS - 2.0625 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";

        default: return "unknown, may not work";
    }
@@ -2832,7 +2800,6 @@ static void llm_load_hparams(
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);

                switch (hparams.n_layer) {
-                    case 24: model.type = e_model::MODEL_1B; break;
                    case 32: model.type = e_model::MODEL_3B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
@@ -3149,15 +3116,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
    LLAMA_LOG_INFO("%s: rope_finetuned   = %s\n",     __func__, hparams.rope_finetuned ? "yes" : "unknown");
    LLAMA_LOG_INFO("%s: model type       = %s\n",     __func__, llama_model_type_name(model.type));
    LLAMA_LOG_INFO("%s: model ftype      = %s\n",     __func__, llama_model_ftype_name(model.ftype).c_str());
-    if (ml.n_elements >= 1e12) {
-        LLAMA_LOG_INFO("%s: model params     = %.2f T\n", __func__, ml.n_elements*1e-12);
-    } else if (ml.n_elements >= 1e9) {
-        LLAMA_LOG_INFO("%s: model params     = %.2f B\n", __func__, ml.n_elements*1e-9);
-    } else if (ml.n_elements >= 1e6) {
-        LLAMA_LOG_INFO("%s: model params     = %.2f M\n", __func__, ml.n_elements*1e-6);
-    } else {
-        LLAMA_LOG_INFO("%s: model params     = %.2f K\n", __func__, ml.n_elements*1e-3);
-    }
+    LLAMA_LOG_INFO("%s: model params     = %.2f B\n", __func__, ml.n_elements*1e-9);
    if (ml.n_bytes < GiB) {
        LLAMA_LOG_INFO("%s: model size       = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0,        ml.n_bytes*8.0/ml.n_elements);
    } else {
@@ -4812,6 +4771,7 @@ struct llm_build_context {
        const int64_t n_embd_head = hparams.n_embd_head_v;
        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_gqa  == n_embd);

        struct ggml_tensor * cur;
        struct ggml_tensor * inpL;
@@ -4935,6 +4895,7 @@ struct llm_build_context {
        const int64_t n_embd_head = hparams.n_embd_head_v;
        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_gqa  == n_embd);

        struct ggml_tensor * cur;
        struct ggml_tensor * pos;
@@ -5033,7 +4994,9 @@ struct llm_build_context {
        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_gqa  == n_embd);

        const int64_t n_rot = n_embd_head_k / 2;

@@ -5245,7 +5208,9 @@ struct llm_build_context {
        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_gqa  == n_embd);

        struct ggml_tensor * cur;
        struct ggml_tensor * inpL;
@@ -5338,6 +5303,7 @@ struct llm_build_context {
        const int64_t n_embd_head = hparams.n_embd_head_v;
        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_gqa  == n_embd);

        struct ggml_tensor * cur;
        struct ggml_tensor * inpL;
@@ -5433,6 +5399,7 @@ struct llm_build_context {
        const int64_t n_embd_head = hparams.n_embd_head_v;
        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_gqa  == n_embd);

        struct ggml_tensor * cur;
        struct ggml_tensor * inpL;
@@ -5759,6 +5726,7 @@ struct llm_build_context {
        const int64_t n_embd_head = hparams.n_embd_head_v;
        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_gqa  == n_embd);

        struct ggml_tensor * cur;
        struct ggml_tensor * attn_norm_output;
@@ -5982,6 +5950,7 @@ struct llm_build_context {
        const int64_t n_embd_head = hparams.n_embd_head_v;
        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_gqa  == n_embd);

        struct ggml_tensor * cur;
        struct ggml_tensor * pos;
@@ -8060,7 +8029,7 @@ void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * c
    }
 }

-void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int32_t k, size_t min_keep) {
+void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep) {
    const int64_t t_start_sample_us = ggml_time_us();

    k = std::max(k, (int) min_keep);
@@ -8420,7 +8389,7 @@ void llama_sample_classifier_free_guidance(
    }
 }

-llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
+llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) {
    GGML_ASSERT(ctx);

    auto N = float(llama_n_vocab(llama_get_model(ctx)));
@@ -8956,13 +8925,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
            // TODO: explore better strategies
            new_type = GGML_TYPE_Q8_0;
        }
-    } else if (name.find("ffn_down") != std::string::npos) {
+    } else if (name.find("ffn_down.weight") != std::string::npos) {
        if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
-            if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q4_K;
-        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-            new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q5_K
+            new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
                     : arch != LLM_ARCH_FALCON || use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q4_K
                     : GGML_TYPE_Q3_K;
        }
@@ -8971,14 +8937,14 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
            if (arch == LLM_ARCH_FALCON) {
-                new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q6_K :
+                new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
                           use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
            } else {
                if (use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
            }
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < 4) {
            new_type = GGML_TYPE_Q5_K;
        }
        ++qs.i_feed_forward_w2;
@@ -8996,10 +8962,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
    }
-    // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
-    //else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
-    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-    //}
+    else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+    }
    // This can be used to reduce the size of the Q5_K_S model.
    // The associated PPL increase is fully in line with the size reduction
    //else {
@@ -9048,7 +9013,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

        // K-quants
        case LLAMA_FTYPE_MOSTLY_Q2_K:   quantized_type = GGML_TYPE_Q2_K; break;
-        case LLAMA_FTYPE_MOSTLY_Q2_K_S: quantized_type = GGML_TYPE_Q2_K; break;
        case LLAMA_FTYPE_MOSTLY_Q3_K_S:
        case LLAMA_FTYPE_MOSTLY_Q3_K_M:
        case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
@@ -9057,8 +9021,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        case LLAMA_FTYPE_MOSTLY_Q5_K_S:
        case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
        case LLAMA_FTYPE_MOSTLY_Q6_K:   quantized_type = GGML_TYPE_Q6_K; break;
-        case LLAMA_FTYPE_MOSTLY_IQ2_XXS:quantized_type = GGML_TYPE_IQ2_XXS; break;
-        case LLAMA_FTYPE_MOSTLY_IQ2_XS :quantized_type = GGML_TYPE_IQ2_XS;  break;

        default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
    }
@@ -9107,7 +9069,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
            ++qs.n_attention_wv;
        }
-        else if (name.find("ffn_down") != std::string::npos) {
+        else if (name.find("ffn_down.weight") != std::string::npos) {
            ++qs.n_feed_forward_w2;
        }
    }
@@ -9122,7 +9084,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

    std::vector<std::thread> workers;
    workers.reserve(nthread);
-    std::mutex mutex;

    int idx = 0;

@@ -9196,7 +9157,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
            new_size = ggml_nbytes(tensor);
            LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
        } else {
-            const size_t nelements = ggml_nelements(tensor);
+            const size_t ne = ggml_nelements(tensor);

            float * f32_data;

@@ -9205,53 +9166,60 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
            } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
                throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
            } else {
-                llama_convert_tensor_internal(tensor, f32_conv_buf, workers, nelements, nthread);
+                llama_convert_tensor_internal(tensor, f32_conv_buf, workers, ne, nthread);
                f32_data = (float *) f32_conv_buf.data();
            }

            LLAMA_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type));
            fflush(stdout);

-            if (work.size() < nelements * 4) {
-                work.resize(nelements * 4); // upper bound on size
+            if (work.size() < ne * 4) {
+                work.resize(ne * 4); // upper bound on size
            }
            new_data = work.data();
+
            std::array<int64_t, 1 << 4> hist_cur = {};

-            static const int chunk_size = 32 * 512;
-            const int nchunk = (nelements + chunk_size - 1)/chunk_size;
-            const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
-            if (nthread_use < 2) {
-                new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data());
-            } else {
-                size_t counter = 0;
-                new_size = 0;
-                auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements]() {
-                    std::array<int64_t, 1 << 4> local_hist = {};
-                    size_t local_size = 0;
-                    while (true) {
-                        std::unique_lock<std::mutex> lock(mutex);
-                        size_t first = counter; counter += chunk_size;
-                        if (first >= nelements) {
-                            if (local_size > 0) {
-                                for (int j=0; j<int(local_hist.size()); ++j) {
-                                    hist_cur[j] += local_hist[j];
-                                }
-                                new_size += local_size;
-                            }
-                            break;
-                        }
-                        lock.unlock();
-                        size_t last = std::min(nelements, first + chunk_size);
+            {
+                static const size_t chunk_size = 32*512;
+
+                const int nchunk = GGML_PAD(ne, chunk_size)/chunk_size;
+                const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
+
+                std::vector<size_t> size_th(nthread_use, 0);
+                std::vector<std::array<int64_t, 1 << 4>> hist_cur_th(nthread_use);
+
+                auto compute = [&size_th, &hist_cur_th, new_type, f32_data, new_data, ne, nchunk, nthread_use](int tid) {
+                    auto & local_size = size_th[tid];
+                    auto & local_hist = hist_cur_th[tid];
+
+                    for (int ch = tid; ch < nchunk; ch += nthread_use) {
+                        const size_t first = ch * chunk_size;
+                        const size_t last  = std::min(ne, first + chunk_size);
+
                        local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
                    }
                };
+
                for (int it = 0; it < nthread_use - 1; ++it) {
-                    workers.emplace_back(compute);
+                    workers.emplace_back(compute, it);
                }
-                compute();
-                for (auto & w : workers) { w.join(); }
+
+                compute(nthread_use - 1);
+
+                for (auto & w : workers) {
+                    w.join();
+                }
+
                workers.clear();
+
+                new_size = 0;
+                for (int it = 0; it < nthread_use; ++it) {
+                    for (int j = 0; j < int(hist_cur.size()); ++j) {
+                        hist_cur[j] += hist_cur_th[it][j];
+                    }
+                    new_size += size_th[it];
+                }
            }

            LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
@@ -9263,7 +9231,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

            if (tot_count > 0) {
                for (size_t i = 0; i < hist_cur.size(); i++) {
-                    LLAMA_LOG_INFO("%5.3f ", hist_cur[i] / float(nelements));
+                    LLAMA_LOG_INFO("%5.3f ", hist_cur[i] / float(ne));
                }
            }
            LLAMA_LOG_INFO("\n");
@@ -9635,7 +9603,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
    return result;
 }

-int32_t llama_max_devices(void) {
+int llama_max_devices(void) {
    return LLAMA_MAX_DEVICES;
 }

@@ -9946,15 +9914,15 @@ enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
    return model->vocab.type;
 }

-int32_t llama_n_vocab(const struct llama_model * model) {
+int llama_n_vocab(const struct llama_model * model) {
    return model->vocab.id_to_token.size();
 }

-int32_t llama_n_ctx_train(const struct llama_model * model) {
+int llama_n_ctx_train(const struct llama_model * model) {
    return model->hparams.n_ctx_train;
 }

-int32_t llama_n_embd(const struct llama_model * model) {
+int llama_n_embd(const struct llama_model * model) {
    return model->hparams.n_embd;
 }

@@ -9962,7 +9930,7 @@ float llama_rope_freq_scale_train(const struct llama_model * model) {
    return model->hparams.rope_freq_scale_train;
 }

-int32_t llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) {
+int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) {
    const auto & it = model->gguf_kv.find(key);
    if (it == model->gguf_kv.end()) {
        if (buf_size > 0) {
@@ -9973,11 +9941,11 @@ int32_t llama_model_meta_val_str(const struct llama_model * model, const char *
    return snprintf(buf, buf_size, "%s", it->second.c_str());
 }

-int32_t llama_model_meta_count(const struct llama_model * model) {
+int llama_model_meta_count(const struct llama_model * model) {
    return (int)model->gguf_kv.size();
 }

-int32_t llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
+int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
    if (i < 0 || i >= (int)model->gguf_kv.size()) {
        if (buf_size > 0) {
            buf[0] = '\0';
@@ -9989,7 +9957,7 @@ int32_t llama_model_meta_key_by_index(const struct llama_model * model, int i, c
    return snprintf(buf, buf_size, "%s", it->first.c_str());
 }

-int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size) {
+int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
    if (i < 0 || i >= (int)model->gguf_kv.size()) {
        if (buf_size > 0) {
            buf[0] = '\0';
@@ -10001,10 +9969,9 @@ int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int3
    return snprintf(buf, buf_size, "%s", it->second.c_str());
 }

-int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
-    return snprintf(buf, buf_size, "%s %s%s %s",
+int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
+    return snprintf(buf, buf_size, "%s %s %s",
            llama_model_arch_name(model->arch).c_str(),
-            model->hparams.n_expert > 0 ? (std::to_string(model->hparams.n_expert) + "x").c_str() : "",
            llama_model_type_name(model->type),
            llama_model_ftype_name(model->ftype).c_str());
 }
@@ -10029,7 +9996,7 @@ struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const ch
    return ggml_get_tensor(model->ctx, name);
 }

-uint32_t llama_model_quantize(
+int llama_model_quantize(
        const char * fname_inp,
        const char * fname_out,
        const llama_model_quantize_params * params) {
@@ -10042,7 +10009,7 @@ uint32_t llama_model_quantize(
    }
 }

-int32_t llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
+int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int n_threads) {
    try {
        return llama_apply_lora_from_file_internal(ctx->model, path_lora, scale, path_base_model, n_threads);
    } catch (const std::exception & err) {
@@ -10051,7 +10018,7 @@ int32_t llama_apply_lora_from_file(struct llama_context * ctx, const char * path
    }
 }

-int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
+int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int n_threads) {
    try {
        return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
    } catch (const std::exception & err) {
@@ -10149,7 +10116,7 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_k
    }
 }

-int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx) {
+int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
    int result = 0;

    for (uint32_t i = 0; i < ctx->kv_self.size; i++) {
@@ -10159,7 +10126,7 @@ int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx) {
    return result;
 }

-int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx) {
+int llama_get_kv_cache_used_cells(const struct llama_context * ctx) {
    return ctx->kv_self.used;
 }

@@ -10183,21 +10150,9 @@ void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) {
 }

 void llama_kv_cache_seq_shift(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
-    if (delta == 0) {
-        return;
-    }
-
    llama_kv_cache_seq_shift(ctx->kv_self, seq_id, p0, p1, delta);
 }

-void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
-    if (d == 1) {
-        return;
-    }
-
-    llama_kv_cache_seq_div(ctx->kv_self, seq_id, p0, p1, d);
-}
-
 // Returns the *maximum* size of the state
 size_t llama_get_state_size(const struct llama_context * ctx) {
    // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
@@ -10653,7 +10608,7 @@ int llama_eval(
        struct llama_context * ctx,
                 llama_token * tokens,
                     int32_t   n_tokens,
-                     int32_t   n_past) {
+                         int   n_past) {
    llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);

    const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0));
@@ -10668,7 +10623,7 @@ int llama_eval_embd(
            struct llama_context * ctx,
                           float * embd,
                         int32_t   n_tokens,
-                         int32_t   n_past) {
+                             int   n_past) {
    llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);

    llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };
@@ -10739,7 +10694,7 @@ void llama_batch_free(struct llama_batch batch) {
    if (batch.logits)   free(batch.logits);
 }

-int32_t llama_decode(
+int llama_decode(
        struct llama_context * ctx,
          struct llama_batch   batch) {
    const int ret = llama_decode_internal(*ctx, batch);
@@ -10787,11 +10742,11 @@ llama_token llama_token_nl(const struct llama_model * model) {
    return model->vocab.linefeed_id;
 }

-int32_t llama_add_bos_token(const struct llama_model * model) {
+int llama_add_bos_token(const struct llama_model * model) {
    return model->vocab.special_add_bos;
 }

-int32_t llama_add_eos_token(const struct llama_model * model) {
+int llama_add_eos_token(const struct llama_model * model) {
    return model->vocab.special_add_eos;
 }

@@ -10811,12 +10766,12 @@ llama_token llama_token_eot(const struct llama_model * model) {
    return model->vocab.special_eot_id;
 }

-int32_t llama_tokenize(
+int llama_tokenize(
    const struct llama_model * model,
                  const char * text,
-                     int32_t   text_len,
+                         int   text_len,
                 llama_token * tokens,
-                     int32_t   n_max_tokens,
+                         int   n_max_tokens,
                        bool   add_bos,
                        bool   special) {
    auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos, special);
@@ -10844,7 +10799,7 @@ static std::string llama_decode_text(const std::string & text) {
 }

 // does not write null-terminator to buf
-int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length) {
+int llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int length) {
    if (0 <= token && token < llama_n_vocab(model)) {
        switch (llama_vocab_get_type(model->vocab)) {
        case LLAMA_VOCAB_TYPE_SPM: {
@@ -10930,7 +10885,7 @@ void llama_print_timings(struct llama_context * ctx) {
            __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
    LLAMA_LOG_INFO("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
            __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
-    LLAMA_LOG_INFO("%s:       total time = %10.2f ms / %5d tokens\n", __func__, (timings.t_end_ms - timings.t_start_ms), (timings.n_p_eval + timings.n_eval));
+    LLAMA_LOG_INFO("%s:       total time = %10.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
 }

 void llama_reset_timings(struct llama_context * ctx) {
--- a/llama.h
+++ b/llama.h
@@ -103,9 +103,6 @@ extern "C" {
        LLAMA_FTYPE_MOSTLY_Q5_K_S        = 16, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q5_K_M        = 17, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q6_K          = 18, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ2_XXS       = 19, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ2_XS        = 20, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q2_K_S        = 21, // except 1d tensors

        LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
    };
@@ -229,7 +226,7 @@ extern "C" {

    // model quantization parameters
    typedef struct llama_model_quantize_params {
-        int32_t nthread;             // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+        int nthread;                 // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
        enum llama_ftype ftype;      // quantize to this llama_ftype
        bool allow_requantize;       // allow quantizing non-f32/f16 tensors
        bool quantize_output_tensor; // quantize output.weight
@@ -313,20 +310,21 @@ extern "C" {

    LLAMA_API int64_t llama_time_us(void);

-    LLAMA_API int32_t  llama_max_devices(void);
+    LLAMA_API int  llama_max_devices    (void);
    LLAMA_API bool llama_mmap_supported (void);
    LLAMA_API bool llama_mlock_supported(void);

    LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);

+    // TODO: become more consistent with returned int types across the API
    LLAMA_API uint32_t llama_n_ctx      (const struct llama_context * ctx);
    LLAMA_API uint32_t llama_n_batch    (const struct llama_context * ctx);

    LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);

-    LLAMA_API int32_t llama_n_vocab    (const struct llama_model * model);
-    LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
-    LLAMA_API int32_t llama_n_embd     (const struct llama_model * model);
+    LLAMA_API int llama_n_vocab    (const struct llama_model * model);
+    LLAMA_API int llama_n_ctx_train(const struct llama_model * model);
+    LLAMA_API int llama_n_embd     (const struct llama_model * model);

    // Get the model's RoPE frequency scaling factor
    LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
@@ -337,19 +335,19 @@ extern "C" {
    // - GGUF array values are not supported by these functions

    // Get metadata value as a string by key name
-    LLAMA_API int32_t llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size);
+    LLAMA_API int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size);

    // Get the number of metadata key/value pairs
-    LLAMA_API int32_t llama_model_meta_count(const struct llama_model * model);
+    LLAMA_API int llama_model_meta_count(const struct llama_model * model);

    // Get metadata key name by index
-    LLAMA_API int32_t llama_model_meta_key_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size);
+    LLAMA_API int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size);

    // Get metadata value as a string by index
-    LLAMA_API int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size);
+    LLAMA_API int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size);

    // Get a string describing the model type
-    LLAMA_API int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
+    LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);

    // Returns the total size of all the tensors in the model in bytes
    LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
@@ -361,7 +359,7 @@ extern "C" {
    LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);

    // Returns 0 on success
-    LLAMA_API uint32_t llama_model_quantize(
+    LLAMA_API int llama_model_quantize(
            const char * fname_inp,
            const char * fname_out,
            const llama_model_quantize_params * params);
@@ -372,20 +370,20 @@ extern "C" {
    // The model needs to be reloaded before applying a new adapter, otherwise the adapter
    // will be applied on top of the previous one
    // Returns 0 on success
-    LLAMA_API DEPRECATED(int32_t llama_apply_lora_from_file(
+    LLAMA_API DEPRECATED(int llama_apply_lora_from_file(
            struct llama_context * ctx,
                      const char * path_lora,
                           float   scale,
                      const char * path_base_model,
-                         int32_t   n_threads),
+                             int   n_threads),
            "use llama_model_apply_lora_from_file instead");

-    LLAMA_API int32_t llama_model_apply_lora_from_file(
+    LLAMA_API int llama_model_apply_lora_from_file(
            const struct llama_model * model,
                      const char * path_lora,
                           float   scale,
                      const char * path_base_model,
-                         int32_t   n_threads);
+                             int   n_threads);

    //
    // KV cache
@@ -441,10 +439,10 @@ extern "C" {

    // Returns the number of tokens in the KV cache (slow, use only for debug)
    // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
-    LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx);
+    LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);

    // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
-    LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx);
+    LLAMA_API int llama_get_kv_cache_used_cells(const struct llama_context * ctx);

    // Clear the KV cache
    LLAMA_API void llama_kv_cache_clear(
@@ -487,17 +485,6 @@ extern "C" {
                       llama_pos   p1,
                       llama_pos   delta);

-    // Integer division of the positions by factor of `d > 1`
-    // If the KV cache is RoPEd, the KV data is updated accordingly
-    // p0 < 0 : [0,  p1]
-    // p1 < 0 : [p0, inf)
-    LLAMA_API void llama_kv_cache_seq_div(
-            struct llama_context * ctx,
-                    llama_seq_id   seq_id,
-                       llama_pos   p0,
-                       llama_pos   p1,
-                             int   d);
-
    //
    // State / sessions
    //
@@ -546,7 +533,7 @@ extern "C" {
            struct llama_context * ctx,
                     llama_token * tokens,
                         int32_t   n_tokens,
-                         int32_t   n_past),
+                             int   n_past),
            "use llama_decode() instead");

    // Same as llama_eval, but use float matrix input directly.
@@ -555,7 +542,7 @@ extern "C" {
            struct llama_context * ctx,
                           float * embd,
                         int32_t   n_tokens,
-                         int32_t   n_past),
+                             int   n_past),
            "use llama_decode() instead");

    // Return batch for single sequence of tokens starting at pos_0
@@ -587,7 +574,7 @@ extern "C" {
    //   0 - success
    //   1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
    // < 0 - error
-    LLAMA_API int32_t llama_decode(
+    LLAMA_API int llama_decode(
            struct llama_context * ctx,
              struct llama_batch   batch);

@@ -627,10 +614,10 @@ extern "C" {
    LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line

    // Returns -1 if unknown, 1 for true or 0 for false.
-    LLAMA_API int32_t         llama_add_bos_token(const struct llama_model * model);
+    LLAMA_API int         llama_add_bos_token(const struct llama_model * model);

    // Returns -1 if unknown, 1 for true or 0 for false.
-    LLAMA_API int32_t         llama_add_eos_token(const struct llama_model * model);
+    LLAMA_API int         llama_add_eos_token(const struct llama_model * model);

    // codellama infill tokens
    LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
@@ -648,12 +635,12 @@ extern "C" {
    /// @return Returns a negative number on failure - the number of tokens that would have been returned
    /// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
    ///                Does not insert a leading space.
-    LLAMA_API int32_t llama_tokenize(
+    LLAMA_API int llama_tokenize(
        const struct llama_model * model,
                      const char * text,
-                         int32_t   text_len,
+                             int   text_len,
                     llama_token * tokens,
-                         int32_t   n_max_tokens,
+                             int   n_max_tokens,
                            bool   add_bos,
                            bool   special);

@@ -661,11 +648,11 @@ extern "C" {
    // Uses the vocabulary in the provided context.
    // Does not write null terminator to the buffer.
    // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
-    LLAMA_API int32_t llama_token_to_piece(
+    LLAMA_API int llama_token_to_piece(
              const struct llama_model * model,
                           llama_token   token,
                                  char * buf,
-                               int32_t   length);
+                                  int    length);

    //
    // Grammar
@@ -717,7 +704,7 @@ extern "C" {
    LLAMA_API void llama_sample_top_k(
            struct llama_context * ctx,
          llama_token_data_array * candidates,
-                         int32_t   k,
+                             int   k,
                          size_t   min_keep);

    /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
@@ -776,7 +763,7 @@ extern "C" {
          llama_token_data_array * candidates,
                           float   tau,
                           float   eta,
-                         int32_t   m,
+                             int   m,
                           float * mu);

    /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
@@ -849,8 +836,8 @@ extern "C" {
        llama_beam_search_callback_fn_t   callback,
                                   void * callback_data,
                                 size_t   n_beams,
-                                int32_t   n_past,
-                                int32_t   n_predict);
+                                    int   n_past,
+                                    int   n_predict);

    // Performance information
    LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
--- a/scripts/compare-llama-bench.py
+++ b/scripts/compare-llama-bench.py
@@ -1,356 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-import heapq
-import sys
-import os
-from glob import glob
-import sqlite3
-
-try:
-    import git
-    from tabulate import tabulate
-except ImportError:
-    print("ERROR: the following Python libraries are required: GitPython, tabulate.")
-    sys.exit(1)
-
-# Properties by which to differentiate results per commit:
-KEY_PROPERTIES = [
-    "cuda", "opencl", "metal", "gpu_blas", "blas", "cpu_info", "gpu_info", "model_filename",
-    "model_type", "model_size", "model_n_params", "n_batch", "n_threads", "type_k", "type_v",
-    "n_gpu_layers", "main_gpu", "no_kv_offload", "mul_mat_q", "tensor_split", "n_prompt", "n_gen"
-]
-
-# Properties that are boolean and are converted to Yes/No for the table:
-BOOL_PROPERTIES = ["cuda", "opencl", "metal", "gpu_blas", "blas"]
-
-# Header names for the table:
-PRETTY_NAMES = {
-    "cuda": "CUDA", "opencl": "OpenCL", "metal": "Metal", "gpu_blas": "GPU BLAS", "blas": "BLAS",
-    "cpu_info": "CPU", "gpu_info": "GPU", "model_filename": "File", "model_type": "Model",
-    "model_size": "Model Size [GiB]", "model_n_params": "Num. of Parameters",
-    "n_batch": "Batch size", "n_threads": "Threads", "type_k": "K type", "type_v": "V type",
-    "n_gpu_layers": "GPU layers", "main_gpu": "Main GPU", "no_kv_offload": "NKVO",
-    "mul_mat_q": "MMQ", "tensor_split": "Tensor split"
-}
-
-DEFAULT_SHOW = ["model_type"]  # Always show these properties by default.
-DEFAULT_HIDE = ["model_filename"]  # Always hide these properties by default.
-GPU_NAME_STRIP = ["NVIDIA GeForce ", "Tesla ", "AMD Radeon "]  # Strip prefixes for smaller tables.
-
-DESCRIPTION = """Creates tables from llama-bench data written to an SQLite database. Example usage (Linux):
-
-$ git checkout master
-$ make clean && make llama-bench
-$ ./llama-bench -o sql | sqlite3 llama-bench.sqlite
-$ git checkout some_branch
-$ make clean && make llama-bench
-$ ./llama-bench -o sql | sqlite3 llama-bench.sqlite
-$ ./scripts/compare-llama-bench.py
-
-Performance numbers from multiple runs per commit are averaged WITHOUT being weighted by the --repetitions parameter of llama-bench.
-"""
-
-parser = argparse.ArgumentParser(
-    description=DESCRIPTION, formatter_class=argparse.RawDescriptionHelpFormatter)
-help_b = (
-    "The baseline commit to compare performance to. "
-    "Accepts either a branch name, tag name, or commit hash. "
-    "Defaults to latest master commit with data."
-)
-parser.add_argument("-b", "--baseline", help=help_b)
-help_c = (
-    "The commit whose performance is to be compared to the baseline. "
-    "Accepts either a branch name, tag name, or commit hash. "
-    "Defaults to the non-master commit for which llama-bench was run most recently."
-)
-parser.add_argument("-c", "--compare", help=help_c)
-help_i = (
-    "Input SQLite file for comparing commits. "
-    "Defaults to 'llama-bench.sqlite' in the current working directory. "
-    "If no such file is found and there is exactly one .sqlite file in the current directory, "
-    "that file is instead used as input."
-)
-parser.add_argument("-i", "--input", help=help_i)
-help_o = (
-    "Output format for the table. "
-    "Defaults to 'pipe' (GitHub compatible). "
-    "Also supports e.g. 'latex' or 'mediawiki'. "
-    "See tabulate documentation for full list."
-)
-parser.add_argument("-o", "--output", help=help_o, default="pipe")
-help_s = (
-    "Columns to add to the table. "
-    "Accepts a comma-separated list of values. "
-    f"Legal values: {', '.join(KEY_PROPERTIES[:-2])}. "
-    "Defaults to model name (model_type) and CPU and/or GPU name (cpu_info, gpu_info) "
-    "plus any column where not all data points are the same. "
-    "If the columns are manually specified, then the results for each unique combination of the "
-    "specified values are averaged WITHOUT weighing by the --repetitions parameter of llama-bench."
-)
-parser.add_argument("-s", "--show", help=help_s)
-
-known_args, unknown_args = parser.parse_known_args()
-
-if unknown_args:
-    print(f"ERROR: Received unknown args: {unknown_args}.")
-    print()
-    parser.print_help()
-    sys.exit(1)
-
-input_file = known_args.input
-if input_file is None and os.path.exists("./llama-bench.sqlite"):
-    input_file = "llama-bench.sqlite"
-if input_file is None:
-    sqlite_files = glob("*.sqlite")
-    if len(sqlite_files) == 1:
-        input_file = sqlite_files[0]
-
-if input_file is None:
-    print("ERROR: Cannot find a suitable input file, please provide one.")
-    print()
-    parser.print_help()
-    sys.exit(1)
-
-connection = sqlite3.connect(input_file)
-cursor = connection.cursor()
-builds = cursor.execute("SELECT DISTINCT build_commit FROM test;").fetchall()
-
-try:
-    repo = git.Repo(".", search_parent_directories=True)
-except git.exc.InvalidGitRepositoryError:
-    repo = None
-
-
-def find_parent_in_data(commit):
-    """Helper function to find the most recent parent measured in number of commits for which there is data."""
-    heap = [(0, commit)]
-    seen_hexsha8 = set()
-    while heap:
-        depth, current_commit = heapq.heappop(heap)
-        current_hexsha8 = commit.hexsha[:8]
-        if (current_hexsha8,) in builds:
-            return current_hexsha8
-        for parent in commit.parents:
-            parent_hexsha8 = parent.hexsha[:8]
-            if parent_hexsha8 not in seen_hexsha8:
-                seen_hexsha8.add(parent_hexsha8)
-                heapq.heappush(heap, (depth + 1, parent))
-    return None
-
-
-def get_all_parent_hexsha8s(commit):
-    """Helper function to recursively get hexsha8 values for all parents of a commit."""
-    unvisited = [commit]
-    visited   = []
-
-    while unvisited:
-        current_commit = unvisited.pop(0)
-        visited.append(current_commit.hexsha[:8])
-        for parent in current_commit.parents:
-            if parent.hexsha[:8] not in visited:
-                unvisited.append(parent)
-
-    return visited
-
-
-def get_commit_name(hexsha8):
-    """Helper function to find a human-readable name for a commit if possible."""
-    if repo is None:
-        return hexsha8
-    for h in repo.heads:
-        if h.commit.hexsha[:8] == hexsha8:
-            return h.name
-    for t in repo.tags:
-        if t.commit.hexsha[:8] == hexsha8:
-            return t.name
-    return hexsha8
-
-
-def get_commit_hexsha8(name):
-    """Helper function to search for a commit given a human-readable name."""
-    if repo is None:
-        return None
-    for h in repo.heads:
-        if h.name == name:
-            return h.commit.hexsha[:8]
-    for t in repo.tags:
-        if t.name == name:
-            return t.commit.hexsha[:8]
-    return None
-
-
-hexsha8_baseline = name_baseline = None
-
-# If the user specified a baseline, try to find a commit for it:
-if known_args.baseline is not None:
-    if (known_args.baseline,) in builds:
-        hexsha8_baseline = known_args.baseline
-    if hexsha8_baseline is None:
-        hexsha8_baseline = get_commit_hexsha8(known_args.baseline)
-        name_baseline = known_args.baseline
-    if hexsha8_baseline is None:
-        print(f"ERROR: cannot find data for baseline={known_args.baseline}.")
-        sys.exit(1)
-# Otherwise, search for the most recent parent of master for which there is data:
-elif repo is not None:
-    hexsha8_baseline = find_parent_in_data(repo.heads.master.commit)
-
-    if hexsha8_baseline is None:
-        print("ERROR: No baseline was provided and did not find data for any master branch commits.")
-        print()
-        parser.print_help()
-        sys.exit(1)
-else:
-    print(
-        "ERROR: No baseline was provided and the current working directory "
-        "is not part of a git repository from which a baseline could be inferred."
-    )
-    print()
-    parser.print_help()
-    sys.exit(1)
-
-
-name_baseline = get_commit_name(hexsha8_baseline)
-
-hexsha8_compare = name_compare = None
-
-# If the user has specified a compare value, try to find a corresponding commit:
-if known_args.compare is not None:
-    if (known_args.compare,) in builds:
-        hexsha8_compare = known_args.compare
-    if hexsha8_compare is None:
-        hexsha8_compare = get_commit_hexsha8(known_args.compare)
-        name_compare = known_args.compare
-    if hexsha8_compare is None:
-        print(f"ERROR: cannot find data for baseline={known_args.compare}.")
-        sys.exit(1)
-# Otherwise, search for the commit for llama-bench was most recently run
-# and that is not a parent of master:
-elif repo is not None:
-    hexsha8s_master = get_all_parent_hexsha8s(repo.heads.master.commit)
-    builds_timestamp = cursor.execute(
-        "SELECT build_commit, test_time FROM test ORDER BY test_time;").fetchall()
-    for (hexsha8, _) in reversed(builds_timestamp):
-        if hexsha8 not in hexsha8s_master:
-            hexsha8_compare = hexsha8
-            break
-
-    if hexsha8_compare is None:
-        print("ERROR: No compare target was provided and did not find data for any non-master commits.")
-        print()
-        parser.print_help()
-        sys.exit(1)
-else:
-    print(
-        "ERROR: No compare target was provided and the current working directory "
-        "is not part of a git repository from which a compare target could be inferred."
-    )
-    print()
-    parser.print_help()
-    sys.exit(1)
-
-name_compare = get_commit_name(hexsha8_compare)
-
-
-def get_rows(properties):
-    """
-    Helper function that gets table rows for some list of properties.
-    Rows are created by combining those where all provided properties are equal.
-    The resulting rows are then grouped by the provided properties and the t/s values are averaged.
-    The returned rows are unique in terms of property combinations.
-    """
-    select_string = ", ".join(
-        [f"tb.{p}" for p in properties] + ["tb.n_prompt", "tb.n_gen", "AVG(tb.avg_ts)", "AVG(tc.avg_ts)"])
-    equal_string = " AND ".join(
-        [f"tb.{p} = tc.{p}" for p in KEY_PROPERTIES] + [
-            f"tb.build_commit = '{hexsha8_baseline}'", f"tc.build_commit = '{hexsha8_compare}'"]
-    )
-    group_order_string = ", ".join([f"tb.{p}" for p in properties] + ["tb.n_gen", "tb.n_prompt"])
-    query = (f"SELECT {select_string} FROM test tb JOIN test tc ON {equal_string} "
-             f"GROUP BY {group_order_string} ORDER BY {group_order_string};")
-    return cursor.execute(query).fetchall()
-
-
-# If the user provided columns to group the results by, use them:
-if known_args.show is not None:
-    show = known_args.show.split(",")
-    unknown_cols = []
-    for prop in show:
-        if prop not in KEY_PROPERTIES[:-2]:  # Last two values are n_prompt, n_gen.
-            unknown_cols.append(prop)
-    if unknown_cols:
-        print(f"ERROR: Unknown values for --show: {', '.join(unknown_cols)}")
-        print()
-        parser.print_usage()
-        sys.exit(1)
-    rows_show = get_rows(show)
-# Otherwise, select those columns where the values are not all the same:
-else:
-    rows_full = get_rows(KEY_PROPERTIES)
-    properties_different = []
-    for i, kp_i in enumerate(KEY_PROPERTIES):
-        if kp_i in DEFAULT_SHOW or kp_i == "n_prompt" or kp_i == "n_gen":
-            continue
-        for row_full in rows_full:
-            if row_full[i] != rows_full[0][i]:
-                properties_different.append(kp_i)
-                break
-
-    show = []
-    # Show CPU and/or GPU by default even if the hardware for all results is the same:
-    if "gpu_blas" not in properties_different and "n_gpu_layers" not in properties_different:
-        gpu_blas = bool(rows_full[0][KEY_PROPERTIES.index("gpu_blas")])
-        ngl = int(rows_full[0][KEY_PROPERTIES.index("n_gpu_layers")])
-
-        if not gpu_blas or ngl != 99 and "cpu_info" not in properties_different:
-            show.append("cpu_info")
-        if gpu_blas and "gpu_info" not in properties_different:
-            show.append("gpu_info")
-
-    show += DEFAULT_SHOW
-    show += properties_different
-    for prop in DEFAULT_HIDE:
-        try:
-            show.remove(prop)
-        except ValueError:
-            pass
-    rows_show = get_rows(show)
-
-table = []
-for row in rows_show:
-    n_prompt = int(row[-4])
-    n_gen    = int(row[-3])
-    assert n_prompt == 0 or n_gen == 0
-    test_name = f"tg{n_gen}" if n_prompt == 0 else f"pp{n_prompt}"
-    #           Regular columns    test name    avg t/s values              Speedup
-    #            VVVVVVVVVVVVV     VVVVVVVVV    VVVVVVVVVVVVVV              VVVVVVV
-    table.append(list(row[:-4]) + [test_name] + list(row[-2:]) + [float(row[-1]) / float(row[-2])])
-
-# Some a-posteriori fixes to make the table contents prettier:
-for bool_property in BOOL_PROPERTIES:
-    if bool_property in show:
-        ip = show.index(bool_property)
-        for row_table in table:
-            row_table[ip] = "Yes" if int(row_table[ip]) == 1 else "No"
-
-if "model_size" in show:
-    ip = show.index("model_size")
-    for row_table in table:
-        row_table[ip] = float(row_table[ip]) / 1024 ** 3
-
-if "gpu_info" in show:
-    ip = show.index("gpu_info")
-    for gns in GPU_NAME_STRIP:
-        for row_table in table:
-            row_table[ip] = row_table[ip].replace(gns, "")
-
-headers  = [PRETTY_NAMES[p] for p in show]
-headers += ["Test", f"t/s {name_baseline}", f"t/s {name_compare}", "Speedup"]
-
-print(tabulate(
-    table,
-    headers=headers,
-    floatfmt=".2f",
-    tablefmt=known_args.output
-))
--- a/scripts/get-pg.sh
+++ b/scripts/get-pg.sh
@@ -1,70 +0,0 @@
-#!/bin/bash
-
-function usage {
-    echo "usage: <n>$0"
-    echo "note: n is the number of essays to download"
-    echo "for specific n, the resulting pg.txt file will have the following number of tokens:"
-    echo "n   | tokens"
-    echo "--- | ---"
-    echo "1   | 6230"
-    echo "2   | 23619"
-    echo "5   | 25859"
-    echo "10  | 36888"
-    echo "15  | 50188"
-    echo "20  | 59094"
-    echo "25  | 88764"
-    echo "30  | 103121"
-    echo "32  | 108338"
-    echo "35  | 113403"
-    echo "40  | 127699"
-    echo "45  | 135896"
-    exit 1
-}
-
-function has_cmd {
-    if ! [ -x "$(command -v $1)" ]; then
-        echo "error: $1 is not available" >&2
-        exit 1
-    fi
-}
-
-# check for: curl, html2text, tail, sed, fmt
-has_cmd curl
-has_cmd html2text
-has_cmd tail
-has_cmd sed
-
-if [ $# -ne 1 ]; then
-    usage
-fi
-
-n=$1
-
-# get urls
-urls="$(curl http://www.aaronsw.com/2002/feeds/pgessays.rss | grep html | sed -e "s/.*http/http/" | sed -e "s/html.*/html/" | head -n $n)"
-
-printf "urls:\n%s\n" "$urls"
-
-if [ -f pg.txt ]; then
-    rm pg.txt
-fi
-
-c=1
-for url in $urls; do
-    echo "processing $url"
-
-    cc=$(printf "%03d" $c)
-
-    curl -L $url | html2text | tail -n +4 | sed -E "s/^[[:space:]]+//g" | fmt -w 80 >> pg-$cc-one.txt
-    cat pg-$cc-one.txt >> pg.txt
-
-    cp -v pg.txt pg-$cc-all.txt
-    c=$((c+1))
-
-    # don't flood the server
-    sleep 1
-done
-
-echo "done. data in pg.txt"
-
-exit 0
--- a/scripts/sync-ggml-am.sh
+++ b/scripts/sync-ggml-am.sh
@@ -27,7 +27,7 @@ echo "Syncing ggml changes since commit $lc"
 cd $SRC_GGML

 git log --oneline $lc..HEAD
-git log --oneline $lc..HEAD --reverse | grep -v "(llama/[0-9]*)" | cut -d' ' -f1 > $SRC_LLAMA/ggml-commits
+git log --oneline $lc..HEAD | grep -v "(llama/[0-9]*)" | cut -d' ' -f1 > $SRC_LLAMA/ggml-commits

 if [ ! -s $SRC_LLAMA/ggml-commits ]; then
    rm -v $SRC_LLAMA/ggml-commits
@@ -87,6 +87,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
    # src/ggml-impl.h             -> ggml-impl.h
    # src/ggml-metal.h            -> ggml-metal.h
    # src/ggml-metal.m            -> ggml-metal.m
+    # src/ggml-metal.metal        -> ggml-metal.metal
    # src/ggml-mpi.h              -> ggml-mpi.h
    # src/ggml-mpi.c              -> ggml-mpi.c
    # src/ggml-opencl.cpp         -> ggml-opencl.cpp
@@ -113,6 +114,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
        -e 's/src\/ggml-impl\.h/ggml-impl.h/g' \
        -e 's/src\/ggml-metal\.h/ggml-metal.h/g' \
        -e 's/src\/ggml-metal\.m/ggml-metal.m/g' \
+        -e 's/src\/ggml-metal\.metal/ggml-metal.metal/g' \
        -e 's/src\/ggml-mpi\.h/ggml-mpi.h/g' \
        -e 's/src\/ggml-mpi\.c/ggml-mpi.c/g' \
        -e 's/src\/ggml-opencl\.cpp/ggml-opencl.cpp/g' \
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-979cc23b345006504cfc1f67c0fdf627805e3319
+df098ea908764cba4a4889a1cbe7b026b2d31a14
--- a/spm-headers/ggml.h
+++ b/spm-headers/ggml.h
@@ -0,0 +1 @@
+../ggml.h
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -58,9 +58,6 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
        int64_t hist[16];
        ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size, hist);
        ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
-    } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
-        // This is going to create some weird integers though.
-        ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor));
    } else {
        GGML_ASSERT(false);
    }
@@ -90,13 +87,8 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
                        tv.push_back(*(float *) &buf[i]);
                    } else if (t->type == GGML_TYPE_I32) {
                        tv.push_back((float)*(int32_t *) &buf[i]);
-                    } else if (t->type == GGML_TYPE_I16) {
-                        tv.push_back((float)*(int16_t *) &buf[i]);
-                    } else if (t->type == GGML_TYPE_I8) {
-                        tv.push_back((float)*(int8_t *) &buf[i]);
                    } else if (quantized) {
-                        std::vector<float> vq(ggml_blck_size(t->type));
-                        tt.to_float(&buf[i], vq.data(), ggml_blck_size(t->type));
+                        tt.to_float(&buf[i], vq.data(), bs);
                        tv.insert(tv.end(), vq.begin(), vq.end());
                    } else {
                        GGML_ASSERT(false);
@@ -392,21 +384,15 @@ struct test_case {
        struct callback_userdata {
            bool   ok;
            double max_err;
-            ggml_backend_t backend1;
-            ggml_backend_t backend2;
        };

        callback_userdata ud {
            true,
            max_nmse_err(),
-            backend1,
-            backend2
        };

        auto callback = [](int index, ggml_tensor * t1, ggml_tensor * t2, void * user_data) -> bool {
            callback_userdata * ud = (callback_userdata *) user_data;
-            const char * bn1 = ggml_backend_name(ud->backend1);
-            const char * bn2 = ggml_backend_name(ud->backend2);

            if (t1->op == GGML_OP_NONE) {
                // sentinels must be unchanged
@@ -428,7 +414,7 @@ struct test_case {
            for (size_t i = 0; i < f1.size(); i++) {
                // check for nans
                if (std::isnan(f1[i]) || std::isnan(f2[i])) {
-                    printf("[%s] NaN at index %zu (%s=%f %s=%f) ", ggml_op_desc(t1), i, bn1, f1[i], bn2, f2[i]);
+                    printf("[%s] NaN at index %zu (%f %f) ", ggml_op_desc(t1), i, f1[i], f2[i]);
                    ud->ok = false;
                    return true;
                }
@@ -436,12 +422,12 @@ struct test_case {
                if (isinf_or_max(f1[i]) || isinf_or_max(f2[i])) {
                    if (isinf_or_max(f1[i]) && isinf_or_max(f2[i])) {
                        if (std::signbit(f1[i]) != std::signbit(f2[i])) {
-                            printf("[%s] inf sign mismatch: %s=%f %s=%f ", ggml_op_desc(t1), bn1, f1[i], bn2, f2[i]);
+                            printf("[%s] inf sign mismatch: %f %f ", ggml_op_desc(t1), f1[i], f2[i]);
                            ud->ok = false;
                            return true;
                        }
                    } else {
-                        printf("[%s] inf mismatch: %s=%f %s=%f ", ggml_op_desc(t1), bn1, f1[i], bn2, f2[i]);
+                        printf("[%s] inf mismatch: %f %f ", ggml_op_desc(t1), f1[i], f2[i]);
                        ud->ok = false;
                        return true;
                    }
@@ -450,7 +436,7 @@ struct test_case {

            double err = nmse(f1.data(), f2.data(), f1.size());
            if (err > ud->max_err) {
-                printf("[%s] NMSE = %.9f > %.9f ", ggml_op_desc(t1), err, ud->max_err);
+                printf("[%s] NMSE = %f ", ggml_op_desc(t1), err);
                //for (int i = 0; i < (int) f1.size(); i++) {
                //    printf("%5d %9.6f %9.6f, diff = %9.6f\n", i, f1[i], f2[i], f1[i] - f2[i]);
                //}
@@ -675,26 +661,17 @@ struct test_repeat : public test_case {
 struct test_dup : public test_case {
    const ggml_type type;
    const std::array<int64_t, 4> ne;
-    const std::array<int64_t, 4> permute;
-    bool _use_permute;

    std::string vars() override {
-        std::string v = VARS_TO_STR2(type, ne);
-        if (_use_permute) v += "," + VAR_TO_STR(permute);
-        return v;
+        return VARS_TO_STR2(type, ne);
    }

    test_dup(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 1},
-            std::array<int64_t, 4> permute = {0, 0, 0, 0})
-        : type(type), ne(ne), permute(permute),
-            _use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0) {}
+            std::array<int64_t, 4> ne = {10, 10, 10, 1})
+        : type(type), ne(ne) {}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
-        if (_use_permute) {
-            src = ggml_permute(ctx, src, permute[0], permute[1], permute[2], permute[3]);
-        }
        ggml_tensor * out = ggml_dup(ctx, src);
        return out;
    }
@@ -1449,7 +1426,6 @@ struct test_moe : public test_case {

 static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) {
    std::vector<std::unique_ptr<test_case>> test_cases;
-    std::default_random_engine rng(0);

    const ggml_type all_types[] = {
        GGML_TYPE_F32, GGML_TYPE_F16,
@@ -1474,26 +1450,14 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
            }
        }
    }
-    for (int b : {1, 7}) {
-        for (bool v : {false, true}) {
-            test_cases.emplace_back(new test_get_rows(GGML_TYPE_I32, 256, 5, 4, b, v));
-        }
-    }

    test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 1}));
    test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {2, 1, 1, 1}));
    test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 2, 1, 1}));
    test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 2, 1}));
    test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 2}));
-    test_cases.emplace_back(new test_repeat(GGML_TYPE_I32, {10, 10, 10, 10}, {2, 1, 1, 1}));
-    test_cases.emplace_back(new test_repeat(GGML_TYPE_I16, {10, 10, 10, 10}, {1, 1, 1, 2}));

-    test_cases.emplace_back(new test_dup(GGML_TYPE_F32));
-    test_cases.emplace_back(new test_dup(GGML_TYPE_F16));
-    test_cases.emplace_back(new test_dup(GGML_TYPE_I32));
-    test_cases.emplace_back(new test_dup(GGML_TYPE_I16));
-    test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {0, 2, 1, 3}));
-    test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {1, 2, 0, 3}));
+    test_cases.emplace_back(new test_dup());

    for (ggml_type type : all_types) {
       test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, type, {256, 10, 10, 1}));
@@ -1584,19 +1548,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
    test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 10,  1}, 5));
    test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 10, 10}, 5));

-    std::uniform_int_distribution<> dist_ne1(1, 50);
-    int exponent = 1;
-    while (exponent < (1 << 17)) {
-        std::uniform_int_distribution<> dist_ne0(exponent, 2*exponent);
-
-        for (int n = 0; n < 10; ++n) {
-            int64_t ne0 = dist_ne0(rng);
-            int64_t ne1 = dist_ne1(rng);
-            test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0, ne1, 1, 1}));
-        }
-
-        exponent <<= 1;
-    }
+    test_cases.emplace_back(new test_soft_max());

    for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
        test_cases.emplace_back(new test_rope(type, {128,  32, 10, 1}, 128, 0, 512)); // llama 7B
@@ -1613,8 +1565,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op

    test_cases.emplace_back(new test_alibi());
    test_cases.emplace_back(new test_im2col());
-    test_cases.emplace_back(new test_concat(GGML_TYPE_F32));
-    test_cases.emplace_back(new test_concat(GGML_TYPE_I32));
+    test_cases.emplace_back(new test_concat());

    for (ggml_sort_order order : {GGML_SORT_ASC, GGML_SORT_DESC}) {
        test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {8, 1, 1, 1}, order));
--- a/tests/test-quantize-fns.cpp
+++ b/tests/test-quantize-fns.cpp
@@ -134,12 +134,6 @@ int main(int argc, char * argv[]) {
            continue;
        }

-        const ggml_type ei = (ggml_type)i;
-        if (ei == GGML_TYPE_IQ2_XXS || ei == GGML_TYPE_IQ2_XS) {
-            printf("Skip %s due to missing quantization functionality\n", ggml_type_name(ei));
-            continue;
-        }
-
        printf("Testing %s\n", ggml_type_name((ggml_type) i));

        if (qfns.from_float && qfns.to_float) {