mirror of https://github.com/ggerganov/llama.cpp.git (synced 2026-04-16 16:27:32 +03:00)

Compare commits: b2510...gg/hf-args (2 commits)

| Author | SHA1 | Date |
|---|---|---|
| | 8c3d5b5a79 | |
| | 1b2f0a9ee8 | |
26 .github/workflows/build.yml (vendored)

@@ -15,10 +15,6 @@ on:
types: [opened, synchronize, reopened]
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

env:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
GGML_NLOOP: 3

@@ -139,9 +135,6 @@ jobs:

ubuntu-focal-make:
runs-on: ubuntu-20.04
env:
LLAMA_NODE_AVAILABLE: true
LLAMA_PYTHON_AVAILABLE: true

steps:
- name: Clone

@@ -154,14 +147,6 @@ jobs:
sudo apt-get update
sudo apt-get install build-essential gcc-8

- uses: actions/setup-node@v4
with:
node-version: "20"

- uses: actions/setup-python@v4
with:
python-version: "3.11"

- name: Build
id: make_build
env:

@@ -225,17 +210,6 @@ jobs:
cd build
ctest -L main --verbose --timeout 900

- name: Test llama2c conversion
id: llama2c_test
run: |
cd build
echo "Fetch tokenizer"
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
echo "Fetch llama2c model"
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
./bin/convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
./bin/main -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256

# ubuntu-latest-cmake-sanitizer:
#   runs-on: ubuntu-latest
#
4 .github/workflows/code-coverage.yml (vendored)

@@ -5,10 +5,6 @@ env:
GGML_NLOOP: 3
GGML_N_THREADS: 1

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
run:
runs-on: ubuntu-20.04

4 .github/workflows/docker.yml (vendored)

@@ -15,10 +15,6 @@ on:
branches:
- master

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
push_to_registry:
name: Push Docker image to Docker Hub

4 .github/workflows/editorconfig.yml (vendored)

@@ -14,10 +14,6 @@ on:
branches:
- master

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
editorconfig:
runs-on: ubuntu-latest

4 .github/workflows/nix-ci-aarch64.yml (vendored)

@@ -17,10 +17,6 @@ on:
types: [opened, synchronize, reopened]
paths: ['**/*.nix', 'flake.lock']

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
nix-build-aarch64:
runs-on: ubuntu-latest

4 .github/workflows/nix-ci.yml (vendored)

@@ -8,10 +8,6 @@ on:
pull_request:
types: [opened, synchronize, reopened]

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
nix-eval:
strategy:
.github/workflows/python-check-requirements.yml (vendored)

@@ -16,10 +16,6 @@ on:
- 'requirements.txt'
- 'requirements/*.txt'

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
python-check-requirements:
runs-on: ubuntu-latest
4 .github/workflows/python-lint.yml (vendored)

@@ -2,10 +2,6 @@ name: flake8 Lint

on: [push, pull_request]

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
flake8-lint:
runs-on: ubuntu-latest

4 .github/workflows/server.yml (vendored)

@@ -18,10 +18,6 @@ on:
schedule:
- cron: '0 0 * * *'

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
server:
runs-on: ubuntu-latest

4 .github/workflows/zig-build.yml (vendored)

@@ -6,10 +6,6 @@ on:
branches:
- master

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
build:
strategy:
3 .gitignore (vendored)

@@ -58,9 +58,6 @@ models-mnt
/llava-cli
/lookahead
/lookup
/lookup-create
/lookup-merge
/lookup-stats
/main
/metal
/passkey
CMakeLists.txt

@@ -99,7 +99,6 @@ option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
"llama: max. batch size for using peer access")
option(LLAMA_CUDA_NO_PEER_COPY "llama: do not use peer to peer copies" OFF)
option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)

@@ -388,9 +387,6 @@ if (LLAMA_CUBLAS)
endif()
add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})
if (LLAMA_CUDA_NO_PEER_COPY)
add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
endif()

if (LLAMA_STATIC)
if (WIN32)

@@ -535,10 +531,6 @@ if (LLAMA_HIPBLAS)
add_compile_definitions(GGML_CUDA_FORCE_MMQ)
endif()

if (LLAMA_CUDA_NO_PEER_COPY)
add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
endif()

add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
22 Makefile

@@ -452,9 +452,9 @@ ifdef LLAMA_CUDA_PEER_MAX_BATCH_SIZE
else
MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
ifdef LLAMA_CUDA_NO_PEER_COPY
MK_NVCCFLAGS += -DGGML_CUDA_NO_PEER_COPY
endif # LLAMA_CUDA_NO_PEER_COPY
#ifdef LLAMA_CUDA_CUBLAS
# MK_NVCCFLAGS += -DGGML_CUDA_CUBLAS
#endif # LLAMA_CUDA_CUBLAS
ifdef LLAMA_CUDA_CCBIN
MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
endif

@@ -535,9 +535,6 @@ endif # LLAMA_HIP_UMA
ifdef LLAMA_CUDA_FORCE_DMMV
HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
endif # LLAMA_CUDA_FORCE_DMMV
ifdef LLAMA_CUDA_NO_PEER_COPY
HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY
endif # LLAMA_CUDA_NO_PEER_COPY
OBJS += ggml-cuda.o
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<

@@ -676,9 +673,6 @@ json-schema-to-grammar.o: common/json-schema-to-grammar.cpp common/json-schema-t
train.o: common/train.cpp common/train.h
$(CXX) $(CXXFLAGS) -c $< -o $@

ngram-cache.o: common/ngram-cache.cpp common/ngram-cache.h
$(CXX) $(CXXFLAGS) -c $< -o $@

libllama.so: llama.o ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

@@ -686,7 +680,7 @@ libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS)
ar rcs libllama.a llama.o ggml.o $(OBJS) $(COMMON_DEPS)

clean:
rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
find examples pocs -type f -name "*.o" -delete

#

@@ -816,15 +810,9 @@ lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

lookup: examples/lookup/lookup.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-create.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp) -o lookup-create $(LDFLAGS)
$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-merge.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp) -o lookup-merge $(LDFLAGS)
$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-stats.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp) -o lookup-stats $(LDFLAGS)

passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
README.md

@@ -22,7 +22,6 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
- Looking for contributions to add Deepseek support: https://github.com/ggerganov/llama.cpp/issues/5981
- Quantization blind testing: https://github.com/ggerganov/llama.cpp/discussions/5962
- Initial Mamba support has been added: https://github.com/ggerganov/llama.cpp/pull/5328
- Support loading sharded model, using `gguf-split` CLI https://github.com/ggerganov/llama.cpp/pull/6187

----
common/CMakeLists.txt

@@ -65,8 +65,6 @@ add_library(${TARGET} STATIC
json.hpp
train.h
train.cpp
ngram-cache.h
ngram-cache.cpp
)

if (BUILD_SHARED_LIBS)
common/common.cpp

@@ -963,22 +963,6 @@ static bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg,
}
return true;
}
if (arg == "-lcs" || arg == "--lookup-cache-static") {
if (++i >= argc) {
invalid_param = true;
return true;
}
params.lookup_cache_static = argv[i];
return true;
}
if (arg == "-lcd" || arg == "--lookup-cache-dynamic") {
if (++i >= argc) {
invalid_param = true;
return true;
}
params.lookup_cache_dynamic = argv[i];
return true;
}
if (arg == "--save-all-logits" || arg == "--kl-divergence-base") {
if (++i >= argc) {
invalid_param = true;

@@ -1236,11 +1220,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
throw std::invalid_argument("error: unknown argument: " + arg);
}
}

if (invalid_param) {
throw std::invalid_argument("error: invalid parameter for argument: " + arg);
}

if (params.prompt_cache_all &&
(params.interactive || params.interactive_first ||
params.instruct)) {

@@ -1248,11 +1230,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
}

// short-hand to avoid specifying --hf-file -> default it to --model
if (!params.hf_repo.empty() && params.hf_file.empty()) {
params.hf_file = params.model;
}

if (params.escape) {
process_escapes(params.prompt);
process_escapes(params.input_prefix);

@@ -1452,10 +1429,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" Hugging Face model file (default: unused)\n");
printf(" -ld LOGDIR, --logdir LOGDIR\n");
printf(" path under which to save YAML logs (no logging if unset)\n");
printf(" -lcs FNAME, --lookup-cache-static FNAME\n");
printf(" path to static lookup cache to use for lookup decoding (not updated by generation)\n");
printf(" -lcd FNAME, --lookup-cache-dynamic FNAME\n");
printf(" path to dynamic lookup cache to use for lookup decoding (updated by generation)\n");
printf(" --override-kv KEY=TYPE:VALUE\n");
printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
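For context on the `--hf-repo` / `--hf-file` handling touched by this branch: when only a repo is given, `gpt_params_parse_ex` falls back to the `--model` value as the file to fetch. A minimal standalone sketch of that defaulting rule (the `hf_args` struct below is a reduced stand-in for `gpt_params`, invented here for illustration):

```cpp
#include <cassert>
#include <string>

// Reduced stand-in for the gpt_params fields involved (hypothetical, for illustration only).
struct hf_args {
    std::string model;    // --model
    std::string hf_repo;  // --hf-repo
    std::string hf_file;  // --hf-file
};

// Mirrors the short-hand above: a repo without an explicit file falls back to the --model value.
static void apply_hf_shorthand(hf_args & p) {
    if (!p.hf_repo.empty() && p.hf_file.empty()) {
        p.hf_file = p.model;
    }
}

int main() {
    hf_args p;
    p.model   = "ggml-model-q4_0.gguf";
    p.hf_repo = "user/some-gguf-repo";  // placeholder repo name, not taken from the diff
    apply_hf_shorthand(p);
    assert(p.hf_file == "ggml-model-q4_0.gguf");
    return 0;
}
```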
common/common.h

@@ -88,22 +88,20 @@ struct gpt_params {
// sampling parameters
struct llama_sampling_params sparams;

std::string model = "models/7B/ggml-model-f16.gguf"; // model path
std::string model_draft = ""; // draft model for speculative decoding
std::string model_alias = "unknown"; // model alias
std::string model_url = ""; // model url to download
std::string hf_repo = ""; // HF repo
std::string hf_file = ""; // HF file
std::string prompt = "";
std::string prompt_file = ""; // store the external prompt file name
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
std::string input_prefix = ""; // string to prefix user inputs with
std::string input_suffix = ""; // string to suffix user inputs with
std::string model = "models/7B/ggml-model-f16.gguf"; // model path
std::string model_draft = ""; // draft model for speculative decoding
std::string model_alias = "unknown"; // model alias
std::string model_url = ""; // model url to download
std::string hf_repo = ""; // HF repo
std::string hf_file = ""; // HF file
std::string prompt = "";
std::string prompt_file = ""; // store the external prompt file name
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
std::string input_prefix = ""; // string to prefix user inputs with
std::string input_suffix = ""; // string to suffix user inputs with
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
std::string logdir = ""; // directory in which to save YAML log files
std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
std::string logits_file = ""; // file for saving *all* logits
std::string logdir = ""; // directory in which to save YAML log files
std::string logits_file = ""; // file for saving *all* logits

std::vector<llama_model_kv_override> kv_overrides;
common/json-schema-to-grammar.cpp

@@ -9,7 +9,7 @@
#include <unordered_set>
#include <vector>

using json = nlohmann::ordered_json;
using json = nlohmann::json;

const std::string SPACE_RULE = "\" \"?";

@@ -124,7 +124,7 @@ static std::string replacePattern(const std::string & input, const std::regex &
}

static std::string format_literal(const std::string & literal) {
std::string escaped = replacePattern(literal, GRAMMAR_LITERAL_ESCAPE_RE, [&](const std::smatch & match) {
std::string escaped = replacePattern(json(literal).dump(), GRAMMAR_LITERAL_ESCAPE_RE, [&](const std::smatch & match) {
char c = match.str()[0];
return GRAMMAR_LITERAL_ESCAPES.at(c);
});

@@ -137,7 +137,7 @@ private:
std::function<json(const std::string &)> _fetch_json;
bool _dotall;
std::map<std::string, std::string> _rules;
std::unordered_map<std::string, json> _refs;
std::unordered_map<std::string, nlohmann::json> _refs;
std::unordered_set<std::string> _refs_being_resolved;
std::vector<std::string> _errors;
std::vector<std::string> _warnings;

@@ -413,7 +413,7 @@ private:
std::string prop_rule_name = visit(prop_schema, name + (name.empty() ? "" : "-") + prop_name);
prop_kv_rule_names[prop_name] = _add_rule(
name + (name.empty() ? "" : "-") + prop_name + "-kv",
format_literal(json(prop_name).dump()) + " space \":\" space " + prop_rule_name
format_literal(prop_name) + " space \":\" space " + prop_rule_name
);
if (required.find(prop_name) != required.end()) {
required_props.push_back(prop_name);

@@ -495,7 +495,7 @@ public:
_rules["space"] = SPACE_RULE;
}

void resolve_refs(json & schema, const std::string & url) {
void resolve_refs(nlohmann::json & schema, const std::string & url) {
/*
* Resolves all $ref fields in the given schema, fetching any remote schemas,
* replacing each $ref with absolute reference URL and populates _refs with the

@@ -557,7 +557,11 @@ public:
}

std::string _generate_constant_rule(const json & value) {
return format_literal(value.dump());
if (!value.is_string()) {
_errors.push_back("Only std::string constants are supported, got " + value.dump());
return "";
}
return format_literal(value.get<std::string>());
}

std::string visit(const json & schema, const std::string & name) {

common/json-schema-to-grammar.h

@@ -1,4 +1,4 @@
#pragma once
#include "json.hpp"

std::string json_schema_to_grammar(const nlohmann::ordered_json& schema);
std::string json_schema_to_grammar(const nlohmann::json& schema);
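Both sides of the `format_literal` / `_generate_constant_rule` hunks rely on the same idea: a raw string becomes a quoted grammar literal by serializing it with nlohmann's `dump()`, which adds the surrounding quotes and JSON escapes, before the grammar-specific escaping is applied. A small sketch of just that `dump()` step, separate from the `GRAMMAR_LITERAL_ESCAPE_RE` machinery in the file:

```cpp
#include <iostream>
#include <string>

#include "json.hpp"  // nlohmann::json, vendored in the repo as common/json.hpp

// Sketch: serialize a raw string with dump() to get a quoted, JSON-escaped literal,
// which is the form format_literal()/_generate_constant_rule() build grammar rules from.
static std::string quoted_literal(const std::string & s) {
    return nlohmann::json(s).dump();  // e.g. he"llo -> "he\"llo"
}

int main() {
    std::cout << quoted_literal("name") << "\n";           // "name"
    std::cout << quoted_literal("wei\"rd key") << "\n";    // "wei\"rd key"
    return 0;
}
```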
common/ngram-cache.cpp

@@ -1,280 +0,0 @@
|
||||
#include "ngram-cache.h"
|
||||
#include "log.h"
|
||||
|
||||
#include <fstream>
|
||||
|
||||
void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
|
||||
std::vector<llama_token> & inp, int nnew, bool print_progress) {
|
||||
const int64_t t_start_ms = ggml_time_ms();
|
||||
const int64_t inp_size = inp.size();
|
||||
|
||||
const int64_t n_todo = inp_size * (ngram_max - ngram_min + 1);
|
||||
int64_t n_done = 0;
|
||||
|
||||
for (int64_t ngram_size = ngram_min; ngram_size <= ngram_max; ++ngram_size) {
|
||||
const int64_t i_start = std::max(inp_size - nnew, ngram_size);
|
||||
for (int64_t i = i_start; i < inp_size; ++i) {
|
||||
const int64_t ngram_start = i - ngram_size;
|
||||
llama_ngram ngram(&inp[ngram_start], ngram_size);
|
||||
const llama_token token = inp[i];
|
||||
|
||||
llama_ngram_cache::iterator part_it = ngram_cache.find(ngram);
|
||||
if (part_it == ngram_cache.end()) {
|
||||
llama_ngram_cache_part part;
|
||||
part.emplace(token, 1);
|
||||
ngram_cache.emplace(ngram, part);
|
||||
} else {
|
||||
llama_ngram_cache_part::iterator token_count_it = part_it->second.find(token);
|
||||
if (token_count_it == part_it->second.end()) {
|
||||
part_it->second.emplace(token, 1);
|
||||
} else {
|
||||
token_count_it->second++;
|
||||
}
|
||||
}
|
||||
++n_done;
|
||||
|
||||
if (print_progress && n_done % 10000000 == 0) {
|
||||
const int64_t t_now_ms = ggml_time_ms();
|
||||
const int64_t eta_ms = (inp_size*(ngram_max-ngram_min+1) - n_done) * (t_now_ms - t_start_ms) / n_done;
|
||||
const int64_t eta_min = eta_ms / (60*1000);
|
||||
const int64_t eta_s = (eta_ms - 60*1000*eta_min) / 1000;
|
||||
|
||||
fprintf(stderr, "%s: %" PRId64 "/%" PRId64 " done, ETA: %02" PRId64 ":%02" PRId64 "\n", __func__, n_done, n_todo, eta_min, eta_s);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Helper function to get a token from the combined, speculative sequence of inp and draft.
|
||||
static llama_token get_token(const std::vector<llama_token> & inp, const std::vector<llama_token> & draft, const size_t i) {
|
||||
return i < inp.size() ? inp[i] : draft[1 + i - inp.size()];
|
||||
}
|
||||
|
||||
// If sample size or percentage are below these thresholds the draft is aborted early:
|
||||
constexpr int draft_min_sample_size_lax[LLAMA_NGRAM_MAX] = { 2, 2, 1, 1};
|
||||
constexpr int draft_min_percent_lax[LLAMA_NGRAM_MAX] = {66, 50, 50, 50};
|
||||
constexpr int draft_min_sample_size_strict[LLAMA_NGRAM_MAX] = { 4, 3, 2, 2};
|
||||
constexpr int draft_min_percent_strict[LLAMA_NGRAM_MAX] = {75, 66, 66, 66};
|
||||
|
||||
// Helper function that tries to draft a token from only the static ngram cache:
|
||||
static llama_token try_draft(llama_ngram_cache & nc_static, const llama_ngram ngram_static) {
|
||||
llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
|
||||
if (part_static_it == nc_static.end()) {
|
||||
return -1;
|
||||
}
|
||||
const llama_ngram_cache_part part_static = part_static_it->second;
|
||||
|
||||
int max_count_static = 0;
|
||||
int sum_count_static = 0;
|
||||
llama_token max_token = -1;
|
||||
|
||||
for (std::pair<llama_token, int> token_count_static : part_static) {
|
||||
const llama_token token = token_count_static.first;
|
||||
const int32_t count_static = token_count_static.second;
|
||||
|
||||
if (count_static > max_count_static) {
|
||||
max_token = token;
|
||||
max_count_static = count_static;
|
||||
}
|
||||
sum_count_static += count_static;
|
||||
}
|
||||
|
||||
if (sum_count_static < draft_min_sample_size_lax[LLAMA_NGRAM_STATIC-1]) {
|
||||
return -1;
|
||||
}
|
||||
if (100*max_count_static < draft_min_percent_lax[LLAMA_NGRAM_STATIC-1]*sum_count_static) {
|
||||
return -1;
|
||||
}
|
||||
return max_token;
|
||||
}
|
||||
|
||||
// Try to draft a token from primary cache (context/dynamic), validate with static cache:
|
||||
static llama_token try_draft(
|
||||
llama_ngram_cache & nc_primary, const std::vector<llama_ngram> & ngrams_primary, llama_ngram_cache_part & part_static,
|
||||
const int * min_sample_size, const int * min_percent) {
|
||||
|
||||
llama_token drafted_token = -1;
|
||||
|
||||
for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == -1; --i) {
|
||||
const llama_ngram ngram_primary = ngrams_primary[i];
|
||||
|
||||
llama_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary);
|
||||
if (part_primary_it == nc_primary.end()) {
|
||||
continue;
|
||||
}
|
||||
const llama_ngram_cache_part part_primary = part_primary_it->second;
|
||||
|
||||
int max_count_primary = 0;
|
||||
int max_count_static = 0;
|
||||
int sum_count_primary = 0;
|
||||
llama_token max_token = -1;
|
||||
|
||||
for (std::pair<llama_token, int> token_count_primary : part_primary) {
|
||||
const llama_token token = token_count_primary.first;
|
||||
|
||||
llama_ngram_cache_part::iterator token_count_static_it = part_static.find(token);
|
||||
|
||||
const int32_t count_primary = token_count_primary.second;
|
||||
const int32_t count_static = token_count_static_it != part_static.end() ? 100*token_count_static_it->second : 1;
|
||||
|
||||
if (count_primary*count_static > max_count_primary*max_count_static) {
|
||||
max_token = token;
|
||||
max_count_primary = count_primary;
|
||||
max_count_static = count_static;
|
||||
}
|
||||
sum_count_primary += count_primary;
|
||||
}
|
||||
|
||||
if (sum_count_primary < min_sample_size[i]) {
|
||||
continue;
|
||||
}
|
||||
if (100*max_count_primary < min_percent[i]*sum_count_primary) {
|
||||
continue;;
|
||||
}
|
||||
drafted_token = max_token;
|
||||
}
|
||||
|
||||
return drafted_token;
|
||||
}
|
||||
|
||||
void llama_ngram_cache_draft(
|
||||
std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
|
||||
llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static
|
||||
) {
|
||||
GGML_ASSERT(draft.size() == 1);
|
||||
const int inp_size = inp.size();
|
||||
|
||||
if (inp_size < LLAMA_NGRAM_STATIC) {
|
||||
return;
|
||||
}
|
||||
|
||||
while ((int) draft.size()-1 < n_draft) {
|
||||
llama_token drafted_token = -1;
|
||||
|
||||
const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1;
|
||||
llama_ngram ngram_static;
|
||||
for (int j = ngram_start_static; j < ngram_start_static + LLAMA_NGRAM_STATIC; ++j) {
|
||||
ngram_static.tokens[j-ngram_start_static] = get_token(inp, draft, j);
|
||||
}
|
||||
llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
|
||||
llama_ngram_cache_part part_static;
|
||||
if (part_static_it != nc_static.end()) {
|
||||
part_static = part_static_it->second;
|
||||
}
|
||||
|
||||
// cd = context + dynamic
|
||||
std::vector<llama_ngram> ngrams_cd;
|
||||
for (int ngram_size_cd = ngram_min; ngram_size_cd <= ngram_max; ++ngram_size_cd) {
|
||||
const int ngram_start_cd = inp_size-ngram_size_cd + draft.size()-1;
|
||||
llama_ngram ngram_cd;
|
||||
for (int j = ngram_start_cd; j < ngram_start_cd + ngram_size_cd; ++j) {
|
||||
ngram_cd.tokens[j-ngram_start_cd] = get_token(inp, draft, j);
|
||||
}
|
||||
ngrams_cd.push_back(ngram_cd);
|
||||
}
|
||||
if (drafted_token == -1) {
|
||||
drafted_token = try_draft(nc_context, ngrams_cd, part_static, draft_min_sample_size_lax, draft_min_percent_lax);
|
||||
}
|
||||
if (drafted_token == -1) {
|
||||
drafted_token = try_draft(nc_dynamic, ngrams_cd, part_static, draft_min_sample_size_strict, draft_min_percent_strict);
|
||||
}
|
||||
if (drafted_token == -1) {
|
||||
drafted_token = try_draft(nc_static, ngram_static);
|
||||
}
|
||||
|
||||
if (drafted_token == -1) {
|
||||
break;
|
||||
}
|
||||
|
||||
LOG(" - draft candidate: token=%d\n", drafted_token);
|
||||
draft.push_back(drafted_token);
|
||||
}
|
||||
}
|
||||
|
||||
void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename) {
|
||||
std::ofstream file_out(filename, std::ios::binary);
|
||||
for (std::pair<llama_ngram, llama_ngram_cache_part> item : ngram_cache) {
|
||||
const llama_ngram ngram = item.first;
|
||||
llama_ngram_cache_part token_counts = item.second;
|
||||
GGML_ASSERT(!token_counts.empty());
|
||||
const int32_t ntokens = token_counts.size();
|
||||
GGML_ASSERT(ntokens > 0);
|
||||
|
||||
file_out.write(reinterpret_cast<const char *>(&ngram), sizeof(llama_ngram));
|
||||
file_out.write(reinterpret_cast<const char *>(&ntokens), sizeof(int32_t));
|
||||
for (std::pair<llama_token, int32_t> item2 : token_counts) {
|
||||
const llama_token token = item2.first;
|
||||
const int32_t count = item2.second;
|
||||
GGML_ASSERT(count > 0);
|
||||
|
||||
file_out.write(reinterpret_cast<const char *>(&token), sizeof(llama_token));
|
||||
file_out.write(reinterpret_cast<const char *>(&count), sizeof(int32_t));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
|
||||
std::ifstream hashmap_file(filename, std::ios::binary);
|
||||
if (!hashmap_file) {
|
||||
throw std::ifstream::failure("Unable to open file " + filename);
|
||||
}
|
||||
llama_ngram_cache ngram_cache;
|
||||
|
||||
llama_ngram ngram;
|
||||
int32_t ntokens;
|
||||
llama_token token;
|
||||
int32_t count;
|
||||
|
||||
char * ngramc = reinterpret_cast<char*>(&ngram);
|
||||
char * ntokensc = reinterpret_cast<char*>(&ntokens);
|
||||
char * tokenc = reinterpret_cast<char*>(&token);
|
||||
char * countc = reinterpret_cast<char*>(&count);
|
||||
while(hashmap_file.read(ngramc, sizeof(llama_ngram))) {
|
||||
GGML_ASSERT(!hashmap_file.eof());
|
||||
GGML_ASSERT(hashmap_file.read(ntokensc, sizeof(int32_t)));
|
||||
GGML_ASSERT(ntokens > 0);
|
||||
llama_ngram_cache_part token_counts;
|
||||
|
||||
for (int i = 0; i < ntokens; ++i) {
|
||||
GGML_ASSERT(!hashmap_file.eof());
|
||||
GGML_ASSERT(hashmap_file.read(tokenc, sizeof(llama_token)));
|
||||
GGML_ASSERT(!hashmap_file.eof());
|
||||
GGML_ASSERT(hashmap_file.read(countc, sizeof(int32_t)));
|
||||
GGML_ASSERT(count > 0);
|
||||
token_counts.emplace(token, count);
|
||||
}
|
||||
|
||||
ngram_cache.emplace(ngram, token_counts);
|
||||
}
|
||||
GGML_ASSERT(hashmap_file.eof());
|
||||
|
||||
return ngram_cache;
|
||||
}
|
||||
|
||||
void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add) {
|
||||
for (std::pair<llama_ngram, llama_ngram_cache_part> ngram_part : ngram_cache_add) {
|
||||
const llama_ngram ngram = ngram_part.first;
|
||||
llama_ngram_cache_part part = ngram_part.second;
|
||||
|
||||
llama_ngram_cache::iterator part_merged_it = ngram_cache_target.find(ngram);
|
||||
if (part_merged_it == ngram_cache_target.end()) {
|
||||
ngram_cache_target.emplace(ngram, part);
|
||||
continue;
|
||||
}
|
||||
|
||||
for (std::pair<llama_token, int32_t> token_count : part) {
|
||||
const llama_token token = token_count.first;
|
||||
const int32_t count = token_count.second;
|
||||
GGML_ASSERT(count > 0);
|
||||
|
||||
llama_ngram_cache_part::iterator token_count_merged_it = part_merged_it->second.find(token);
|
||||
if (token_count_merged_it == part_merged_it->second.end()) {
|
||||
part_merged_it->second.emplace(token, count);
|
||||
continue;
|
||||
}
|
||||
|
||||
token_count_merged_it->second += count;
|
||||
}
|
||||
}
|
||||
}
|
||||
common/ngram-cache.h

@@ -1,94 +0,0 @@
#pragma once

#include "llama.h"

#include <unordered_map>
#include <string>
#include <vector>

#define LLAMA_NGRAM_MIN 1
#define LLAMA_NGRAM_MAX 4
#define LLAMA_NGRAM_STATIC 2

// Data structures to map n-grams to empirical token probabilities:

struct llama_ngram {
    llama_token tokens[LLAMA_NGRAM_MAX];

    llama_ngram() {
        for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
            tokens[i] = -1;
        }
    }

    llama_ngram(const llama_token * input, const int ngram_size) {
        for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
            tokens[i] = i < ngram_size ? input[i] : -1;
        }
    }

    bool operator==(const llama_ngram & other) const {
        for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
            if (tokens[i] != other.tokens[i]) {
                return false;
            }
        }
        return true;
    }
};

struct llama_ngram_hash_function {
    size_t operator()(const llama_ngram & ngram) const {
        size_t hash = 0;
        for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
            hash ^= std::hash<llama_token>{}(ngram.tokens[i]);
        }
        return hash;
    }
};

// token -> number of times token has been seen
typedef std::unordered_map<llama_token, int32_t> llama_ngram_cache_part;

// n-gram -> empirical distribution of following tokens
typedef std::unordered_map<llama_ngram, llama_ngram_cache_part, llama_ngram_hash_function> llama_ngram_cache;


// Update an ngram cache with tokens.
// ngram_cache: the cache to modify.
// ngram_min/ngram_max: the min/max size of the ngrams to extract from inp_data.
// inp_data: the token sequence with which to update ngram_cache.
// nnew: how many new tokens have been appended to inp_data since the last call to this function.
// print_progress: whether to print progress to stderr.
//
// In order to get correct results inp_data can ONLY BE APPENDED TO.
// Changes in the middle need a complete rebuild.
void llama_ngram_cache_update(
    llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector<llama_token> & inp_data, int nnew, bool print_progress);

// Try to draft tokens from ngram caches.
// inp: the tokens generated so far.
// draft: the token sequence to draft. Expected to initially contain the previously sampled token.
// n_draft: maximum number of tokens to add to draft.
// ngram_min/gram_max: the min/max size of the ngrams in nc_context and nc_dynamic.
// nc_context: ngram cache based on current context.
// nc_dynamic: ngram cache based on previous user generations.
// nc_static: ngram cache generated from a large text corpus, used for validation.
void llama_ngram_cache_draft(
    std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
    llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static);

// Save an ngram cache to a file.
// ngram_cache: the ngram cache to save.
// filename: the path under which to save the ngram cache.
void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename);

// Load an ngram cache saved with llama_ngram_cache_save.
// filename: the path from which to load the ngram cache.
// returns: an ngram cache containing the information saved to filename.
llama_ngram_cache llama_ngram_cache_load(std::string & filename);

// Merge two ngram caches.
// ngram_cache_target: the ngram cache to which to add the information from ngram_cache_add.
// ngram_cache_add: the ngram cache to add to ngram_cache_target.
void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add);
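The ngram-cache declarations above read as a small pipeline: build or load caches, extend them as tokens arrive, then draft speculative tokens that are validated against the static cache. A hedged usage sketch of that flow (token ids and file names are made up; the real examples/lookup tools add sampling, batching and error handling around this):

```cpp
#include <string>
#include <vector>

#include "ngram-cache.h"  // the header shown above (common/ngram-cache.h)

int main() {
    // Tokens generated so far (dummy ids, for illustration only).
    std::vector<llama_token> inp = {1, 15, 7, 15, 7, 42};

    // Cache built from the current context.
    llama_ngram_cache nc_context;
    llama_ngram_cache_update(nc_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX,
                             inp, /*nnew =*/ (int) inp.size(), /*print_progress =*/ false);

    // Optional persistent caches; the path is a placeholder.
    std::string dyn_path = "lookup-dynamic.bin";
    llama_ngram_cache nc_dynamic;  // could be filled via llama_ngram_cache_load(dyn_path)
    llama_ngram_cache nc_static;   // corpus-level cache used for validation

    // The draft starts with the last sampled token and is extended in place.
    std::vector<llama_token> draft = {inp.back()};
    llama_ngram_cache_draft(inp, draft, /*n_draft =*/ 8, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX,
                            nc_context, nc_dynamic, nc_static);

    // Fold what was learned back into the dynamic cache and persist it.
    llama_ngram_cache_merge(nc_dynamic, nc_context);
    llama_ngram_cache_save(nc_dynamic, dyn_path);
    return 0;
}
```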
examples/convert-llama2c-to-ggml/README.md

@@ -21,8 +21,6 @@ An example command using a model from [karpathy/tinyllamas](https://huggingface.

`$ ./convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin`

Note: The vocabulary for `stories260K.bin` should be its own tokenizer `tok512.bin` found in [karpathy/tinyllamas/stories260K](https://huggingface.co/karpathy/tinyllamas/tree/main/stories260K).

Now you can use the model with a command like:

`$ ./main -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`
examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp

@@ -1,7 +1,6 @@
|
||||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
@@ -79,101 +78,111 @@ typedef struct {
|
||||
|
||||
struct TransformerWeights {
|
||||
// token embedding table
|
||||
std::vector<float> token_embedding_table; // (vocab_size, dim)
|
||||
float* token_embedding_table; // (vocab_size, dim)
|
||||
// weights for rmsnorms
|
||||
std::vector<float> rms_att_weight; // (layer, dim) rmsnorm weights
|
||||
std::vector<float> rms_ffn_weight; // (layer, dim)
|
||||
float* rms_att_weight; // (layer, dim) rmsnorm weights
|
||||
float* rms_ffn_weight; // (layer, dim)
|
||||
// weights for matmuls
|
||||
std::vector<float> wq; // (layer, dim, dim)
|
||||
std::vector<float> wk; // (layer, dim, dim)
|
||||
std::vector<float> wv; // (layer, dim, dim)
|
||||
std::vector<float> wo; // (layer, dim, dim)
|
||||
float* wq; // (layer, dim, dim)
|
||||
float* wk; // (layer, dim, dim)
|
||||
float* wv; // (layer, dim, dim)
|
||||
float* wo; // (layer, dim, dim)
|
||||
// weights for ffn
|
||||
std::vector<float> w1; // (layer, hidden_dim, dim)
|
||||
std::vector<float> w2; // (layer, dim, hidden_dim)
|
||||
std::vector<float> w3; // (layer, hidden_dim, dim)
|
||||
float* w1; // (layer, hidden_dim, dim)
|
||||
float* w2; // (layer, dim, hidden_dim)
|
||||
float* w3; // (layer, hidden_dim, dim)
|
||||
// final rmsnorm
|
||||
std::vector<float> rms_final_weight; // (dim,)
|
||||
float* rms_final_weight; // (dim,)
|
||||
// freq_cis for RoPE relatively positional embeddings
|
||||
// std::vector<float> freq_cis_real; // (seq_len, dim/2)
|
||||
// std::vector<float> freq_cis_imag; // (seq_len, dim/2)
|
||||
// float* freq_cis_real; // (seq_len, dim/2)
|
||||
// float* freq_cis_imag; // (seq_len, dim/2)
|
||||
// (optional) classifier weights for the logits, on the last layer
|
||||
std::vector<float> wcls;
|
||||
float* wcls;
|
||||
|
||||
~TransformerWeights() {
|
||||
delete[] token_embedding_table;
|
||||
delete[] rms_att_weight;
|
||||
delete[] rms_ffn_weight;
|
||||
delete[] wq;
|
||||
delete[] wk;
|
||||
delete[] wv;
|
||||
delete[] wo;
|
||||
delete[] w1;
|
||||
delete[] w2;
|
||||
delete[] w3;
|
||||
delete[] rms_final_weight;
|
||||
delete[] wcls;
|
||||
}
|
||||
};
|
||||
|
||||
static void alloc_weights(TransformerWeights * w, const Config * p, bool shared_weights) {
|
||||
const int n_multiqueries = p->n_kv_heads <= 0 || p->n_kv_heads >= p->n_heads ? 1 : p->n_heads / p->n_kv_heads;
|
||||
try {
|
||||
w->token_embedding_table.resize(p->vocab_size * p->dim);
|
||||
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
|
||||
static void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
|
||||
// we calloc instead of malloc to keep valgrind happy
|
||||
w->token_embedding_table = new float[p->vocab_size * p->dim]();
|
||||
printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
|
||||
|
||||
w->rms_att_weight.resize(p->n_layers * p->dim);
|
||||
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);
|
||||
w->rms_att_weight = new float[p->n_layers * p->dim]();
|
||||
printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);
|
||||
|
||||
w->rms_ffn_weight.resize(p->n_layers * p->dim);
|
||||
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);
|
||||
w->rms_ffn_weight = new float[p->n_layers * p->dim]();
|
||||
printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);
|
||||
|
||||
w->wq.resize(p->n_layers * p->dim * p->dim);
|
||||
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
|
||||
w->wq = new float[p->n_layers * p->dim * p->dim]();
|
||||
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
|
||||
|
||||
w->wk.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
|
||||
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
|
||||
w->wk = new float[p->n_layers * p->dim * p->dim]();
|
||||
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
|
||||
|
||||
w->wv.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
|
||||
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
|
||||
w->wv = new float[p->n_layers * p->dim * p->dim]();
|
||||
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
|
||||
|
||||
w->wo.resize(p->n_layers * p->dim * p->dim);
|
||||
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
|
||||
w->wo = new float[p->n_layers * p->dim * p->dim]();
|
||||
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
|
||||
|
||||
w->w1.resize(p->n_layers * p->hidden_dim * p->dim);
|
||||
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
|
||||
w->w1 = new float[p->n_layers * p->hidden_dim * p->dim]();
|
||||
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
|
||||
|
||||
w->w2.resize(p->n_layers * p->hidden_dim * p->dim);
|
||||
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);
|
||||
w->w2 = new float[p->n_layers * p->hidden_dim * p->dim]();
|
||||
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);
|
||||
|
||||
w->w3.resize(p->n_layers * p->hidden_dim * p->dim);
|
||||
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
|
||||
w->w3 = new float[p->n_layers * p->hidden_dim * p->dim]();
|
||||
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
|
||||
|
||||
w->rms_final_weight.resize(p->dim);
|
||||
LOG("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
|
||||
w->rms_final_weight = new float[p->dim]();
|
||||
printf("[%s:AK] Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
|
||||
|
||||
if (shared_weights) {
|
||||
w->wcls = {};
|
||||
} else {
|
||||
w->wcls.resize(p->vocab_size * p->dim);
|
||||
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
|
||||
}
|
||||
}
|
||||
catch (std::length_error &) {
|
||||
die("Invalid configuration. Failed to allocate memory for weights");
|
||||
if (shared_weights) {
|
||||
w->wcls = NULL;
|
||||
} else {
|
||||
w->wcls = new float[p->vocab_size * p->dim]();
|
||||
printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
|
||||
}
|
||||
}
|
||||
|
||||
static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FILE * f, bool shared_weights) {
|
||||
if (fread(w->token_embedding_table.data(), sizeof(float), w->token_embedding_table.size(), f) != w->token_embedding_table.size()) return 1;
|
||||
if (fread(w->rms_att_weight.data(), sizeof(float), w->rms_att_weight.size(), f) != w->rms_att_weight.size()) return 1;
|
||||
if (fread(w->wq.data(), sizeof(float), w->wq.size(), f) != w->wq.size()) return 1;
|
||||
if (fread(w->wk.data(), sizeof(float), w->wk.size(), f) != w->wk.size()) return 1;
|
||||
if (fread(w->wv.data(), sizeof(float), w->wv.size(), f) != w->wv.size()) return 1;
|
||||
if (fread(w->wo.data(), sizeof(float), w->wo.size(), f) != w->wo.size()) return 1;
|
||||
if (fread(w->rms_ffn_weight.data(), sizeof(float), w->rms_ffn_weight.size(), f) != w->rms_ffn_weight.size()) return 1;
|
||||
if (fread(w->w1.data(), sizeof(float), w->w1.size(), f) != w->w1.size()) return 1;
|
||||
if (fread(w->w2.data(), sizeof(float), w->w2.size(), f) != w->w2.size()) return 1;
|
||||
if (fread(w->w3.data(), sizeof(float), w->w3.size(), f) != w->w3.size()) return 1;
|
||||
if (fread(w->rms_final_weight.data(), sizeof(float), w->rms_final_weight.size(), f) != w->rms_final_weight.size()) return 1;
|
||||
static int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) {
|
||||
if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
|
||||
if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
|
||||
if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
|
||||
if (fread(w->wk, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
|
||||
if (fread(w->wv, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
|
||||
if (fread(w->wo, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
|
||||
if (fread(w->rms_ffn_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
|
||||
if (fread(w->w1, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
|
||||
if (fread(w->w2, sizeof(float), p->n_layers * p->hidden_dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->hidden_dim * p->dim)) return 1;
|
||||
if (fread(w->w3, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
|
||||
if (fread(w->rms_final_weight, sizeof(float), p->dim, f) != static_cast<size_t>(p->dim)) return 1;
|
||||
|
||||
// Skip freq_cis_real & freq_cis_imag
|
||||
int head_size = p->dim / p->n_heads;
|
||||
fseek(f, p->seq_len * head_size * sizeof(float), SEEK_CUR);
|
||||
|
||||
if (!shared_weights && fread(w->wcls.data(), sizeof(float), w->wcls.size(), f) != w->wcls.size()) return 1;
|
||||
if (!shared_weights && fread(w->wcls, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
|
||||
|
||||
// Check we didn't forget to read anything
|
||||
auto curr = ftell(f);
|
||||
fseek(f, 0, SEEK_END);
|
||||
auto end = ftell(f);
|
||||
if (curr != end) {
|
||||
LOG("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", __func__, curr, end);
|
||||
printf("Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", curr, end);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -181,20 +190,20 @@ static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FIL
|
||||
}
|
||||
|
||||
static void print_sample_weights(TransformerWeights *w){
|
||||
LOG("----- Quick print of first of the weight vales of all the variables\n");
|
||||
LOG("%f\n", w->token_embedding_table[0]);
|
||||
LOG("%f\n", w->rms_att_weight[0]);
|
||||
LOG("%f\n", w->rms_ffn_weight[0]);
|
||||
printf("----- Quick print of first of the weight vales of all the variables\n");
|
||||
printf("%f\n", w->token_embedding_table[0]);
|
||||
printf("%f\n", w->rms_att_weight[0]);
|
||||
printf("%f\n", w->rms_ffn_weight[0]);
|
||||
|
||||
LOG("%f\n", w->wq[0]);
|
||||
LOG("%f\n", w->wk[0]);
|
||||
LOG("%f\n", w->wv[0]);
|
||||
LOG("%f\n", w->wo[0]);
|
||||
LOG("%f\n", w->w1[0]);
|
||||
LOG("%f\n", w->w2[0]);
|
||||
LOG("%f\n", w->w3[0]);
|
||||
LOG("%f\n", w->rms_att_weight[0]);
|
||||
if (!w->wcls.empty()) LOG("%f\n", w->wcls[0]);
|
||||
printf("%f\n", w->wq[0]);
|
||||
printf("%f\n", w->wk[0]);
|
||||
printf("%f\n", w->wv[0]);
|
||||
printf("%f\n", w->wo[0]);
|
||||
printf("%f\n", w->w1[0]);
|
||||
printf("%f\n", w->w2[0]);
|
||||
printf("%f\n", w->w3[0]);
|
||||
printf("%f\n", w->rms_att_weight[0]);
|
||||
if (w->wcls) printf("%f\n", w->wcls[0]);
|
||||
}
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
@@ -216,16 +225,14 @@ struct llama_vocab {
|
||||
};
|
||||
|
||||
struct my_llama_hparams {
|
||||
uint32_t n_vocab = 32000;
|
||||
uint32_t n_ctx = 512; // this is provided as user input?
|
||||
uint32_t n_embd = 4096;
|
||||
uint32_t n_ff = 11008;
|
||||
uint32_t n_mult = 4;
|
||||
uint32_t n_head = 32;
|
||||
uint32_t n_head_kv = 32;
|
||||
uint32_t n_layer = 32;
|
||||
uint32_t n_rot = 64;
|
||||
|
||||
uint32_t n_vocab = 32000;
|
||||
uint32_t n_ctx = 512; // this is provided as user input?
|
||||
uint32_t n_embd = 4096;
|
||||
uint32_t n_ff = 11008;
|
||||
uint32_t n_mult = 4;
|
||||
uint32_t n_head = 32;
|
||||
uint32_t n_layer = 32;
|
||||
uint32_t n_rot = 64;
|
||||
bool operator!=(const my_llama_hparams& other) const {
|
||||
return memcmp(this, &other, sizeof(my_llama_hparams));
|
||||
}
|
||||
@@ -318,30 +325,14 @@ struct train_params {
|
||||
};
|
||||
|
||||
static void print_params(struct my_llama_hparams * params) {
|
||||
LOG("%s: n_vocab: %u\n", __func__, params->n_vocab);
|
||||
LOG("%s: n_ctx: %u\n", __func__, params->n_ctx);
|
||||
LOG("%s: n_embd: %u\n", __func__, params->n_embd);
|
||||
LOG("%s: n_mult: %u\n", __func__, params->n_mult);
|
||||
LOG("%s: n_head: %u\n", __func__, params->n_head);
|
||||
LOG("%s: n_head_kv: %u\n", __func__, params->n_head_kv);
|
||||
LOG("%s: n_ff: %u\n", __func__, params->n_ff);
|
||||
LOG("%s: n_layer: %u\n", __func__, params->n_layer);
|
||||
LOG("%s: n_rot: %u\n", __func__, params->n_rot);
|
||||
}
|
||||
|
||||
static void print_tensor_info(const struct ggml_context * ctx) {
|
||||
for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
||||
LOG("%s: Allocating ", __func__);
|
||||
int64_t total = 1;
|
||||
int i = 0;
|
||||
for (; i < ggml_n_dims(t); ++i) {
|
||||
if (i > 0) LOG("x ");
|
||||
LOG("[%" PRId64 "] ", t->ne[i]);
|
||||
total *= t->ne[i];
|
||||
}
|
||||
if (i > 1) LOG("= [%" PRId64 "] ", total);
|
||||
LOG("float space for %s\n", ggml_get_name(t));
|
||||
}
|
||||
printf("%s: n_vocab: %u\n", __func__, params->n_vocab);
|
||||
printf("%s: n_ctx: %u\n", __func__, params->n_ctx);
|
||||
printf("%s: n_embd: %u\n", __func__, params->n_embd);
|
||||
printf("%s: n_mult: %u\n", __func__, params->n_mult);
|
||||
printf("%s: n_head: %u\n", __func__, params->n_head);
|
||||
printf("%s: n_ff: %u\n", __func__, params->n_ff);
|
||||
printf("%s: n_layer: %u\n", __func__, params->n_layer);
|
||||
printf("%s: n_rot: %u\n", __func__, params->n_rot);
|
||||
}
|
||||
|
||||
static void init_model(struct my_llama_model * model) {
|
||||
@@ -351,8 +342,6 @@ static void init_model(struct my_llama_model * model) {
|
||||
const uint32_t n_layer = hparams.n_layer;
|
||||
const uint32_t n_vocab = hparams.n_vocab;
|
||||
|
||||
const uint32_t n_multiqueries = hparams.n_head_kv <= 0 || hparams.n_head_kv >= hparams.n_head ? 1 : hparams.n_head / hparams.n_head_kv;
|
||||
|
||||
const uint32_t n_ff = hparams.n_ff;
|
||||
struct ggml_context * ctx = model->ctx;
|
||||
|
||||
@@ -361,8 +350,25 @@ static void init_model(struct my_llama_model * model) {
|
||||
model->train_tokens = 0;
|
||||
|
||||
model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
|
||||
printf("[%s:GG] Allocating [%u] x [%u] = [%u] float space for model->tok_embeddings\n",__func__,n_embd , n_vocab, n_embd * n_vocab);
|
||||
|
||||
model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
printf("[%s:GG] Allocating [%u] float space for model->norm\n",__func__,n_embd);
|
||||
|
||||
model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
|
||||
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab);
|
||||
|
||||
// printing the per-layer allocations here so we dont print in the for loop.
|
||||
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wq for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
|
||||
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wk for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
|
||||
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wv for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
|
||||
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wo for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
|
||||
|
||||
printf("[%s:GG] Allocating [%u] float space for layer.ffn_norm for [%u] layers\n",__func__,n_embd, n_layer);
|
||||
|
||||
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w1 for [%u] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
|
||||
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w2 for [%u] layers\n",__func__, n_embd, n_ff, n_ff * n_embd, n_layer);
|
||||
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w3 for [%u] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
|
||||
|
||||
ggml_set_name(model->tok_embeddings, "tok_embeddings.weight");
|
||||
ggml_set_name(model->norm, "norm.weight");
|
||||
@@ -377,8 +383,8 @@ static void init_model(struct my_llama_model * model) {
|
||||
layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
|
||||
layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
|
||||
layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd / n_multiqueries);
layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd / n_multiqueries);
layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);

layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
@@ -400,8 +406,6 @@ static void init_model(struct my_llama_model * model) {
ggml_format_name(layer.w2, "%s.feed_forward.w2.weight", layers_i.c_str());
ggml_format_name(layer.w3, "%s.feed_forward.w3.weight", layers_i.c_str());
}

print_tensor_info(ctx);
}

static float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
@@ -417,9 +421,9 @@ static int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
static void print_row(struct ggml_tensor * probs, int i) {
for (int k = 0; k < probs->ne[0]; ++k) {
float p = get_f32_2d(probs, k, i);
LOG(" %f", p);
printf(" %f", p);
}
LOG("\n");
printf("\n");
}

static void print_matrix(struct ggml_tensor * probs) {
@@ -427,12 +431,33 @@ static void print_matrix(struct ggml_tensor * probs) {
for (int i = 0; i < probs->ne[1]; ++i) {
for (int k = 0; k < probs->ne[0]; ++k) {
float p = get_f32_2d(probs, k, i);
LOG(" %.2f", p);
printf(" %.2f", p);
}
LOG("\n");
printf("\n");
}
}

#ifdef __GNUC__
#ifdef __MINGW32__
__attribute__((format(gnu_printf, 1, 2)))
#else
__attribute__((format(printf, 1, 2)))
#endif
#endif
static std::string format(const char * fmt, ...) {
va_list ap, ap2;
va_start(ap, fmt);
va_copy(ap2, ap);
int size = vsnprintf(NULL, 0, fmt, ap);
GGML_ASSERT(size >= 0 && size < INT_MAX);
std::vector<char> buf(size + 1);
int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
GGML_ASSERT(size2 == size);
va_end(ap2);
va_end(ap);
return std::string(buf.data(), size);
}

struct llama_file {
// use FILE * so we don't have to re-open the file to mmap
FILE * fp;
@@ -524,9 +549,8 @@ static std::string llama_escape_whitespaces(const std::string & text) {
return out.str();
}

static void load_vocab(const char * filename, const Config * config, struct llama_vocab * vocab) {
static void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
if (is_ggml_file(filename)) {
LOG("%s: Loading vocabulary from gguf file %s\n", __func__, filename);
struct ggml_context * ctx_data = NULL;

struct gguf_init_params params = {
@@ -554,9 +578,6 @@ static void load_vocab(const char * filename, const Config * config, struct llam
const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);

const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);
if (n_vocab != static_cast<uint32_t>(config->vocab_size)) {
die_fmt("vocab size mismatch: (gguf) %u != (llama2c) %d", n_vocab, config->vocab_size);
}

vocab->id_to_token.resize(n_vocab);

@@ -574,7 +595,7 @@ static void load_vocab(const char * filename, const Config * config, struct llam
gguf_free(ctx);
} else {
// assume llama2.c vocabulary
LOG("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
printf("Assuming llama2.c vocabulary since %s is not a gguf file\n", filename);
llama_file file(filename, "rb");
if (!file.fp) {
die_fmt("%s: %s", strerror(errno), filename);
@@ -617,15 +638,38 @@ static void load_vocab(const char * filename, const Config * config, struct llam
}

static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
int size = 1;
for (int dim = 0; dim < ggml_n_dims(gg_weights); ++dim) {
size *= gg_weights->ne[dim];
}
for (int ct = 0; ct < size; ++ct) {
int64_t i0 = 0; int64_t i1 = 0;
int64_t i2 = 0; int64_t i3 = 0;
ggml_unravel_index(gg_weights, ct, &i0, &i1, &i2, &i3);
ggml_set_f32_nd(gg_weights, i0, i1, i2, i3, karpathy_weights[ct]);
int ct;
switch (ggml_n_dims(gg_weights)) {
case 1:
ct = 0;
for (int i0 = 0; i0 < gg_weights->ne[0]; i0++){
float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0]);
*ptr = karpathy_weights[ct];
ct++;
}
break;
case 2:
ct = 0;
for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) {
for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) {
float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1]);
*ptr = karpathy_weights[ct];
ct++;
}
}
break;
case 3:
ct = 0;
for (int i2 = 0; i2 < gg_weights->ne[2]; i2++) {
for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) {
for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) {
float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1] + i2*gg_weights->nb[2]);
*ptr = karpathy_weights[ct];
ct++;
}
}
}
break;
}
}

@@ -635,18 +679,16 @@ static void save_as_llama_model(
// convert AK weights into GG weights one by one.
// w->token_embedding_table -> model->tok_embeddings
// float* -> struct ggml_tensor
convert_weights_ak_to_gg(model->tok_embeddings, w->token_embedding_table.data());
convert_weights_ak_to_gg(model->output, !w->wcls.empty() ? w->wcls.data() : w->token_embedding_table.data());
convert_weights_ak_to_gg(model->tok_embeddings, w->token_embedding_table);
convert_weights_ak_to_gg(model->output, w->wcls ? w->wcls : w->token_embedding_table);

convert_weights_ak_to_gg(model->norm, w->rms_final_weight.data());
convert_weights_ak_to_gg(model->norm, w->rms_final_weight);
//print_row(model->norm, 0);

// for rms-att-weight
int row_length = model->hparams.n_embd;
int n_ff = model->hparams.n_ff;

const uint32_t n_multiqueries = model->hparams.n_head_kv <= 0 || model->hparams.n_head_kv >= model->hparams.n_head ? 1 : model->hparams.n_head / model->hparams.n_head_kv;

for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
auto & layer = model->layers[i];
// 1d
@@ -655,10 +697,9 @@ static void save_as_llama_model(

// from 3d matrix layer x dim x dim to 2d matrix dim x dim
convert_weights_ak_to_gg(layer.wq , &w->wq[i*row_length*row_length]);
convert_weights_ak_to_gg(layer.wk , &w->wk[i*row_length*row_length]);
convert_weights_ak_to_gg(layer.wv , &w->wv[i*row_length*row_length]);
convert_weights_ak_to_gg(layer.wo , &w->wo[i*row_length*row_length]);
// from 3d matrix layer x dim x dim to 2d matrix dim x dim / n_multiqueries
convert_weights_ak_to_gg(layer.wk , &w->wk[i*row_length*row_length/n_multiqueries]);
convert_weights_ak_to_gg(layer.wv , &w->wv[i*row_length*row_length/n_multiqueries]);

convert_weights_ak_to_gg(layer.w1 , &w->w1[i*row_length*n_ff]);
convert_weights_ak_to_gg(layer.w2 , &w->w2[i*n_ff*row_length]);
@@ -695,8 +736,8 @@ static void save_as_llama_model(
gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd);
gguf_set_val_u32(ctx, KV_FEED_FORWARD_LENGTH, model->hparams.n_ff);
gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head);
gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head);
gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT_KV, model->hparams.n_head_kv);
// n_head_kv is optional, default to n_head
// gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT_KV, ...);
gguf_set_val_u32(ctx, KV_BLOCK_COUNT, model->hparams.n_layer);
gguf_set_val_u32(ctx, KV_ROPE_DIMENSION_COUNT, model->hparams.n_rot);
gguf_set_val_f32(ctx, KV_ATTENTION_LAYERNORM_RMS_EPS, 1e-5f);
@@ -748,12 +789,12 @@ static void save_as_llama_model(

static struct train_params get_default_train_params() {
struct train_params params;
params.fn_vocab_model = "models/7B/ggml-model-f16.gguf";
params.fn_vocab_model = "models/7B/ggml-model-f16.gguf";
params.fn_llama2c_output_model = "ak_llama_model.bin";
params.fn_train_data = "shakespeare.txt";
params.fn_checkpoint_in = "checkpoint.bin";
params.fn_checkpoint_out = "checkpoint.bin";
params.fn_model_out = "ggml-checkpoint-f32.bin";
params.fn_train_data = "shakespeare.txt";
params.fn_checkpoint_in = "checkpoint.bin";
params.fn_checkpoint_out = "checkpoint.bin";
params.fn_model_out = "ggml-checkpoint-f32.bin";

params.seed = -1;

@@ -788,8 +829,8 @@ static struct train_params get_default_train_params() {
params.adam_alpha = 1e-3f;
params.adam_decay = 1e-3f;

params.mem_model_gb = 2;
params.mem_compute_gb = 24;
params.mem_model_gb = 2;
params.mem_compute_gb = 24;
params.mem_compute0_gb = 8;
params.mem_compute1_gb = 2;

@@ -875,30 +916,19 @@ int main(int argc, char ** argv) {
if (!params_parse(argc, argv, &params)) {
return 1;
}
log_set_target(stdout);
Config config;
TransformerWeights weights = {};
{
LOG("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model);
FILE *file = fopen(params.fn_llama2c_model, "r");
if (!file) {
LOG("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model);
return 1;
}
FILE *file = fopen(params.fn_llama2c_model, "rb");
if (!file) { printf("Unable to open the checkpoint file %s!\n", params.fn_llama2c_model); return 1; }
// read in the config header
if (fread(&config, sizeof(Config), 1, file) != 1) {
LOG("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model);
return 1;
}
if(fread(&config, sizeof(Config), 1, file) != 1) { return 1; }
auto shared_weights = config.vocab_size > 0;
config.vocab_size = abs(config.vocab_size);

// read in the Transformer weights
alloc_weights(&weights, &config, shared_weights);
if (checkpoint_init_weights(&weights, &config, file, shared_weights)) {
LOG("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model);
return 1;
}
malloc_weights(&weights, &config, shared_weights);
if(checkpoint_init_weights(&weights, &config, file, shared_weights)) { return 1; }
fclose(file);
}

@@ -906,18 +936,15 @@ int main(int argc, char ** argv) {
load_vocab(params.fn_vocab_model, &config, &vocab);

struct my_llama_model model;
model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx);
model.hparams.n_ctx = params.n_ctx;
model.hparams.n_embd = config.dim; //params.n_embd;
model.hparams.n_ff = config.hidden_dim;
model.hparams.n_mult = 32;//params.n_mult;
model.hparams.n_head = config.n_heads; //params.n_head;
model.hparams.n_head_kv = config.n_kv_heads;
model.hparams.n_layer = config.n_layers; //params.n_layer;
model.hparams.n_rot = std::min((uint32_t)params.n_rotmax, model.hparams.n_embd / model.hparams.n_head);

model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx);
model.hparams.n_ctx = params.n_ctx;
model.hparams.n_embd = config.dim; //params.n_embd;
model.hparams.n_ff = config.hidden_dim;
model.hparams.n_mult = 32;//params.n_mult;
model.hparams.n_head = config.n_heads; //params.n_head;
model.hparams.n_layer = config.n_layers; //params.n_layer;
model.hparams.n_rot = std::min((uint32_t)params.n_rotmax, model.hparams.n_embd / model.hparams.n_head);
print_params(&model.hparams);

struct ggml_init_params lcparams;
lcparams.mem_size = 1024ll*1024ll*1024ll*((size_t) params.mem_model_gb);
lcparams.mem_buffer = NULL;
@@ -929,7 +956,7 @@ int main(int argc, char ** argv) {
model.name = basename(params.fn_llama2c_model);
save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model);

LOG("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model);
printf("Saving llama.c model file %s in ggml format at %s\n", params.fn_llama2c_model, params.fn_llama2c_output_model);

ggml_free(model.ctx);
return 0;

@@ -1,34 +1,31 @@
#include "llama.h"
#include "ggml.h"
#include "common.h"

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <ios>
#include <string>
#include <vector>

#include <stdio.h>
#include <fcntl.h>
#include <string.h>
#include <climits>
#include <stdexcept>

#if defined(_WIN32)
#include <windows.h>
#ifndef PATH_MAX
#define PATH_MAX MAX_PATH
#endif
#include <io.h>
#endif

enum split_operation : uint8_t {
SPLIT_OP_SPLIT,
SPLIT_OP_MERGE,
};

static const char * const LLM_KV_SPLIT_NO = "split.no";
static const char * const LLM_KV_SPLIT_COUNT = "split.count";
static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
static const char * const LLM_KV_GENERAL_SPLIT_I_SPLIT = "general.split";
static const char * const LLM_KV_GENERAL_SPLIT_N_SPLIT = "general.split_count";

static const int SPLIT_FILENAME_MAX = 256;

static const char * const SPLIT_FILENAME_FORMAT = "%s-%05d-of-%05d.gguf";

struct split_params {
split_operation operation = SPLIT_OP_SPLIT;
@@ -119,13 +116,13 @@ static bool split_params_parse(int argc, const char ** argv, split_params & para
try {
if (!split_params_parse_ex(argc, argv, params)) {
split_print_usage(argv[0]);
exit(EXIT_FAILURE);
exit(1);
}
}
catch (const std::invalid_argument & ex) {
fprintf(stderr, "%s\n", ex.what());
split_print_usage(argv[0]);
exit(EXIT_FAILURE);
exit(1);
}
return result;
}
@@ -137,6 +134,12 @@ static void zeros(std::ofstream & file, size_t n) {
}
}

static std::string split_file_name(const std::string & path, int i_split, int n_split) {
char f_split[SPLIT_FILENAME_MAX] = {0};
snprintf(f_split, sizeof(f_split), SPLIT_FILENAME_FORMAT, path.c_str(), i_split + 1, n_split);
return std::string(f_split);
}

struct split_strategy {
const split_params params;
std::ifstream & f_input;
@@ -177,9 +180,8 @@ struct split_strategy {
if (i_split == 0) {
gguf_set_kv(ctx_out, ctx_gguf);
}
gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_NO, i_split);
gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_COUNT, n_split);
gguf_set_val_i32(ctx_out, LLM_KV_SPLIT_TENSORS_COUNT, n_tensors);
gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_I_SPLIT, i_split);
gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_N_SPLIT, n_split);

// populate the original tensors, so we get an initial metadata
for (int i = i_split * params.n_split_tensors; i < n_tensors && i < (i_split + 1) * params.n_split_tensors; ++i) {
@@ -187,11 +189,10 @@ struct split_strategy {
gguf_add_tensor(ctx_out, meta);
}

char split_path[PATH_MAX] = {0};
llama_split_path(split_path, sizeof(split_path), params.output.c_str(), i_split, n_split);
auto split_name = split_file_name(params.output, i_split, n_split);

fprintf(stderr, "%s: %s ...", __func__, split_path);
fout = std::ofstream(split_path, std::ios::binary);
fprintf(stderr, "%s: %s ...", __func__, split_name.c_str());
fout = std::ofstream(split_name, std::ios::binary);
fout.exceptions(std::ofstream::failbit); // fail fast on write errors

auto meta_size = gguf_get_meta_size(ctx_out);
@@ -249,23 +250,19 @@ static void gguf_split(const split_params & split_params) {
std::ifstream f_input(split_params.input.c_str(), std::ios::binary);
if (!f_input.is_open()) {
fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_params.input.c_str());
exit(EXIT_FAILURE);
exit(1);
}

auto * ctx_gguf = gguf_init_from_file(split_params.input.c_str(), params);
if (!ctx_gguf) {
fprintf(stderr, "%s: failed to load input GGUF from %s\n", __func__, split_params.input.c_str());
exit(EXIT_FAILURE);
exit(1);
}

split_strategy strategy(split_params, f_input, ctx_gguf, ctx_meta);

char first_split_path[PATH_MAX] = {0};
llama_split_path(first_split_path, sizeof(first_split_path),
split_params.output.c_str(), strategy.i_split, strategy.n_split);
fprintf(stderr, "%s: %s -> %s (%d tensors per file)\n",
__func__, split_params.input.c_str(),
first_split_path,
split_file_name(split_params.output, strategy.i_split, strategy.n_split).c_str(),
split_params.n_split_tensors);

strategy.split_start();
@@ -301,9 +298,7 @@ static void gguf_merge(const split_params & split_params) {
std::vector<ggml_context *> ctx_metas;
std::vector<gguf_context *> ctx_ggufs;

char split_path[PATH_MAX] = {0};
strncpy(split_path, split_params.input.c_str(), sizeof(split_path) - 1);
char split_prefix[PATH_MAX] = {0};
std::string split_prefix;

// First pass to find KV and tensors metadata
for (int i_split = 0; i_split < n_split; i_split++) {
@@ -314,66 +309,89 @@ static void gguf_merge(const split_params & split_params) {
/*.ctx = */ &ctx_meta,
};

auto split_name = split_params.input;
if (i_split > 0) {
llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split);
split_name = split_file_name(split_prefix, i_split, n_split);
}
fprintf(stderr, "%s: reading metadata %s ...", __func__, split_path);
fprintf(stderr, "%s: reading metadata %s ...", __func__, split_name.c_str());

auto * ctx_gguf = gguf_init_from_file(split_path, params);
auto * ctx_gguf = gguf_init_from_file(split_name.c_str(), params);
if (!ctx_gguf) {
fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, split_params.input.c_str());
exit(EXIT_FAILURE);
exit(1);
}
ctx_ggufs.push_back(ctx_gguf);
ctx_metas.push_back(ctx_meta);

if (i_split == 0) {
auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_GENERAL_SPLIT_N_SPLIT);
if (key_n_split < 0) {
fprintf(stderr,
"\n%s: input file does not contain %s metadata\n",
__func__,
LLM_KV_SPLIT_COUNT);
LLM_KV_GENERAL_SPLIT_N_SPLIT);
gguf_free(ctx_gguf);
ggml_free(ctx_meta);
gguf_free(ctx_out);
fout.close();
exit(EXIT_FAILURE);
exit(1);
}

n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
n_split = gguf_get_val_u8(ctx_gguf, key_n_split);
if (n_split < 1) {
fprintf(stderr,
"\n%s: input file does not contain a valid split count %d\n",
__func__,
n_split);
gguf_free(ctx_gguf);
ggml_free(ctx_meta);
gguf_free(ctx_out);
fout.close();
exit(EXIT_FAILURE);
}

// Verify the file naming and extract split_prefix
if (!llama_split_prefix(split_prefix, sizeof (split_prefix), split_path, i_split, n_split)) {
fprintf(stderr, "\n%s: unexpected input file name: %s"
" i_split=%d"
" n_split=%d\n", __func__,
split_path, i_split, n_split);
gguf_free(ctx_gguf);
ggml_free(ctx_meta);
gguf_free(ctx_out);
fout.close();
exit(EXIT_FAILURE);
exit(1);
}

// Do not trigger merge if we try to merge again the output
gguf_set_val_u16(ctx_gguf, LLM_KV_SPLIT_COUNT, 0);
gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_N_SPLIT, 0);

// Set metadata from the first split
gguf_set_kv(ctx_out, ctx_gguf);
}

// Verify the file naming
{
int i_split_file = 0;
int n_split_file = 0;
const char * i_split_format = "-00000-of-00000.gguf";

if (split_name.size() < strlen(i_split_format)) {
fprintf(stderr, "\n%s: unexpected input file name: %s\n", __func__, split_params.input.c_str());
for (auto * _ctx_gguf : ctx_ggufs) {
gguf_free(_ctx_gguf);
}
gguf_free(ctx_out);
fout.close();
exit(1);
}

split_prefix = split_name.substr(0, split_name.size() - strlen(i_split_format));

const char * split_name_c_str = split_name.c_str();
int n_part = sscanf(&split_name_c_str[0] + split_prefix.size(), "-%d-of-%d", &i_split_file, &n_split_file);

if (n_part != 2 || i_split_file - 1 != i_split || n_split_file != n_split) {
fprintf(stderr, "\n%s: unexpected input file name: %s"
" i_split=%d i_split_file=%d"
" n_split=%d n_split_file=%d\n", __func__,
split_params.input.c_str(),
i_split, i_split_file,
n_split, n_split_file);
for (auto * _ctx_gguf : ctx_ggufs) {
gguf_free(_ctx_gguf);
}
gguf_free(ctx_out);
fout.close();
exit(1);
}
}

auto n_tensors = gguf_get_n_tensors(ctx_gguf);
for (int i_tensor = 0; i_tensor < n_tensors; i_tensor++) {
const char * t_name = gguf_get_tensor_name(ctx_gguf, i_tensor);
@@ -393,19 +411,18 @@ static void gguf_merge(const split_params & split_params) {

// Write tensors data
for (int i_split = 0; i_split < n_split; i_split++) {
llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split);
std::ifstream f_input(split_path, std::ios::binary);
auto split_name = split_file_name(split_prefix, i_split, n_split);
std::ifstream f_input(split_name.c_str(), std::ios::binary);
if (!f_input.is_open()) {
fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_path);
for (uint32_t i = 0; i < ctx_ggufs.size(); i++) {
gguf_free(ctx_ggufs[i]);
ggml_free(ctx_metas[i]);
fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_name.c_str());
for (auto * _ctx_gguf : ctx_ggufs) {
gguf_free(_ctx_gguf);
}
gguf_free(ctx_out);
fout.close();
exit(EXIT_FAILURE);
exit(1);
}
fprintf(stderr, "%s: writing tensors %s ...", __func__, split_path);
fprintf(stderr, "%s: writing tensors %s ...", __func__, split_name.c_str());

auto * ctx_gguf = ctx_ggufs[i_split];
auto * ctx_meta = ctx_metas[i_split];
@@ -464,8 +481,8 @@ int main(int argc, const char ** argv) {
break;
case SPLIT_OP_MERGE: gguf_merge(params);
break;
default: split_print_usage(argv[0]);
exit(EXIT_FAILURE);
default:split_print_usage(argv[0]);
exit(1);
}

return 0;

@@ -61,7 +61,7 @@ class SchemaConverter:

def _format_literal(self, literal):
escaped = GRAMMAR_LITERAL_ESCAPE_RE.sub(
lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)), literal
lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)), json.dumps(literal)
)
return f'"{escaped}"'

@@ -308,7 +308,8 @@ class SchemaConverter:
return ref_name

def _generate_constant_rule(self, value):
return self._format_literal(json.dumps(value))
assert isinstance(value, str), f'Only string constants are supported, got {value}'
return self._format_literal(value)

def visit(self, schema, name):
schema_type = schema.get('type')
@@ -427,7 +428,7 @@ class SchemaConverter:
prop_rule_name = self.visit(prop_schema, f'{name}{"-" if name else ""}{prop_name}')
prop_kv_rule_names[prop_name] = self._add_rule(
f'{name}{"-" if name else ""}{prop_name}-kv',
fr'{self._format_literal(json.dumps(prop_name))} space ":" space {prop_rule_name}'
fr'{self._format_literal(prop_name)} space ":" space {prop_rule_name}'
)
required_props = [k for k in sorted_props if k in required]
optional_props = [k for k in sorted_props if k not in required]

@@ -3,21 +3,3 @@ add_executable(${TARGET} lookup.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

set(TARGET lookup-create)
add_executable(${TARGET} lookup-create.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

set(TARGET lookup-merge)
add_executable(${TARGET} lookup-merge.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

set(TARGET lookup-stats)
add_executable(${TARGET} lookup-stats.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

@@ -1,43 +0,0 @@
|
||||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
#include "common.h"
|
||||
#include "ngram-cache.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
int main(int argc, char ** argv){
|
||||
gpt_params params;
|
||||
|
||||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
return 1;
|
||||
}
|
||||
// init llama.cpp
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
llama_model * model = NULL;
|
||||
llama_context * ctx = NULL;
|
||||
|
||||
// load the model
|
||||
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||
GGML_ASSERT(model != nullptr);
|
||||
|
||||
// tokenize the prompt
|
||||
const bool add_bos = llama_should_add_bos_token(model);
|
||||
|
||||
std::vector<llama_token> inp;
|
||||
inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
|
||||
fprintf(stderr, "%s: tokenization done\n", __func__);
|
||||
|
||||
|
||||
llama_ngram_cache ngram_cache;
|
||||
llama_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp, inp.size(), true);
|
||||
fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str());
|
||||
|
||||
llama_ngram_cache_save(ngram_cache, params.lookup_cache_static);
|
||||
}
|
||||
@@ -1,47 +0,0 @@
|
||||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
#include "common.h"
|
||||
#include "ngram-cache.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
static void print_usage() {
|
||||
fprintf(stderr, "Merges multiple lookup cache files into a single one.\n");
|
||||
fprintf(stderr, "Usage: lookup-merge [--help] lookup_part_1.bin lookup_part_2.bin ... lookup_merged.bin\n");
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv){
|
||||
if (argc < 3) {
|
||||
print_usage();
|
||||
exit(1);
|
||||
}
|
||||
|
||||
std::vector<std::string> args;
|
||||
args.resize(argc-1);
|
||||
for (int i = 0; i < argc-1; ++i) {
|
||||
args[i] = argv[i+1];
|
||||
if (args[i] == "-h" || args[i] == "--help") {
|
||||
print_usage();
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
|
||||
fprintf(stderr, "lookup-merge: loading file %s\n", args[0].c_str());
|
||||
llama_ngram_cache ngram_cache_merged = llama_ngram_cache_load(args[0]);
|
||||
|
||||
for (size_t i = 1; i < args.size()-1; ++i) {
|
||||
fprintf(stderr, "lookup-merge: loading file %s\n", args[i].c_str());
|
||||
llama_ngram_cache ngram_cache = llama_ngram_cache_load(args[i]);
|
||||
|
||||
llama_ngram_cache_merge(ngram_cache_merged, ngram_cache);
|
||||
}
|
||||
|
||||
fprintf(stderr, "lookup-merge: saving file %s\n", args.back().c_str());
|
||||
llama_ngram_cache_save(ngram_cache_merged, args.back());
|
||||
}
|
||||
@@ -1,163 +0,0 @@
|
||||
#include "ggml.h"
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
#include "log.h"
|
||||
#include "ngram-cache.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#include <fstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <unordered_map>
|
||||
|
||||
int main(int argc, char ** argv){
|
||||
gpt_params params;
|
||||
|
||||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
const int n_draft = params.n_draft;
|
||||
|
||||
// init llama.cpp
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
llama_model * model = NULL;
|
||||
llama_context * ctx = NULL;
|
||||
|
||||
// load the model
|
||||
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||
llama_set_rng_seed(ctx, params.seed);
|
||||
GGML_ASSERT(llama_n_vocab(model) < (1 << 16));
|
||||
|
||||
// tokenize the prompt
|
||||
const bool add_bos = llama_should_add_bos_token(model);
|
||||
LOG("add_bos tgt: %d\n", add_bos);
|
||||
|
||||
std::vector<llama_token> inp;
|
||||
inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
|
||||
|
||||
llama_ngram_cache ngram_cache_context;
|
||||
llama_ngram_cache ngram_cache_dynamic;
|
||||
llama_ngram_cache ngram_cache_static;
|
||||
int64_t t_draft_flat_us = 0;
|
||||
int64_t t_draft_us = 0;
|
||||
|
||||
{
|
||||
const int64_t t_start_draft_us = ggml_time_us();
|
||||
|
||||
if (!params.lookup_cache_static.empty()) {
|
||||
try {
|
||||
ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
|
||||
} catch (std::ifstream::failure const &) {
|
||||
fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
if (!params.lookup_cache_dynamic.empty()) {
|
||||
try {
|
||||
ngram_cache_dynamic = llama_ngram_cache_load(params.lookup_cache_dynamic);
|
||||
} catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
|
||||
}
|
||||
|
||||
t_draft_flat_us += ggml_time_us() - t_start_draft_us;
|
||||
}
|
||||
|
||||
const int n_input = inp.size();
|
||||
const int n_ctx = params.n_ctx;
|
||||
|
||||
int n_drafted = 0;
|
||||
int n_accept = 0;
|
||||
|
||||
const int64_t t_start_ms = ggml_time_ms();
|
||||
|
||||
// Iterate over input tokens in chunks of size n_ctx.
|
||||
// Each chunk is treated as if a sequential generation but with pre-determined tokens to ensure reproducibility.
|
||||
for (int i_start = 0; i_start + n_ctx < n_input; i_start += n_ctx) {
|
||||
const std::vector<llama_token> inp_slice(inp.begin() + i_start, inp.begin() + i_start + n_ctx);
|
||||
std::vector<llama_token> pseudo_output;
|
||||
pseudo_output.push_back(inp_slice[0]);
|
||||
|
||||
while ((int) pseudo_output.size() < n_ctx) {
|
||||
// Simulate drafting and decoding from draft:
|
||||
std::vector<llama_token> draft;
|
||||
draft.push_back(pseudo_output.back());
|
||||
|
||||
{
|
||||
const int64_t t_start_draft_us = ggml_time_us();
|
||||
llama_ngram_cache_draft(pseudo_output, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
|
||||
t_draft_us += ggml_time_us() - t_start_draft_us;
|
||||
}
|
||||
|
||||
n_drafted += draft.size() - 1;
|
||||
|
||||
for (size_t j = 1; j < draft.size() && (int) pseudo_output.size() < n_ctx; ++j) {
|
||||
const llama_token ground_truth = inp_slice[pseudo_output.size()];
|
||||
const llama_token drafted = draft[j];
|
||||
|
||||
if (ground_truth != drafted) {
|
||||
break;
|
||||
}
|
||||
|
||||
++n_accept;
|
||||
pseudo_output.push_back(ground_truth);
|
||||
|
||||
{
|
||||
const int64_t t_start_draft_us = ggml_time_us();
|
||||
llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
|
||||
t_draft_us += ggml_time_us() - t_start_draft_us;
|
||||
}
|
||||
}
|
||||
|
||||
// After each simulated batch decoding simulate the sampling of a single token:
|
||||
if ((int) pseudo_output.size() < n_ctx) {
|
||||
pseudo_output.push_back(inp_slice[pseudo_output.size()]);
|
||||
{
|
||||
const int64_t t_start_draft_us = ggml_time_us();
|
||||
llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
|
||||
t_draft_us += ggml_time_us() - t_start_draft_us;
|
||||
}
|
||||
}
|
||||
|
||||
draft.erase(draft.begin());
|
||||
|
||||
}
|
||||
if (i_start > 0 && i_start / 100000 != (i_start - n_ctx) / 100000) {
|
||||
const int64_t t_now_ms = ggml_time_ms();
|
||||
const int64_t eta_ms = (n_input - i_start) * (t_now_ms - t_start_ms) / i_start;
|
||||
const int64_t eta_min = eta_ms / (60*1000);
|
||||
const int64_t eta_s = (eta_ms - 60*1000*eta_min) / 1000;
|
||||
|
||||
LOG_TEE("lookup-stats: %d/%d done, ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s);
|
||||
}
|
||||
|
||||
// After each chunk, update the dynamic ngram cache with the context ngram cache:
|
||||
llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
|
||||
ngram_cache_context.clear();
|
||||
}
|
||||
|
||||
LOG_TEE("\n");
|
||||
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("n_draft = %d\n", n_draft);
|
||||
LOG_TEE("n_predict = %d\n", n_input - n_input % n_ctx);
|
||||
LOG_TEE("n_drafted = %d\n", n_drafted);
|
||||
LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
|
||||
LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
|
||||
t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
|
||||
LOG_TEE("n_accept = %d\n", n_accept);
|
||||
LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
|
||||
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
fprintf(stderr, "\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -1,15 +1,12 @@
#include "common.h"
#include "ggml.h"
#include "llama.h"
#include "common.h"
#include "ngram-cache.h"

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <fstream>
#include <string>
#include <vector>
#include <unordered_map>

int main(int argc, char ** argv){
gpt_params params;
@@ -18,7 +15,11 @@ int main(int argc, char ** argv){
return 1;
}

// max. number of additional tokens to draft if match is found
// max/min n-grams size to search for in prompt
const int ngram_max = 4;
const int ngram_min = 1;

// length of the candidate / draft sequence, if match is found
const int n_draft = params.n_draft;

const bool dump_kv_cache = params.dump_kv_cache;
@@ -38,8 +39,6 @@ int main(int argc, char ** argv){

// load the model
std::tie(model, ctx) = llama_init_from_gpt_params(params);
llama_set_rng_seed(ctx, params.seed);
GGML_ASSERT(llama_n_vocab(model) < (1 << 16));

// tokenize the prompt
const bool add_bos = llama_should_add_bos_token(model);
@@ -48,35 +47,6 @@ int main(int argc, char ** argv){
std::vector<llama_token> inp;
inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);

llama_ngram_cache ngram_cache_context;
llama_ngram_cache ngram_cache_dynamic;
llama_ngram_cache ngram_cache_static;
int64_t t_draft_flat_us = 0;
int64_t t_draft_us = 0;

{
// Fill up context ngram cache with tokens from user input:
const int64_t t_start_draft_us = ggml_time_us();
llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, inp.size(), false);

if (!params.lookup_cache_static.empty()) {
try {
ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
} catch (std::ifstream::failure const &) {
fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
exit(1);
}
}

if (!params.lookup_cache_dynamic.empty()) {
try {
ngram_cache_dynamic = llama_ngram_cache_load(params.lookup_cache_dynamic);
} catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
}

t_draft_flat_us += ggml_time_us() - t_start_draft_us;
}

const int max_context_size = llama_n_ctx(ctx);
const int max_tokens_list_size = max_context_size - 4;

@@ -106,6 +76,8 @@ int main(int argc, char ** argv){
int n_drafted = 0;
int n_accept = 0;

int64_t t_draft_us = 0;

int n_past = inp.size();

bool has_eos = false;
@@ -157,12 +129,6 @@ int main(int argc, char ** argv){
++n_past;
++i_dft;
inp.push_back(id);
{
// Update context ngram cache with the newly accepted token:
const int64_t t_start_draft_us = ggml_time_us();
llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
t_draft_us += ggml_time_us() - t_start_draft_us;
}

if (params.use_color) {
// color accepted draft token
@@ -183,12 +149,6 @@ int main(int argc, char ** argv){
draft.clear();
draft.push_back(id);
inp.push_back(id);
{
// Update context ngram cache with the newly accepted token:
const int64_t t_start_draft_us = ggml_time_us();
llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
t_draft_us += ggml_time_us() - t_start_draft_us;
}
break;
}

@@ -203,19 +163,44 @@ int main(int argc, char ** argv){
llama_batch_clear(batch_tgt);
llama_batch_add(batch_tgt, draft[0], n_past, { 0 }, true);

// Draft already contains a single token sampled from the model:
GGML_ASSERT(draft.size() == 1);
GGML_ASSERT(draft[0] == inp.back());
// generate n_pred tokens through prompt lookup
auto prompt_lookup = [&]() -> void {
const int inp_size = inp.size();
for (int ngram_size = ngram_max ; ngram_size > ngram_min; --ngram_size){
const llama_token * ngram = &inp[inp_size - ngram_size];

for (int i = 0; i <= (int) inp_size - (ngram_size * 2); ++i) {
bool match = true;
for (int j = 0; j < ngram_size; ++j) {
if (inp[i + j] != ngram[j]) {
match = false;
break;
}
}

if (match) {
const int startIdx = i + ngram_size;
const int endIdx = startIdx + n_draft;
if (endIdx < inp_size) {
for (int j = startIdx; j < endIdx; ++j) {
LOG(" - draft candidate %d: %d\n", j, inp[j]);
draft.push_back(inp[j]);
llama_batch_add(batch_tgt, inp[j], n_past + (j - startIdx) + 1, { 0 }, true);
++n_drafted;
}
return;
}
}
}
}
return;
};

const int64_t t_start_draft_us = ggml_time_us();

llama_ngram_cache_draft(inp, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);

for (size_t i = 1; i < draft.size(); ++i) {
llama_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true);
}
prompt_lookup();

t_draft_us += ggml_time_us() - t_start_draft_us;
n_drafted += draft.size() - 1;

llama_decode(ctx, batch_tgt);
++n_past;
@@ -225,24 +210,19 @@ int main(int argc, char ** argv){

auto t_dec_end = ggml_time_us();

// Update dynamic ngram cache with context ngram cache and save it to disk:
llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
llama_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic);

LOG_TEE("\n\n");

LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));

LOG_TEE("\n");
LOG_TEE("n_draft = %d\n", n_draft);
LOG_TEE("n_predict = %d\n", n_predict);
LOG_TEE("n_drafted = %d\n", n_drafted);
LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
LOG_TEE("n_draft = %d\n", n_draft);
LOG_TEE("n_predict = %d\n", n_predict);
LOG_TEE("n_drafted = %d\n", n_drafted);
LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
LOG_TEE("n_accept = %d\n", n_accept);
LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
LOG_TEE("n_accept = %d\n", n_accept);
LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);

LOG_TEE("\ntarget:\n");
llama_print_timings(ctx);

@@ -189,18 +189,6 @@ static void prepare_imatrix(const std::string& imatrix_file,
}
}

static ggml_type parse_ggml_type(const char * arg) {
ggml_type result = GGML_TYPE_COUNT;
for (int j = 0; j < GGML_TYPE_COUNT; ++j) {
auto type = ggml_type(j);
const auto * name = ggml_type_name(type);
if (name && strcmp(arg, name) == 0) {
result = type; break;
}
}
return result;
}

int main(int argc, char ** argv) {
if (argc < 3) {
usage(argv[0]);
@@ -215,18 +203,6 @@ int main(int argc, char ** argv) {
for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
params.quantize_output_tensor = false;
} else if (strcmp(argv[arg_idx], "--output-tensor-type") == 0) {
if (arg_idx < argc-1) {
params.output_tensor_type = parse_ggml_type(argv[++arg_idx]);
} else {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--token-embedding-type") == 0) {
if (arg_idx < argc-1) {
params.token_embedding_type = parse_ggml_type(argv[++arg_idx]);
} else {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
params.allow_requantize = true;
} else if (strcmp(argv[arg_idx], "--pure") == 0) {

File diff suppressed because it is too large
@@ -48,7 +48,7 @@ export class SchemaConverter {
}

_formatLiteral(literal) {
const escaped = literal.replace(
const escaped = JSON.stringify(literal).replace(
GRAMMAR_LITERAL_ESCAPE_RE,
m => GRAMMAR_LITERAL_ESCAPES[m]
);
@@ -327,7 +327,10 @@ export class SchemaConverter {
}

_generateConstantRule(value) {
return this._formatLiteral(JSON.stringify(value));
if (typeof value !== 'string') {
throw new Error('Only string constants are supported, got ' + JSON.stringify(value));
}
return this._formatLiteral(value);
}

visit(schema, name) {
@@ -343,6 +346,9 @@ export class SchemaConverter {
} else if (Array.isArray(schemaType)) {
return this._addRule(ruleName, this._generateUnionRule(name, schemaType.map(t => ({ type: t }))));
} else if ('const' in schema) {
if (typeof schema.const !== 'string') {
throw new Error('Only string constants are supported, got ' + JSON.stringify(schema.const));
}
return this._addRule(ruleName, this._generateConstantRule(schema.const));
} else if ('enum' in schema) {
const rule = schema.enum.map(v => this._generateConstantRule(v)).join(' | ');
@@ -451,7 +457,7 @@ export class SchemaConverter {
const propRuleName = this.visit(propSchema, `${name ?? ''}${name ? '-' : ''}${propName}`);
propKvRuleNames[propName] = this._addRule(
`${name ?? ''}${name ? '-' : ''}${propName}-kv`,
`${this._formatLiteral(JSON.stringify(propName))} space ":" space ${propRuleName}`
`${this._formatLiteral(propName)} space ":" space ${propRuleName}`
);
}
const requiredProps = sortedProps.filter(k => required.has(k));

@@ -30,7 +30,7 @@
#include <signal.h>
#include <memory>

using json = nlohmann::ordered_json;
using json = nlohmann::json;

bool server_verbose = false;
bool server_log_json = true;

@@ -12,7 +12,7 @@

#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"

using json = nlohmann::ordered_json;
using json = nlohmann::json;

// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11
enum error_type {
@@ -95,8 +95,8 @@ static inline void server_log(const char *level, const char *function, int line,

const std::string str = ss.str();
printf("%.*s\n", (int)str.size(), str.data());
fflush(stdout);
}
fflush(stdout);
}

//

28
ggml-cuda.cu
@@ -771,11 +771,7 @@ GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t
if (src_ctx->device == dst_ctx->device) {
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice, cudaStreamPerThread));
} else {
#ifdef GGML_CUDA_NO_PEER_COPY
return false;
#else
CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, dst_ctx->device, src->data, src_ctx->device, ggml_nbytes(src), cudaStreamPerThread));
#endif
}
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
return true;
@@ -11326,23 +11322,19 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
GGML_ASSERT(cuda_ctx_src->device == buf_ctx_src->device);
GGML_ASSERT(cuda_ctx_dst->device == buf_ctx_dst->device);

// copy on src stream
if (cuda_ctx_src->device == cuda_ctx_dst->device) {
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream()));
} else {
#ifdef GGML_CUDA_NO_PEER_COPY
return false;
#else
CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream()));
#endif
}

// record event on src stream
if (!cuda_ctx_src->copy_event) {
ggml_cuda_set_device(cuda_ctx_src->device);
CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming));
}

// copy on src stream
if (cuda_ctx_src->device == cuda_ctx_dst->device) {
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream()));
} else {
CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream()));
}

// record event on src stream
CUDA_CHECK(cudaEventRecord(cuda_ctx_src->copy_event, cuda_ctx_src->stream()));

// wait on dst stream for the copy to complete
@@ -11538,9 +11530,6 @@ GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const
}

static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend) {
#ifdef GGML_CUDA_NO_PEER_COPY
return nullptr;
#else
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

ggml_cuda_set_device(cuda_ctx->device);
@@ -11552,7 +11541,6 @@ static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend)
/* .backend = */ backend,
/* .context = */ event,
};
#endif
}

static void ggml_backend_cuda_event_free(ggml_backend_event_t event) {

529
llama.cpp
@@ -52,9 +52,6 @@
#define NOMINMAX
#endif
#include <windows.h>
#ifndef PATH_MAX
#define PATH_MAX MAX_PATH
#endif
#include <io.h>
#endif

@@ -293,10 +290,6 @@ enum llm_kv {
LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
LLM_KV_ROPE_SCALING_FINETUNED,

LLM_KV_SPLIT_NO,
LLM_KV_SPLIT_COUNT,
LLM_KV_SPLIT_TENSORS_COUNT,

LLM_KV_SSM_INNER_SIZE,
LLM_KV_SSM_CONV_KERNEL,
LLM_KV_SSM_STATE_SIZE,
@@ -362,10 +355,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
{ LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },

{ LLM_KV_SPLIT_NO, "split.no" },
{ LLM_KV_SPLIT_COUNT, "split.count" },
{ LLM_KV_SPLIT_TENSORS_COUNT, "split.tensors.count" },

{ LLM_KV_SSM_CONV_KERNEL, "%s.ssm.conv_kernel" },
{ LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" },
{ LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" },
@@ -1110,7 +1099,6 @@ struct llama_file {
}
}
};
using llama_files = std::vector<std::unique_ptr<llama_file>>;

struct llama_mmap {
void * addr;
@@ -1311,7 +1299,6 @@ struct llama_mmap {
}
#endif
};
using llama_mmaps = std::vector<std::unique_ptr<llama_mmap>>;

// Represents some region of memory being locked using mlock or VirtualLock;
// will automatically unlock on destruction.
@@ -1461,7 +1448,6 @@ struct llama_mlock {
static void raw_unlock(const void * addr, size_t len) {}
#endif
};
using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;

static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
std::vector<char> result(8, 0);
@@ -2037,12 +2023,12 @@ struct llama_model {
// the model memory buffers for the tensor data
std::vector<ggml_backend_buffer_t> bufs;

// model memory mapped files
llama_mmaps mappings;
// model memory mapped file
std::unique_ptr<llama_mmap> mapping;

// objects representing data potentially being locked in memory
llama_mlocks mlock_bufs;
llama_mlocks mlock_mmaps;
std::vector<std::unique_ptr<llama_mlock>> mlock_bufs;
llama_mlock mlock_mmap;

// for quantize-stats only
std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
@@ -2806,8 +2792,6 @@ namespace GGUFMeta {
};
}

using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;

struct llama_model_loader {
int n_kv = 0;
int n_tensors = 0;
@@ -2818,133 +2802,54 @@ struct llama_model_loader {

bool use_mmap = false;

llama_files files;
llama_file file;
llama_ftype ftype;
llama_fver fver;

llama_mmaps mappings;

// Holds information on a model weights
struct llama_tensor_weights {
uint16_t idx; // source file index
size_t offs; // tensor data offset in the original file

ggml_tensor * tensor;

llama_tensor_weights(uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
}
};
std::vector<llama_tensor_weights> weights;

std::unique_ptr<llama_mmap> mapping;
std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;

struct gguf_context * meta = NULL;
std::vector<ggml_context *> contexts;
struct gguf_context * ctx_gguf = NULL;
struct ggml_context * ctx_meta = NULL;

std::string arch_name;
LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);

llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) {
llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) : file(fname.c_str(), "rb") {
int trace = 0;
if (getenv("LLAMA_TRACE")) {
trace = atoi(getenv("LLAMA_TRACE"));
}

struct gguf_init_params params = {
/*.no_alloc = */ true,
/*.ctx = */ &ctx_meta,
};

if (param_overrides_p != nullptr) {
for (const struct llama_model_kv_override *p = param_overrides_p; p->key[0] != 0; p++) {
kv_overrides.insert({std::string(p->key), *p});
}
}

struct ggml_context * ctx = NULL;
struct gguf_init_params params = {
/*.no_alloc = */ true,
/*.ctx = */ &ctx,
};

meta = gguf_init_from_file(fname.c_str(), params);
if (!meta) {
ctx_gguf = gguf_init_from_file(fname.c_str(), params);
if (!ctx_gguf) {
throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str()));
}

get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
llm_kv = LLM_KV(llm_arch_from_string(arch_name));

// Save tensors data offset of the main file.
// For subsidiary files, `meta` tensor data offset must not be used,
// so we build a unified tensors index for weights.
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
weights.emplace_back(llama_tensor_weights(0, cur->name, meta, cur));
}
files.emplace_back(new llama_file(fname.c_str(), "rb"));
contexts.emplace_back(ctx);
n_kv = gguf_get_n_kv(ctx_gguf);
n_tensors = gguf_get_n_tensors(ctx_gguf);

uint16_t n_split = 0;
get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
fver = (enum llama_fver ) gguf_get_version(ctx_gguf);

// Load additional GGML contexts
if (n_split > 1) {
uint16_t idx = 0;
get_key(llm_kv(LLM_KV_SPLIT_NO), idx);
if (idx != 0) {
throw std::runtime_error(format("illegal split file: %d, model must be loaded with the first split", idx));
}

char split_prefix[PATH_MAX] = {0};
if (!llama_split_prefix(split_prefix, sizeof(split_prefix), fname.c_str(), idx, n_split)) {
throw std::runtime_error(format("invalid split file: %s", fname.c_str()));
}

if (trace > 0) {
LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split);
}

char split_path[PATH_MAX] = {0};
for (idx = 1; idx < n_split; idx++) {
llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split);

struct gguf_init_params split_params = {
/*.no_alloc = */ true,
/*.ctx = */ &ctx,
};
struct gguf_context * ctx_gguf = gguf_init_from_file(split_path, split_params);
if (!ctx_gguf) {
throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path));
}

// Save tensors data offset info of the shard.
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
weights.emplace_back(llama_tensor_weights(idx, cur->name, ctx_gguf, cur));
}
files.emplace_back(new llama_file(split_path, "rb"));
contexts.emplace_back(ctx);

gguf_free(ctx_gguf);
}

get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors);

// sanity check
{
const int n_tensors_loaded = (int) weights.size();
if (n_tensors != n_tensors_loaded) {
throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded));
}
}

LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split);
}

n_kv = gguf_get_n_kv(meta);
n_tensors = weights.size();

fver = (enum llama_fver) gguf_get_version(meta);

for (auto & w : weights) {
n_elements += ggml_nelements(w.tensor);
n_bytes += ggml_nbytes(w.tensor);
for (int i = 0; i < n_tensors; i++) {
const char * name = gguf_get_tensor_name(ctx_gguf, i);
struct ggml_tensor * t = ggml_get_tensor(ctx_meta, name);
n_elements += ggml_nelements(t);
n_bytes += ggml_nbytes(t);
}

LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
@@ -2959,8 +2864,7 @@ struct llama_model_loader {
enum ggml_type type_max = GGML_TYPE_F32;

for (int i = 0; i < n_tensors; i++) {
const ggml_tensor * tensor = weights.at(i).tensor;
enum ggml_type type = tensor->type;
enum ggml_type type = gguf_get_tensor_type(ctx_gguf, i);

n_type[type]++;

@@ -2970,8 +2874,8 @@ struct llama_model_loader {
}

if (trace > 0) {
const uint16_t sid = weights.at(i).idx;
LLAMA_LOG_INFO("%s: - tensor %4d, split %2d: %32s %-8s [ %s ]\n", __func__, i, sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str());
struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(meta), ggml_type_name(type), llama_format_tensor_shape(meta).c_str());
}
}

@@ -3007,23 +2911,22 @@ struct llama_model_loader {
ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);

{
const int kid = gguf_find_key(meta, "general.file_type");
const int kid = gguf_find_key(ctx_gguf, "general.file_type");
if (kid >= 0) {
ftype = (llama_ftype) gguf_get_val_u32(meta, kid);
ftype = (llama_ftype) gguf_get_val_u32(ctx_gguf, kid);
}
}

LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);

for (int i = 0; i < n_kv; i++) {
const char * name = gguf_get_key(meta, i);
const enum gguf_type type = gguf_get_kv_type(meta, i);
const char * name = gguf_get_key(ctx_gguf, i);
const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
const std::string type_name =
type == GGUF_TYPE_ARRAY
? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta, i)), gguf_get_arr_n(meta, i))
? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx_gguf, i)), gguf_get_arr_n(ctx_gguf, i))
: gguf_type_name(type);

std::string value = gguf_kv_to_str(meta, i);
std::string value = gguf_kv_to_str(ctx_gguf, i);
const size_t MAX_VALUE_LEN = 40;
if (value.size() > MAX_VALUE_LEN) {
value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
@@ -3052,18 +2955,18 @@ struct llama_model_loader {
}

~llama_model_loader() {
if (meta) {
gguf_free(meta);
if (ctx_gguf) {
gguf_free(ctx_gguf);
}
for (auto * ctx : contexts) {
ggml_free(ctx);
if (ctx_meta) {
ggml_free(ctx_meta);
}
}

template<typename T>
typename std::enable_if<std::is_integral<T>::value, bool>::type
get_arr_n(const std::string & key, T & result, const bool required = true) {
const int kid = gguf_find_key(meta, key.c_str());
const int kid = gguf_find_key(ctx_gguf, key.c_str());

if (kid < 0) {
if (required) {
@@ -3073,7 +2976,7 @@ struct llama_model_loader {
}

struct GGUFMeta::ArrayInfo arr_info =
GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);
GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx_gguf, kid);

result = arr_info.length;
@@ -3093,7 +2996,7 @@ struct llama_model_loader {
const struct llama_model_kv_override * override =
it != kv_overrides.end() ? &it->second : nullptr;

const bool found = GGUFMeta::GKV<T>::set(meta, key, result, override);
const bool found = GGUFMeta::GKV<T>::set(ctx_gguf, key, result, override);

if (required && !found) {
throw std::runtime_error(format("key not found in model: %s", key.c_str()));
@@ -3116,33 +3019,20 @@ struct llama_model_loader {
}

const char * get_tensor_name(int i) const {
return weights.at(i).tensor->name;
}

const llama_tensor_weights & get_weights(const char * name) const {
for (const auto & weight : weights) {
if (strcmp(name, weight.tensor->name) == 0) {
return weight;
}
}
throw std::runtime_error(format("tensor %s not found", name));
return gguf_get_tensor_name(ctx_gguf, i);
}

struct ggml_tensor * get_tensor_meta(const char * name) const {
try {
return get_weights(name).tensor;
} catch (const std::runtime_error & e) {
return NULL;
}
return ggml_get_tensor(ctx_meta, name);
}

struct ggml_tensor * get_tensor_meta(int i) const {
return get_tensor_meta(get_tensor_name(i));
}

struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur) {
struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
ggml_set_name(tensor, ggml_get_name(cur));
struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta) {
|
||||
struct ggml_tensor * tensor = ggml_dup_tensor(ctx, meta);
|
||||
ggml_set_name(tensor, ggml_get_name(meta));
|
||||
|
||||
n_created++;
|
||||
|
||||
@@ -3150,7 +3040,7 @@ struct llama_model_loader {
|
||||
}
|
||||
|
||||
struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, bool required = true) {
|
||||
const struct ggml_tensor * cur = get_tensor_meta(name.c_str());
|
||||
struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
|
||||
|
||||
if (cur == NULL) {
|
||||
if (!required) {
|
||||
@@ -3185,79 +3075,76 @@ struct llama_model_loader {
}
}

void init_mappings(bool prefetch = true, llama_mlocks * mlock_mmaps = nullptr) {
size_t file_offset(const char * name) const {
const int idx = gguf_find_tensor(ctx_gguf, name);

if (idx < 0) {
throw std::runtime_error(format("%s: tensor '%s' not found in the file", __func__, name));
}

return gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx);
}

void init_mapping(bool prefetch = true, llama_mlock * lmlock = nullptr) {
// prefetch the whole file - all the data is needed anyway
if (use_mmap) {
mappings.reserve(files.size());
mmaps_used.reserve(files.size());
for (const auto & file : files) {
std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa()));
mmaps_used.emplace_back(std::make_pair(mapping->size, 0));
if (mlock_mmaps) {
std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
mlock_mmap->init(mapping->addr);
mlock_mmaps->emplace_back(std::move(mlock_mmap));
}
mappings.emplace_back(std::move(mapping));
}
mapping.reset(new llama_mmap(&file, prefetch ? -1 : 0, ggml_is_numa()));
}

// compute the total size of all tensors for progress reporting
for (auto & w : weights) {
size_data += ggml_nbytes(w.tensor);
for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
size_data += ggml_nbytes(cur);
}

if (use_mmap && mapping) {
if (lmlock) {
lmlock->init(mapping->addr);
}
mmap_used_first = mapping->size;
}
}

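file_offset() above reduces to one piece of arithmetic: the absolute position of a tensor's bytes is the offset of the GGUF data section plus the per-tensor offset inside that section. A hedged sketch of reading one tensor straight from disk with plain stdio, assuming the gguf/ggml contexts were opened with no_alloc = true as in the loader (the function name and error handling are illustrative):

    #include "ggml.h"

    #include <cstdio>
    #include <cstdint>
    #include <vector>

    // sketch: read the raw bytes of one tensor using the same offset arithmetic as file_offset()
    static std::vector<uint8_t> read_tensor_bytes(FILE * f, struct gguf_context * ctx_gguf, struct ggml_context * ctx_meta, const char * name) {
        const int idx = gguf_find_tensor(ctx_gguf, name);
        if (idx < 0) {
            return {};
        }

        // absolute file position = start of the data section + offset of this tensor within it
        const size_t offs = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx);

        struct ggml_tensor * t = ggml_get_tensor(ctx_meta, name);
        if (!t) {
            return {};
        }
        std::vector<uint8_t> buf(ggml_nbytes(t));

        fseek(f, (long) offs, SEEK_SET);
        if (fread(buf.data(), 1, buf.size(), f) != buf.size()) {
            buf.clear();
        }
        return buf;
    }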
void get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const {
GGML_ASSERT(!mappings.empty());
const auto & mapping = mappings.at(idx);
void get_mapping_range(size_t * first, size_t * last, ggml_context * ctx) const {
GGML_ASSERT(mapping);

*first = mapping->size;
*last  = 0;
*addr = mapping->addr;
for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
const auto & w = get_weights(ggml_get_name(tensor));
if (w.idx != idx) {
continue;
}
*first = std::min(*first, w.offs);
*last = std::max(*last, w.offs + ggml_nbytes(tensor));
const size_t offs = file_offset(ggml_get_name(tensor));
*first = std::min(*first, offs);
*last  = std::max(*last, offs + ggml_nbytes(tensor));
}
}

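get_mapping_range() computes the [first, last) byte window of a mapping that is actually touched by the tensors of a context; later in this diff that window is handed to ggml_backend_cpu_buffer_from_ptr() so that only the used part of the mmap is exposed as a backend buffer. A small sketch of that wrapping step, with placeholder names standing in for llama_mmap and the loader:

    #include "ggml-backend.h"

    // sketch: expose only the used [first, last) window of a whole-file mapping as a CPU backend buffer
    static ggml_backend_buffer_t buffer_over_mapping_range(void * mapping_addr, size_t first, size_t last) {
        if (first >= last) {
            return nullptr; // nothing in this mapping is used by the context
        }
        return ggml_backend_cpu_buffer_from_ptr((char *) mapping_addr + first, last - first);
    }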
// for backwards compatibility, does not support ggml-backend
void load_data_for(struct ggml_tensor * cur) const {
const auto & w = get_weights(ggml_get_name(cur));
const size_t offs = file_offset(ggml_get_name(cur));

if (use_mmap) {
const auto & mapping = mappings.at(w.idx);
if (use_mmap && mapping) {
if (cur->data == nullptr) {
cur->data = (uint8_t *)mapping->addr + w.offs;
cur->data = (uint8_t *)mapping->addr + offs;
} else {
memcpy(cur->data, (uint8_t *)mapping->addr + w.offs, ggml_nbytes(cur));
memcpy(cur->data, (uint8_t *)mapping->addr + offs, ggml_nbytes(cur));
}
} else {
GGML_ASSERT(cur->data != nullptr);
GGML_ASSERT(w.idx < files.size());
const auto & file = files.at(w.idx);
file->seek(w.offs, SEEK_SET);
file->read_raw(cur->data, ggml_nbytes(cur));
file.seek(offs, SEEK_SET);
file.read_raw(cur->data, ggml_nbytes(cur));
}
}

size_t size_done = 0;
|
||||
size_t size_data = 0;
|
||||
std::vector<std::pair<size_t, size_t>> mmaps_used;
|
||||
size_t mmap_used_first = -1;
|
||||
size_t mmap_used_last = 0;
|
||||
|
||||
// Returns false if cancelled by progress_callback
|
||||
bool load_all_data(
|
||||
struct ggml_context * ctx,
|
||||
llama_buf_map & bufs_mmap,
|
||||
llama_mlocks * lmlocks,
|
||||
llama_progress_callback progress_callback,
|
||||
void * progress_callback_user_data) {
|
||||
GGML_ASSERT(size_data != 0 && "call init_mappings() first");
|
||||
bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) {
|
||||
GGML_ASSERT(size_data != 0 && "call init_mapping() first");
|
||||
|
||||
std::vector<no_init<uint8_t>> read_buf;
|
||||
|
||||
for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
|
||||
if (progress_callback) {
|
||||
if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
|
||||
@@ -3265,57 +3152,41 @@ struct llama_model_loader {
|
||||
}
|
||||
}
|
||||
|
||||
const auto & w = get_weights(ggml_get_name(cur));
|
||||
size_t n_size = ggml_nbytes(cur);
|
||||
const size_t offs = file_offset(ggml_get_name(cur));
|
||||
|
||||
if (use_mmap) {
|
||||
const auto & mapping = mappings.at(w.idx);
|
||||
ggml_backend_buffer_t buf_mmap = nullptr;
|
||||
if (bufs_mmap.count(w.idx)) {
|
||||
buf_mmap = bufs_mmap.at(w.idx);
|
||||
}
|
||||
GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
|
||||
if (use_mmap && mapping) {
|
||||
if (buf_mmap && cur->data == nullptr) {
|
||||
ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + w.offs);
|
||||
if (lmlocks) {
|
||||
const auto & lmlock = lmlocks->at(w.idx);
|
||||
lmlock->grow_to(w.offs + ggml_nbytes(cur));
|
||||
ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs);
|
||||
if (lmlock) {
|
||||
lmlock->grow_to(offs + ggml_nbytes(cur));
|
||||
}
|
||||
|
||||
auto & mmap_used = mmaps_used[w.idx];
|
||||
mmap_used.first = std::min(mmap_used.first, w.offs);
|
||||
mmap_used.second = std::max(mmap_used.second, w.offs + n_size);
|
||||
mmap_used_first = std::min(mmap_used_first, offs);
|
||||
mmap_used_last = std::max(mmap_used_last, offs + ggml_nbytes(cur));
|
||||
} else {
|
||||
ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + w.offs, 0, n_size);
|
||||
ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0, ggml_nbytes(cur));
|
||||
}
|
||||
} else {
|
||||
GGML_ASSERT(w.idx < files.size());
|
||||
const auto & file = files.at(w.idx);
|
||||
if (ggml_backend_buffer_is_host(cur->buffer)) {
|
||||
file->seek(w.offs, SEEK_SET);
|
||||
file->read_raw(cur->data, ggml_nbytes(cur));
|
||||
file.seek(offs, SEEK_SET);
|
||||
file.read_raw(cur->data, ggml_nbytes(cur));
|
||||
} else {
|
||||
read_buf.resize(ggml_nbytes(cur));
|
||||
file->seek(w.offs, SEEK_SET);
|
||||
file->read_raw(read_buf.data(), ggml_nbytes(cur));
|
||||
ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
|
||||
file.seek(offs, SEEK_SET);
|
||||
file.read_raw(read_buf.data(), ggml_nbytes(cur));
|
||||
ggml_backend_tensor_set(cur, read_buf.data(), 0, ggml_nbytes(cur));
|
||||
}
|
||||
}
|
||||
|
||||
size_done += n_size;
|
||||
size_done += ggml_nbytes(cur);
|
||||
}
|
||||
|
||||
// check if this is the last call and do final cleanup
|
||||
if (size_done >= size_data) {
|
||||
// unmap offloaded tensors and metadata
|
||||
if (use_mmap) {
|
||||
for (uint32_t idx = 0; idx < mappings.size(); idx++) {
|
||||
const auto & mmap_used = mmaps_used.at(idx);
|
||||
auto & mapping = mappings.at(idx);
|
||||
mapping->unmap_fragment(0, mmap_used.first);
|
||||
if (mmap_used.second != 0) {
|
||||
mapping->unmap_fragment(mmap_used.second, mapping->size);
|
||||
}
|
||||
if (use_mmap && mapping) {
|
||||
mapping->unmap_fragment(0, mmap_used_first);
|
||||
if (mmap_used_last != 0) {
|
||||
mapping->unmap_fragment(mmap_used_last, mapping->size);
|
||||
}
|
||||
}
|
||||
if (progress_callback) {
|
||||
@@ -3448,7 +3319,7 @@ static void llm_load_hparams(
|
||||
llama_model_loader & ml,
|
||||
llama_model & model) {
|
||||
auto & hparams = model.hparams;
|
||||
const gguf_context * ctx = ml.meta;
|
||||
const gguf_context * ctx = ml.ctx_gguf;
|
||||
|
||||
// get metadata as string
|
||||
for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
|
||||
@@ -3838,7 +3709,7 @@ static void llm_load_vocab(
|
||||
llama_model & model) {
|
||||
auto & vocab = model.vocab;
|
||||
|
||||
struct gguf_context * ctx = ml.meta;
|
||||
struct gguf_context * ctx = ml.ctx_gguf;
|
||||
|
||||
const auto kv = LLM_KV(model.arch);
|
||||
|
||||
@@ -4448,8 +4319,10 @@ static bool llm_load_tensors(
|
||||
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
||||
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
|
||||
|
||||
layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, false);
|
||||
layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, false);
|
||||
if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) {
|
||||
layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd});
|
||||
layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd});
|
||||
}
|
||||
|
||||
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
|
||||
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
||||
@@ -5151,97 +5024,56 @@ static bool llm_load_tensors(
|
||||
|
||||
ml.done_getting_tensors();
|
||||
|
||||
ml.init_mappings(true, &model.mlock_mmaps);
|
||||
model.mappings.reserve(ml.mappings.size());
|
||||
ml.init_mapping(true, use_mlock ? &model.mlock_mmap : nullptr);
|
||||
|
||||
// create the backend buffers
|
||||
std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
|
||||
ctx_bufs.reserve(ctx_map.size());
|
||||
|
||||
// Ensure we have enough capacity for the maximum backend buffer we will potentially create
|
||||
size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
|
||||
model.bufs.reserve(n_max_backend_buffer);
|
||||
std::vector<std::pair<ggml_context *, ggml_backend_buffer_t>> ctx_bufs;
|
||||
|
||||
for (auto & it : ctx_map) {
|
||||
ggml_backend_buffer_type_t buft = it.first;
|
||||
ggml_context * ctx = it.second;
|
||||
|
||||
llama_buf_map bufs;
|
||||
bufs.reserve(n_max_backend_buffer);
|
||||
ggml_context * ctx = it.second;
|
||||
ggml_backend_buffer_t buf = nullptr;
|
||||
|
||||
// only the mmap region containing the tensors in the model is mapped to the backend buffer
|
||||
// this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
|
||||
// this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
|
||||
if (ml.use_mmap && buft == llama_default_buffer_type_cpu(true)) {
|
||||
for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
|
||||
void * addr = nullptr;
|
||||
size_t first, last;
|
||||
ml.get_mapping_range(&first, &last, &addr, idx, ctx);
|
||||
if (first >= last) {
|
||||
continue;
|
||||
}
|
||||
ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr((char *) addr + first, last - first);
|
||||
if (buf == nullptr) {
|
||||
throw std::runtime_error("unable to allocate backend CPU buffer");
|
||||
}
|
||||
model.bufs.push_back(buf);
|
||||
bufs.emplace(idx, buf);
|
||||
size_t first, last;
|
||||
ml.get_mapping_range(&first, &last, ctx);
|
||||
buf = ggml_backend_cpu_buffer_from_ptr((char *) ml.mapping->addr + first, last - first);
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
if (n_layer >= n_gpu_layers) {
|
||||
ggml_backend_cuda_register_host_buffer(
|
||||
if (n_layer >= n_gpu_layers) {
|
||||
ggml_backend_cuda_register_host_buffer(
|
||||
ggml_backend_buffer_get_base(buf),
|
||||
ggml_backend_buffer_get_size(buf));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
}
|
||||
#ifdef GGML_USE_METAL
|
||||
else if (ml.use_mmap && buft == ggml_backend_metal_buffer_type()) {
|
||||
for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
|
||||
const size_t max_size = ggml_get_max_tensor_size(ctx);
|
||||
void * addr = nullptr;
|
||||
size_t first, last;
|
||||
ml.get_mapping_range(&first, &last, &addr, idx, ctx);
|
||||
if (first >= last) {
|
||||
continue;
|
||||
}
|
||||
ggml_backend_buffer_t buf = ggml_backend_metal_buffer_from_ptr((char *) addr + first, last - first, max_size);
|
||||
if (buf == nullptr) {
|
||||
throw std::runtime_error("unable to allocate backend metal buffer");
|
||||
}
|
||||
model.bufs.push_back(buf);
|
||||
bufs.emplace(idx, buf);
|
||||
}
|
||||
const size_t max_size = ggml_get_max_tensor_size(ctx);
|
||||
size_t first, last;
|
||||
ml.get_mapping_range(&first, &last, ctx);
|
||||
buf = ggml_backend_metal_buffer_from_ptr((char *) ml.mapping->addr + first, last - first, max_size);
|
||||
}
|
||||
#endif
|
||||
else {
|
||||
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
|
||||
if (buf == nullptr) {
|
||||
throw std::runtime_error("unable to allocate backend buffer");
|
||||
}
|
||||
model.bufs.push_back(buf);
|
||||
if (use_mlock && ggml_backend_buffer_is_host(buf)) {
|
||||
buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
|
||||
if (buf != nullptr && use_mlock && ggml_backend_buffer_is_host(buf)) {
|
||||
model.mlock_bufs.emplace_back(new llama_mlock);
|
||||
auto & mlock_buf = model.mlock_bufs.back();
|
||||
mlock_buf->init (ggml_backend_buffer_get_base(buf));
|
||||
mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
|
||||
}
|
||||
for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
|
||||
bufs.emplace(idx, buf);
|
||||
}
|
||||
}
|
||||
|
||||
if (bufs.empty()) {
|
||||
if (buf == nullptr) {
|
||||
throw std::runtime_error("failed to allocate buffer");
|
||||
}
|
||||
|
||||
for (auto & buf : bufs) {
|
||||
// indicate that this buffer contains weights
|
||||
// this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight
|
||||
ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
|
||||
}
|
||||
|
||||
ctx_bufs.emplace_back(ctx, bufs);
|
||||
// indicate that this buffer contains weights
|
||||
// this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight
|
||||
ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
|
||||
model.bufs.push_back(buf);
|
||||
ctx_bufs.emplace_back(ctx, buf);
|
||||
}
|
||||
|
||||
if (llama_supports_gpu_offload()) {
|
||||
@@ -5273,15 +5105,13 @@ static bool llm_load_tensors(
|
||||
// load tensor data
|
||||
for (auto & it : ctx_bufs) {
|
||||
ggml_context * ctx = it.first;
|
||||
auto & bufs = it.second;
|
||||
if (!ml.load_all_data(ctx, bufs, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) {
|
||||
ggml_backend_buffer_t buf = it.second;
|
||||
if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf, use_mlock ? &model.mlock_mmap : NULL)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
for (auto & mapping : ml.mappings) {
|
||||
model.mappings.emplace_back(std::move(mapping));
|
||||
}
|
||||
model.mapping = std::move(ml.mapping);
|
||||
|
||||
// loading time will be recalculate after the first eval, so
|
||||
// we take page faults deferred by mmap() into consideration
|
||||
@@ -12141,34 +11971,27 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
||||
// for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
|
||||
// with the quantization of the output tensor
|
||||
if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) {
|
||||
if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
|
||||
new_type = qs.params->output_tensor_type;
|
||||
} else {
|
||||
int nx = tensor->ne[0];
|
||||
if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
|
||||
new_type = GGML_TYPE_Q8_0;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
|
||||
ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
|
||||
new_type = GGML_TYPE_Q5_K;
|
||||
}
|
||||
else if (new_type != GGML_TYPE_Q8_0) {
|
||||
new_type = GGML_TYPE_Q6_K;
|
||||
}
|
||||
int nx = tensor->ne[0];
|
||||
if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
|
||||
new_type = GGML_TYPE_Q8_0;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
|
||||
ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
|
||||
new_type = GGML_TYPE_Q5_K;
|
||||
}
|
||||
else if (new_type != GGML_TYPE_Q8_0) {
|
||||
new_type = GGML_TYPE_Q6_K;
|
||||
}
|
||||
} else if (name == "token_embd.weight") {
|
||||
if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
|
||||
new_type = qs.params->token_embedding_type;
|
||||
} else {
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
|
||||
new_type = GGML_TYPE_Q2_K;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
|
||||
new_type = GGML_TYPE_IQ3_S;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
|
||||
new_type = GGML_TYPE_IQ3_S;
|
||||
}
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
|
||||
ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
|
||||
new_type = GGML_TYPE_Q2_K;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
|
||||
new_type = GGML_TYPE_IQ3_S;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
|
||||
new_type = GGML_TYPE_IQ3_S;
|
||||
}
|
||||
} else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
|
||||
ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
|
||||
@@ -12204,7 +12027,13 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
|
||||
new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
|
||||
}
|
||||
else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 4) {
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
|
||||
new_type = GGML_TYPE_Q4_K;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
|
||||
new_type = GGML_TYPE_Q4_K;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
|
||||
new_type = GGML_TYPE_Q4_K;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
|
||||
@@ -12479,7 +12308,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
#endif
|
||||
|
||||
llama_model_loader ml(fname_inp, use_mmap, NULL);
|
||||
ml.init_mappings(false); // no prefetching?
|
||||
ml.init_mapping(false); // no prefetching?
|
||||
|
||||
llama_model model;
|
||||
llm_load_arch(ml, model);
|
||||
@@ -12503,12 +12332,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
struct gguf_context * ctx_out = gguf_init_empty();
|
||||
|
||||
// copy the KV pairs from the input file
|
||||
gguf_set_kv (ctx_out, ml.meta);
|
||||
gguf_set_kv (ctx_out, ml.ctx_gguf);
|
||||
gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
|
||||
gguf_set_val_u32(ctx_out, "general.file_type", ftype);
|
||||
|
||||
for (int i = 0; i < ml.n_tensors; ++i) {
|
||||
const struct ggml_tensor * meta = ml.get_tensor_meta(i);
|
||||
struct ggml_tensor * meta = ml.get_tensor_meta(i);
|
||||
|
||||
const std::string name = ggml_get_name(meta);
|
||||
|
||||
@@ -12548,7 +12377,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
|
||||
// populate the original tensors so we get an initial meta data
|
||||
for (int i = 0; i < ml.n_tensors; ++i) {
|
||||
const struct ggml_tensor * meta = ml.get_tensor_meta(i);
|
||||
struct ggml_tensor * meta = ml.get_tensor_meta(i);
|
||||
gguf_add_tensor(ctx_out, meta);
|
||||
}
|
||||
|
||||
@@ -12753,7 +12582,7 @@ static int llama_apply_lora_from_file_internal(
|
||||
if (path_base_model) {
|
||||
LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
|
||||
ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr));
|
||||
ml->init_mappings(/*prefetch*/ false); // no prefetching
|
||||
ml->init_mapping(/*prefetch*/ false); // no prefetching
|
||||
}
|
||||
|
||||
struct tensor_meta {
|
||||
@@ -12874,7 +12703,7 @@ static int llama_apply_lora_from_file_internal(
|
||||
|
||||
ggml_tensor * base_t;
|
||||
if (ml) {
|
||||
if (!ml->get_tensor_meta(base_name.c_str())) {
|
||||
if (gguf_find_tensor(ml->ctx_gguf, base_name.c_str()) < 0) {
|
||||
LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
|
||||
return 1;
|
||||
}
|
||||
@@ -13058,8 +12887,6 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
|
||||
struct llama_model_quantize_params result = {
|
||||
/*.nthread =*/ 0,
|
||||
/*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
|
||||
/*.output_tensor_type =*/ GGML_TYPE_COUNT,
|
||||
/*.token_embedding_type =*/ GGML_TYPE_COUNT,
|
||||
/*.allow_requantize =*/ false,
|
||||
/*.quantize_output_tensor =*/ true,
|
||||
/*.only_copy =*/ false,
|
||||
@@ -14824,30 +14651,6 @@ LLAMA_API int32_t llama_chat_apply_template(
|
||||
return res;
|
||||
}

LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) {
static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf";
if (snprintf(split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count)) {
return strlen(split_path);
}
return 0;
}

int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int split_no, int split_count) {
std::string str_split_path(split_path);
char postfix[32];
snprintf(postfix, 32, "-%05d-of-%05d.gguf", split_no + 1, split_count);
std::string str_postfix(postfix);

// check if dest ends with postfix
int size_prefix = str_split_path.size() - str_postfix.size();
if (size_prefix > 0 && str_split_path.find(str_postfix, size_prefix) != std::string::npos) {
snprintf(dest, std::min((size_t) size_prefix, maxlen), "%s", split_path);
return size_prefix;
}

return 0;
}

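A short usage sketch for the two helpers above. Split indices are zero-based at the API level and printed one-based by the format string, so the first shard ends in "-00001-of-000NN.gguf"; the buffer sizes and model path below are illustrative only:

    #include "llama.h"

    #include <cstdio>

    int main() {
        const int n_split = 4;
        char split_path[512];

        for (int i = 0; i < n_split; i++) {
            llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", i, n_split);
            printf("shard %d -> %s\n", i, split_path);
        }

        // recover the common prefix from a shard name; only succeeds if split_no/split_count match the name
        char split_prefix[512];
        if (llama_split_prefix(split_prefix, sizeof(split_prefix), split_path, n_split - 1, n_split) > 0) {
            printf("prefix   -> %s\n", split_prefix);
        }

        return 0;
    }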
struct llama_timings llama_get_timings(struct llama_context * ctx) {
|
||||
struct llama_timings result = {
|
||||
/*.t_start_ms =*/ 1e-3 * ctx->t_start_us,
|
||||
|
||||
26
llama.h
@@ -275,15 +275,13 @@ extern "C" {

// model quantization parameters
typedef struct llama_model_quantize_params {
int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
enum llama_ftype ftype; // quantize to this llama_ftype
enum ggml_type output_tensor_type; // output tensor type
enum ggml_type token_embedding_type; // itoken embeddings tensor type
bool allow_requantize; // allow quantizing non-f32/f16 tensors
bool quantize_output_tensor; // quantize output.weight
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
bool pure; // quantize all tensors to the default type
void * imatrix; // pointer to importance matrix data
int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
enum llama_ftype ftype; // quantize to this llama_ftype
bool allow_requantize; // allow quantizing non-f32/f16 tensors
bool quantize_output_tensor; // quantize output.weight
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
bool pure; // quantize all tensors to the default type
void * imatrix; // pointer to importance matrix data
} llama_model_quantize_params;

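For the variant of the struct that carries the per-tensor overrides (output_tensor_type / token_embedding_type), here is a hedged sketch of driving a quantization run; the file names and chosen types are examples only, not taken from this change:

    #include "llama.h"

    int main() {
        llama_backend_init();

        llama_model_quantize_params params = llama_model_quantize_default_params();

        params.nthread              = 8;
        params.ftype                = LLAMA_FTYPE_MOSTLY_Q4_K_M;
        params.output_tensor_type   = GGML_TYPE_Q6_K; // override just output.weight
        params.token_embedding_type = GGML_TYPE_Q8_0; // override just token_embd.weight

        // returns 0 on success
        const uint32_t rc = llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &params);

        llama_backend_free();
        return (int) rc;
    }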
// grammar types
@@ -962,16 +960,6 @@ extern "C" {
int32_t n_past,
int32_t n_predict);

/// @details Build a split GGUF final path for this chunk.
///          llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
//  Returns the split_path length.
LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count);

/// @details Extract the path prefix from the split_path if and only if the split_no and split_count match.
///          llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0"
//  Returns the split_prefix length.
LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count);

// Performance information
LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);


@@ -1,10 +0,0 @@
#!/bin/bash

wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip

echo "Usage:"
echo ""
echo " ./perplexity -m model.gguf -f wiki.test.raw [other params]"
echo ""

exit 0
@@ -90,7 +90,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
|
||||
test({
|
||||
FAILURE,
|
||||
"invalid type",
|
||||
"invalid type type",
|
||||
R"""({
|
||||
"type": 123
|
||||
})""",
|
||||
@@ -193,27 +193,21 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
});
|
||||
|
||||
test({
|
||||
SUCCESS,
|
||||
FAILURE,
|
||||
"non-string const",
|
||||
R"""({
|
||||
"const": 123
|
||||
})""",
|
||||
R"""(
|
||||
root ::= "123"
|
||||
space ::= " "?
|
||||
)"""
|
||||
""
|
||||
});
|
||||
|
||||
test({
|
||||
SUCCESS,
|
||||
FAILURE,
|
||||
"non-string enum",
|
||||
R"""({
|
||||
"enum": ["red", "amber", "green", null, 42, ["foo"]]
|
||||
"enum": [123]
|
||||
})""",
|
||||
R"""(
|
||||
root ::= "\"red\"" | "\"amber\"" | "\"green\"" | "null" | "42" | "[\"foo\"]"
|
||||
space ::= " "?
|
||||
)"""
|
||||
""
|
||||
});
|
||||
|
||||
test({
|
||||
@@ -384,18 +378,20 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
|
||||
test({
|
||||
SUCCESS,
|
||||
"required props in original order",
|
||||
"required props",
|
||||
R"""({
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"b": {"type": "string"},
|
||||
"c": {"type": "string"},
|
||||
"a": {"type": "string"}
|
||||
"a": {
|
||||
"type": "string"
|
||||
},
|
||||
"b": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"a",
|
||||
"b",
|
||||
"c"
|
||||
"b"
|
||||
],
|
||||
"additionalProperties": false,
|
||||
"definitions": {}
|
||||
@@ -403,8 +399,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
R"""(
|
||||
a-kv ::= "\"a\"" space ":" space string
|
||||
b-kv ::= "\"b\"" space ":" space string
|
||||
c-kv ::= "\"c\"" space ":" space string
|
||||
root ::= "{" space b-kv "," space c-kv "," space a-kv "}" space
|
||||
root ::= "{" space a-kv "," space b-kv "}" space
|
||||
space ::= " "?
|
||||
string ::= "\"" (
|
||||
[^"\\] |
|
||||
@@ -463,13 +458,13 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
|
||||
test({
|
||||
SUCCESS,
|
||||
"required + optional props each in original order",
|
||||
"required + optional props",
|
||||
R"""({
|
||||
"properties": {
|
||||
"b": {"type": "string"},
|
||||
"a": {"type": "string"},
|
||||
"d": {"type": "string"},
|
||||
"c": {"type": "string"}
|
||||
"b": {"type": "string"},
|
||||
"c": {"type": "string"},
|
||||
"d": {"type": "string"}
|
||||
},
|
||||
"required": ["a", "b"],
|
||||
"additionalProperties": false
|
||||
@@ -478,14 +473,14 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
a-kv ::= "\"a\"" space ":" space string
|
||||
b-kv ::= "\"b\"" space ":" space string
|
||||
c-kv ::= "\"c\"" space ":" space string
|
||||
c-rest ::= ( "," space d-kv )?
|
||||
d-kv ::= "\"d\"" space ":" space string
|
||||
d-rest ::= ( "," space c-kv )?
|
||||
root ::= "{" space b-kv "," space a-kv ( "," space ( d-kv d-rest | c-kv ) )? "}" space
|
||||
root ::= "{" space a-kv "," space b-kv ( "," space ( c-kv c-rest | d-kv ) )? "}" space
|
||||
space ::= " "?
|
||||
string ::= "\"" (
|
||||
[^"\\] |
|
||||
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
|
||||
)* "\"" space
|
||||
)* "\"" space
|
||||
)"""
|
||||
});
|
||||
|
||||
@@ -653,16 +648,16 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
"$ref": "#/definitions/MyType",
|
||||
"definitions": {
|
||||
"MyType": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"a": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"a"
|
||||
],
|
||||
"additionalProperties": false
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"a": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"a"
|
||||
],
|
||||
"additionalProperties": false
|
||||
}
|
||||
}
|
||||
})""",
|
||||
@@ -688,10 +683,10 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
],
|
||||
"definitions": {
|
||||
"foo": {
|
||||
"properties": {"a": {"type": "number"}}
|
||||
"properties": {"a": {"type": "number"}}
|
||||
},
|
||||
"bar": {
|
||||
"properties": {"b": {"type": "number"}}
|
||||
"properties": {"b": {"type": "number"}}
|
||||
}
|
||||
},
|
||||
"type": "object"
|
||||
@@ -725,16 +720,16 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
],
|
||||
"definitions": {
|
||||
"foo": {
|
||||
"properties": {"a": {"type": "number"}}
|
||||
"properties": {"a": {"type": "number"}}
|
||||
},
|
||||
"bar": {
|
||||
"properties": {"b": {"type": "number"}}
|
||||
"properties": {"b": {"type": "number"}}
|
||||
},
|
||||
"bam": {
|
||||
"properties": {"c": {"type": "number"}}
|
||||
"properties": {"c": {"type": "number"}}
|
||||
},
|
||||
"baz": {
|
||||
"properties": {"d": {"type": "number"}}
|
||||
"properties": {"d": {"type": "number"}}
|
||||
}
|
||||
},
|
||||
"type": "object"
|
||||
@@ -762,15 +757,15 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
"properties": {
|
||||
"number": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"root": {
|
||||
"type": "number"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"root"
|
||||
],
|
||||
"additionalProperties": false
|
||||
"properties": {
|
||||
"root": {
|
||||
"type": "number"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"root"
|
||||
],
|
||||
"additionalProperties": false
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
@@ -799,40 +794,27 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
}
|
||||
|
||||
int main() {
|
||||
fprintf(stderr, "LLAMA_NODE_AVAILABLE = %s\n", getenv("LLAMA_NODE_AVAILABLE") ? "true" : "false");
|
||||
fprintf(stderr, "LLAMA_PYTHON_AVAILABLE = %s\n", getenv("LLAMA_PYTHON_AVAILABLE") ? "true" : "false");
|
||||
|
||||
test_all("C++", [](const TestCase & tc) {
|
||||
try {
|
||||
tc.verify(json_schema_to_grammar(nlohmann::ordered_json::parse(tc.schema)));
|
||||
tc.verify(json_schema_to_grammar(nlohmann::json::parse(tc.schema)));
|
||||
tc.verify_status(SUCCESS);
|
||||
} catch (const std::runtime_error & ex) {
|
||||
fprintf(stderr, "Error: %s\n", ex.what());
|
||||
tc.verify_status(FAILURE);
|
||||
}
|
||||
});
|
||||
|
||||
if (getenv("LLAMA_PYTHON_AVAILABLE") || (std::system("python --version") == 0)) {
|
||||
test_all("Python", [](const TestCase & tc) {
|
||||
write("test-json-schema-input.tmp", tc.schema);
|
||||
tc.verify_status(std::system(
|
||||
"python ./examples/json-schema-to-grammar.py test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE);
|
||||
tc.verify(read("test-grammar-output.tmp"));
|
||||
});
|
||||
} else {
|
||||
fprintf(stderr, "\033[33mWARNING: Python not found, skipping Python JSON schema -> grammar tests.\n\033[0m");
|
||||
}
|
||||
|
||||
if (getenv("LLAMA_NODE_AVAILABLE") || (std::system("node --version") == 0)) {
|
||||
test_all("JavaScript", [](const TestCase & tc) {
|
||||
write("test-json-schema-input.tmp", tc.schema);
|
||||
tc.verify_status(std::system(
|
||||
"node ./tests/run-json-schema-to-grammar.mjs test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE);
|
||||
tc.verify(read("test-grammar-output.tmp"));
|
||||
});
|
||||
} else {
|
||||
fprintf(stderr, "\033[33mWARNING: Node not found, skipping JavaScript JSON schema -> grammar tests.\n\033[0m");
|
||||
}
|
||||
//test_all("Python", [](const TestCase & tc) {
|
||||
// write("test-json-schema-input.tmp", tc.schema);
|
||||
// tc.verify_status(std::system(
|
||||
// "python ./examples/json-schema-to-grammar.py test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE);
|
||||
// tc.verify(read("test-grammar-output.tmp"));
|
||||
//});
|
||||
//test_all("JavaScript", [](const TestCase & tc) {
|
||||
// write("test-json-schema-input.tmp", tc.schema);
|
||||
// tc.verify_status(std::system(
|
||||
// "node ./tests/run-json-schema-to-grammar.mjs test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE);
|
||||
// tc.verify(read("test-grammar-output.tmp"));
|
||||
//});
|
||||
|
||||
test_all("Check Expectations Validity", [](const TestCase & tc) {
|
||||
if (tc.expected_status == SUCCESS) {
|
||||
|