ggml : fix quantize_row_q4_1() ARM_NEON (close #876 )

Print model version.
Also improve model type printing, and fix indentation of an unrelated switch statement.
2026-02-26 14:23:22 +02:00 · 2023-04-10 19:29:48 +03:00 · 2023-04-10 01:10:46 +02:00 · 2023-04-10 01:10:46 +02:00 · 2023-04-08 17:49:39 +02:00 · 2023-04-08 11:15:17 +00:00
19 changed files with 1669 additions and 845 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -19,6 +19,7 @@ models/*

 /main
 /quantize
+/quantize-stats
 /result
 /perplexity
 /embedding
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -115,6 +115,7 @@ if (LLAMA_OPENBLAS)

        add_compile_definitions(GGML_USE_OPENBLAS)
        add_link_options(${BLAS_LIBRARIES})
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} openblas)
    else()
        message(WARNING "OpenBLAS not found")
    endif()
@@ -139,6 +140,7 @@ if (LLAMA_ALL_WARNINGS)
            -Wpedantic
            -Wcast-qual
            -Wno-unused-function
+            -Wno-multichar
        )
    else()
        # todo : msvc
@@ -151,6 +153,10 @@ if (LLAMA_ALL_WARNINGS)

 endif()

+if (MSVC)
+    add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
+endif()
+
 if (LLAMA_LTO)
    include(CheckIPOSupported)
    check_ipo_supported(RESULT result OUTPUT output)
@@ -240,7 +246,9 @@ endif()

 add_library(llama
            llama.cpp
-            llama.h)
+            llama.h
+            llama_internal.h
+            llama_util.h)

 target_include_directories(llama PUBLIC .)
 target_compile_features(llama PUBLIC cxx_std_11) # don't bump
--- a/11
+++ b/11
@@ -37,7 +37,7 @@ LDFLAGS  =

 # warnings
 CFLAGS   += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -Wno-unused-function
-CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
+CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar

 # OS specific
 # TODO: support Windows
@@ -142,14 +142,14 @@ default: main quantize perplexity embedding
 ggml.o: ggml.c ggml.h
 	$(CC)  $(CFLAGS)   -c ggml.c -o ggml.o

-llama.o: llama.cpp llama.h
+llama.o: llama.cpp llama.h llama_util.h llama_internal.h
 	$(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o

 common.o: examples/common.cpp examples/common.h
 	$(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o

 clean:
-	rm -vf *.o main quantize perplexity embedding
+	rm -vf *.o main quantize quantize-stats perplexity embedding

 main: examples/main/main.cpp ggml.o llama.o common.o
 	$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS)
@@ -160,12 +160,17 @@ main: examples/main/main.cpp ggml.o llama.o common.o
 quantize: examples/quantize/quantize.cpp ggml.o llama.o
 	$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp ggml.o llama.o -o quantize $(LDFLAGS)

+quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o
+	$(CXX) $(CXXFLAGS) examples/quantize-stats/quantize-stats.cpp ggml.o llama.o -o quantize-stats $(LDFLAGS)
+
 perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o
 	$(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity $(LDFLAGS)

 embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o
 	$(CXX) $(CXXFLAGS) examples/embedding/embedding.cpp ggml.o llama.o common.o -o embedding $(LDFLAGS)

+libllama.so: llama.o ggml.o
+	$(CXX) $(CXXFLAGS) -shared -fPIC -o libllama.so llama.o ggml.o $(LDFLAGS)
 #
 # Tests
 #
--- a/build.zig
+++ b/build.zig
@@ -3,12 +3,14 @@ const std = @import("std");
 pub fn build(b: *std.Build) void {
    const target = b.standardTargetOptions(.{});
    const optimize = b.standardOptimizeOption(.{});
+    const want_lto = b.option(bool, "lto", "Want -fLTO");

    const lib = b.addStaticLibrary(.{
        .name = "llama",
        .target = target,
        .optimize = optimize,
    });
+    lib.want_lto = want_lto;
    lib.linkLibCpp();
    lib.addIncludePath(".");
    lib.addIncludePath("examples");
@@ -17,11 +19,11 @@ pub fn build(b: *std.Build) void {
    }, &.{"-std=c11"});
    lib.addCSourceFiles(&.{
        "llama.cpp",
-        "examples/common.cpp",
    }, &.{"-std=c++11"});
    lib.install();

-    const build_args = .{ .b = b, .lib = lib, .target = target, .optimize = optimize };
+    const build_args = .{ .b = b, .lib = lib, .target = target, .optimize = optimize, .want_lto = want_lto };
+
    const exe = build_example("main", build_args);
    _ = build_example("quantize", build_args);
    _ = build_example("perplexity", build_args);
@@ -44,16 +46,19 @@ fn build_example(comptime name: []const u8, args: anytype) *std.build.LibExeObjS
    const lib = args.lib;
    const target = args.target;
    const optimize = args.optimize;
+    const want_lto = args.want_lto;

    const exe = b.addExecutable(.{
        .name = name,
        .target = target,
        .optimize = optimize,
    });
+    exe.want_lto = want_lto;
    exe.addIncludePath(".");
    exe.addIncludePath("examples");
    exe.addCSourceFiles(&.{
        std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{name, name}),
+        "examples/common.cpp",
    }, &.{"-std=c++11"});
    exe.linkLibrary(lib);
    exe.install();
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -31,6 +31,7 @@ if (EMSCRIPTEN)
 else()
    add_subdirectory(main)
    add_subdirectory(quantize)
+    add_subdirectory(quantize-stats)
    add_subdirectory(perplexity)
    add_subdirectory(embedding)
 endif()
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -1,7 +1,5 @@
 #include "common.h"

-#include "ggml.h"
-
 #include <cassert>
 #include <cstring>
 #include <fstream>
@@ -16,12 +14,19 @@
 #endif

 #if defined (_WIN32)
+#include <fcntl.h>
+#include <io.h>
 #pragma comment(lib,"kernel32.lib")
 extern "C" __declspec(dllimport) void* __stdcall GetStdHandle(unsigned long nStdHandle);
 extern "C" __declspec(dllimport) int __stdcall GetConsoleMode(void* hConsoleHandle, unsigned long* lpMode);
 extern "C" __declspec(dllimport) int __stdcall SetConsoleMode(void* hConsoleHandle, unsigned long dwMode);
 extern "C" __declspec(dllimport) int __stdcall SetConsoleCP(unsigned int wCodePageID);
 extern "C" __declspec(dllimport) int __stdcall SetConsoleOutputCP(unsigned int wCodePageID);
+extern "C" __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int CodePage, unsigned long dwFlags, 
+                                                                   const wchar_t * lpWideCharStr, int cchWideChar, 
+                                                                   char * lpMultiByteStr, int cbMultiByte, 
+                                                                   const char * lpDefaultChar, bool * lpUsedDefaultChar);
+#define CP_UTF8 65001
 #endif

 bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
@@ -154,6 +159,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
            params.use_color = true;
        } else if (arg == "--mlock") {
            params.use_mlock = true;
+        } else if (arg == "--no-mmap") {
+            params.use_mmap = false;
        } else if (arg == "--mtest") {
            params.mem_test = true;
        } else if (arg == "--verbose-prompt") {
@@ -233,9 +240,12 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    fprintf(stderr, "  -b N, --batch_size N  batch size for prompt processing (default: %d)\n", params.n_batch);
    fprintf(stderr, "  --perplexity          compute perplexity over the prompt\n");
    fprintf(stderr, "  --keep                number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
-    if (ggml_mlock_supported()) {
+    if (llama_mlock_supported()) {
        fprintf(stderr, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
    }
+    if (llama_mmap_supported()) {
+        fprintf(stderr, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
+    }
    fprintf(stderr, "  --mtest               compute maximum memory usage\n");
    fprintf(stderr, "  --verbose-prompt      print prompt before generation\n");
    fprintf(stderr, "  -m FNAME, --model FNAME\n");
@@ -307,12 +317,20 @@ void win32_console_init(bool enable_color) {
            SetConsoleMode(hConOut, dwMode | 0x4); // ENABLE_VIRTUAL_TERMINAL_PROCESSING (0x4)
        }
        // Set console output codepage to UTF8
-        SetConsoleOutputCP(65001); // CP_UTF8
+        SetConsoleOutputCP(CP_UTF8);
    }
    void* hConIn = GetStdHandle((unsigned long)-10); // STD_INPUT_HANDLE (-10)
    if (hConIn && hConIn != (void*)-1 && GetConsoleMode(hConIn, &dwMode)) {
-        // Set console input codepage to UTF8
-        SetConsoleCP(65001); // CP_UTF8
+        // Set console input codepage to UTF16
+        _setmode(_fileno(stdin), _O_WTEXT);
    }
 }
+
+// Convert a wide Unicode string to an UTF8 string
+void win32_utf8_encode(const std::wstring & wstr, std::string & str) {
+	int size_needed = WideCharToMultiByte(CP_UTF8, 0, &wstr[0], (int)wstr.size(), NULL, 0, NULL, NULL);
+	std::string strTo(size_needed, 0);
+	WideCharToMultiByte(CP_UTF8, 0, &wstr[0], (int)wstr.size(), &strTo[0], size_needed, NULL, NULL);
+	str = strTo;
+}
 #endif
--- a/examples/common.h
+++ b/examples/common.h
@@ -47,6 +47,7 @@ struct gpt_params {
    bool instruct          = false; // instruction mode (used for Alpaca models)
    bool ignore_eos        = false; // do not stop generating after eos
    bool perplexity        = false; // compute perplexity over the prompt
+    bool use_mmap          = true;  // use mmap for faster loads
    bool use_mlock         = false; // use mlock to keep model in memory
    bool mem_test          = false; // compute maximum memory usage
    bool verbose_prompt    = false; // print prompt tokens before generation
@@ -92,4 +93,5 @@ void set_console_color(console_state & con_st, console_color_t color);

 #if defined (_WIN32)
 void win32_console_init(bool enable_color);
+void win32_utf8_encode(const std::wstring & wstr, std::string & str);
 #endif
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -38,6 +38,7 @@ int main(int argc, char ** argv) {
        lparams.seed       = params.seed;
        lparams.f16_kv     = params.memory_f16;
        lparams.logits_all = params.perplexity;
+        lparams.use_mmap   = params.use_mmap;
        lparams.use_mlock  = params.use_mlock;
        lparams.embedding  = params.embedding;

--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -97,6 +97,7 @@ int main(int argc, char ** argv) {
        lparams.n_parts    = params.n_parts;
        lparams.seed       = params.seed;
        lparams.f16_kv     = params.memory_f16;
+        lparams.use_mmap   = params.use_mmap;
        lparams.use_mlock  = params.use_mlock;

        ctx = llama_init_from_file(params.model.c_str(), lparams);
@@ -386,10 +387,19 @@ int main(int argc, char ** argv) {
                std::string line;
                bool another_line = true;
                do {
+#if defined(_WIN32)
+                    std::wstring wline;
+                    if (!std::getline(std::wcin, wline)) {
+                        // input stream is bad or EOF received
+                        return 0;
+                    }
+                    win32_utf8_encode(wline, line);
+#else
                    if (!std::getline(std::cin, line)) {
                        // input stream is bad or EOF received
                        return 0;
                    }
+#endif
                    if (line.empty() || line.back() != '\\') {
                        another_line = false;
                    } else {
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -115,6 +115,7 @@ int main(int argc, char ** argv) {
        lparams.seed       = params.seed;
        lparams.f16_kv     = params.memory_f16;
        lparams.logits_all = params.perplexity;
+        lparams.use_mmap   = params.use_mmap;
        lparams.use_mlock  = params.use_mlock;
        lparams.embedding  = params.embedding;

--- a/examples/quantize-stats/CMakeLists.txt
+++ b/examples/quantize-stats/CMakeLists.txt
@@ -0,0 +1,4 @@
+set(TARGET quantize-stats)
+add_executable(${TARGET} quantize-stats.cpp)
+target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -0,0 +1,354 @@
+#include "ggml.h"
+#include "llama.h"
+#include "llama_internal.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cinttypes>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <map>
+#include <numeric>
+#include <regex>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+static const char * type_strs[] = { "q4_0", "q4_1", "i8", "i16", "i32", "f16", "f32"  };
+static_assert(sizeof(type_strs) == GGML_TYPE_COUNT * sizeof(char *), "Incomplete type list");
+
+struct quantize_stats_params {
+    std::string model = "models/7B/ggml-model-f16.bin";
+    bool verbose = false;
+    bool per_layer_stats = false;
+    bool print_histogram = false;
+    bool reference = false;
+    std::vector<std::string> include_layers;
+    std::vector<std::string> exclude_layers;
+    std::vector<enum ggml_type> include_types;
+};
+
+const int64_t SCRATCH_ELEMENTS = 32*32;
+const size_t HISTOGRAM_BUCKETS = 150;
+const double HISTOGRAM_RANGE = 0.03;
+
+struct error_stats {
+    size_t num_samples;
+    double total_error;
+    double max_error;
+    uint64_t error_histogram[HISTOGRAM_BUCKETS];
+};
+
+
+void quantize_stats_print_usage(int /*argc*/, char ** argv) {
+    quantize_stats_params params;
+    fprintf(stderr, "usage: %s [options]\n", argv[0]);
+    fprintf(stderr, "\n");
+    fprintf(stderr, "options:\n");
+    fprintf(stderr, "  -h, --help            show this help message and exit\n");
+    fprintf(stderr, "  -m FNAME, --model FNAME\n");
+    fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
+    fprintf(stderr, "  -r, --reference\n");
+    fprintf(stderr, "                        use reference implementation (default: false)\n");
+    fprintf(stderr, "  -v, --verbose\n");
+    fprintf(stderr, "                        verbose output (default: false)\n");
+    fprintf(stderr, "  -p, --per-layer-stats\n");
+    fprintf(stderr, "                        print stats per layer (default: false)\n");
+    fprintf(stderr, "  --histogram\n");
+    fprintf(stderr, "                        print error histogram (default: false)\n");
+    fprintf(stderr, "  -l LAYER, --include-layer LAYER\n");
+    fprintf(stderr, "                        only test layers matching pattern\n");
+    fprintf(stderr, "  -L LAYER, --exclude-layer LAYER\n");
+    fprintf(stderr, "                        exclude layers matching pattern\n");
+    fprintf(stderr, "  -t TYPE, --type TYPE\n");
+    fprintf(stderr, "                        only test given type (q4_0, q4_1)\n");
+    fprintf(stderr, "\n");
+}
+
+// Check if a layer is included/excluded by command line
+bool layer_included(const quantize_stats_params params, const std::string & layer) {
+    for (const auto& excluded : params.exclude_layers) {
+        if (std::regex_search(layer, std::regex(excluded))) {
+            return false;
+        }
+    }
+    for (const auto& included : params.include_layers) {
+        if (std::regex_search(layer, std::regex(included))) {
+            return true;
+        }
+    }
+    return params.include_layers.empty();
+}
+
+// Update error statistics given vectors with the before/after result of quantization
+void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) {
+    for (int64_t i = 0; i < nelements; i++) {
+        double diff = input[i] - output[i];
+        stats.total_error += diff * diff;
+        stats.max_error = fmax(fabs(diff), stats.max_error);
+        stats.error_histogram[std::max(std::min((size_t) floor(fabs(diff) / HISTOGRAM_RANGE * HISTOGRAM_BUCKETS), HISTOGRAM_BUCKETS-1), (size_t) 0)]++;
+    }
+    stats.num_samples += nelements;
+}
+
+double find_quantile(const error_stats & stats, double quantile) {
+    double sum = std::accumulate(std::begin(stats.error_histogram), std::end(stats.error_histogram), 0.0);
+
+    double accum = 0;
+    for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) {
+        accum += stats.error_histogram[i];
+        if (accum >= sum*quantile) {
+            return (i+1) * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
+        }
+    }
+    return INFINITY;
+}
+
+void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) {
+    double rmse = sqrt(stats.total_error / (double) stats.num_samples);
+    double median = find_quantile(stats, .5);
+    double pct95 = find_quantile(stats, .95);
+    printf("%-50s: rmse %.8f, maxerr %.8f, 95pct<%.4f, median<%.4f\n", name.c_str(), rmse, stats.max_error, pct95, median);
+    if (print_histogram) {
+        printf("Error distribution:\n");
+        for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) {
+            double lower = i * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
+            double upper = (i+1) * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
+            if (i == HISTOGRAM_BUCKETS -1) upper = INFINITY;
+            printf("[%3.4f, %3.4f): %11" PRIu64 "\n", lower, upper, stats.error_histogram[i]);
+        }
+    }
+}
+
+// copied from ggml.h - verify that we can access this as a flat array
+static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return
+        tensor->nb[0] == ggml_type_size(tensor->type) &&
+        tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
+        tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
+        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
+}
+
+// Run quantization function for a single layer and update error stats
+void test_roundtrip_on_layer(
+        std::string & name,
+        bool print_layer_stats,
+        const quantize_fns_t & qfns,
+        bool use_reference,
+        const ggml_tensor * layer,
+        float * input_scratch,
+        char *quantized_scratch,
+        float * output_scratch,
+        error_stats & total_error) {
+
+    assert(tensor_is_contiguous(layer));
+    error_stats layer_error {};
+    int64_t nelements = ggml_nelements(layer);
+
+    for (int64_t offset = 0; offset < nelements; offset += SCRATCH_ELEMENTS) {
+        int64_t chunk_size = std::min(SCRATCH_ELEMENTS, nelements - offset);
+
+        if (layer->type == GGML_TYPE_F16) {
+            for (int i = 0; i < chunk_size; i++) {
+                input_scratch[i] = ggml_get_f32_1d(layer, i + offset);
+            }
+        } else {
+            input_scratch = ggml_get_data_f32(layer) + offset;
+        }
+
+        if (use_reference) {
+            qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size);
+        } else {
+            qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size);
+        }
+        qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size);
+
+        update_error_stats(chunk_size, input_scratch, output_scratch, total_error);
+        if (print_layer_stats) {
+            update_error_stats(chunk_size, input_scratch, output_scratch, layer_error);
+        }
+    }
+    if (print_layer_stats) {
+        print_error_stats(name, layer_error, false);
+    }
+}
+
+int main(int argc, char ** argv) {
+    ggml_time_init();
+
+    quantize_stats_params params;
+
+    // read command line
+
+    bool invalid_param = false;
+    std::string arg;
+    for (int i = 1; i < argc; i++) {
+        arg = argv[i];
+
+        if (arg == "-h" || arg == "--help") {
+            quantize_stats_print_usage(argc, argv);
+            exit(0);
+        } else if (arg == "-r" || arg == "--reference") {
+            params.reference = true;
+        } else if (arg == "-v") {
+            params.verbose = true;
+        } else if (arg == "-p" || arg == "--per-layer-stats") {
+            params.per_layer_stats = true;
+        } else if (arg == "--histogram") {
+            params.print_histogram = true;
+        } else if (arg == "-m" || arg == "--model") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.model = argv[i];
+        } else if (arg == "-l" || arg == "--include-layer") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.include_layers.push_back(argv[i]);
+        } else if (arg == "-L" || arg == "--exclude-layer") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.exclude_layers.push_back(argv[i]);
+        } else if (arg == "-t" || arg == "--type") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            int j;
+            for (j = 0; j < GGML_TYPE_COUNT && strcmp(argv[i], type_strs[j]) != 0; j++) {
+                // find match
+            }
+            if (j < GGML_TYPE_COUNT) {
+                params.include_types.push_back((ggml_type) j);
+            } else {
+                fprintf(stderr, "error: %s not in list of types\n", argv[i]);
+                invalid_param = true;
+            }
+        } else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            quantize_stats_print_usage(argc, argv);
+            return 1;
+        }
+    }
+    if (invalid_param) {
+        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+        quantize_stats_print_usage(argc, argv);
+        return 1;
+    }
+
+    // load the model
+    fprintf(stderr, "Loading model\n");
+
+    const int64_t t_main_start_us = ggml_time_us();
+    llama_context * ctx;
+
+    {
+        auto lparams = llama_context_default_params();
+
+        lparams.n_ctx      = 256;
+        lparams.n_parts    = 1;
+        lparams.seed       = 1;
+        lparams.f16_kv     = false;
+        lparams.use_mlock  = false;
+
+        ctx = llama_init_from_file(params.model.c_str(), lparams);
+
+        if (ctx == NULL) {
+            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
+            return 1;
+        }
+    }
+
+    const auto &tensors = llama_internal_get_tensor_map(ctx);
+
+    // check layer tensors
+    int included_layers = 0;
+    int64_t max_nelements = 0;
+    bool is_f16 = false;
+    for (const auto& kv_tensor : tensors) {
+        if (!layer_included(params, kv_tensor.first)) {
+            continue;
+        }
+        if (params.verbose) {
+            printf("%s: type %s, size %" PRId64 "\n", kv_tensor.first.c_str(), type_strs[kv_tensor.second->type], ggml_nelements(kv_tensor.second));
+        }
+        if (kv_tensor.second->type == GGML_TYPE_F16) {
+            is_f16 = true;
+        } else if (kv_tensor.second->type != GGML_TYPE_F32) {
+            fprintf(stderr, "%s: error: Quantization should be tested with a float model, "
+                "this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type);
+            llama_free(ctx);
+            return 1;
+        }
+        included_layers++;
+        max_nelements = std::max(max_nelements, ggml_nelements(kv_tensor.second));
+    }
+
+    if (is_f16) {
+        printf("note: source model is f16\n");
+    }
+    printf("testing %d layers with max size %" PRId64 "\n", included_layers, max_nelements);
+    // allocate scratch space
+    std::vector<float> input_scratch(SCRATCH_ELEMENTS);
+    std::vector<char> quantized_scratch(SCRATCH_ELEMENTS*4);
+    std::vector<float> output_scratch(SCRATCH_ELEMENTS);
+
+    // loop throught quantization types
+    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
+        if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
+            continue;
+        }
+        quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
+        if (qfns.quantize_row_q && qfns.dequantize_row_q) {
+            if (params.verbose) {
+                printf("testing %s ...\n",  type_strs[i]);
+            }
+
+            error_stats global_stats {};
+
+            for (const auto& kv_tensor : tensors) {
+                if (!layer_included(params, kv_tensor.first)) {
+                    continue;
+                }
+                if (params.verbose) {
+                    printf("  %s ...\n",  kv_tensor.first.c_str());
+                }
+                std::string layer_name { type_strs[i] };
+                layer_name += "::" + kv_tensor.first;
+                test_roundtrip_on_layer(
+                        layer_name,
+                        params.per_layer_stats,
+                        qfns,
+                        params.reference,
+                        kv_tensor.second,
+                        input_scratch.data(),
+                        quantized_scratch.data(),
+                        output_scratch.data(),
+                        global_stats
+                );
+            }
+
+            print_error_stats(type_strs[i], global_stats, params.print_histogram);
+        }
+    }
+
+
+    llama_free(ctx);
+    // report timing
+    {
+        const int64_t t_main_end_us = ggml_time_us();
+
+        printf("\n");
+        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0);
+    }
+
+    return 0;
+}
--- a/flake.nix
+++ b/flake.nix
@@ -30,6 +30,9 @@
            mkdir -p $out/bin
            mv bin/main $out/bin/llama
            mv bin/quantize $out/bin/quantize
+            mv bin/embedding $out/bin/embedding
+            mv bin/perplexity $out/bin/perplexity
+
            echo "#!${llama-python}/bin/python" > $out/bin/convert-pth-to-ggml
            cat ${./convert-pth-to-ggml.py} >> $out/bin/convert-pth-to-ggml
            chmod +x $out/bin/convert-pth-to-ggml
--- a/ggml.c
+++ b/ggml.c
@@ -97,17 +97,6 @@ typedef void* thread_ret_t;
 #define static_assert(cond, msg) _Static_assert(cond, msg)
 #endif

-#define GGML_MLOCK_SUPPORT 0
-
-#ifdef __has_include
-    #if __has_include(<sys/mman.h>)
-        #undef GGML_MLOCK_SUPPORT
-        #define GGML_MLOCK_SUPPORT 1
-        #include <sys/mman.h>
-    #endif
-#endif
-
-
 /*#define GGML_PERF*/
 #define GGML_DEBUG 0
 #define GGML_GELU_FP16
@@ -610,10 +599,7 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
        for (int l = 0; l < 2; l++) amaxv[4*l] = vmaxq_f32(amaxv[4*l], amaxv[4*l+2]);
        for (int l = 0; l < 1; l++) amaxv[8*l] = vmaxq_f32(amaxv[8*l], amaxv[8*l+4]);

-        // absolute max
-        const float amax = MAX(
-                MAX(vgetq_lane_f32(amaxv[0], 0), vgetq_lane_f32(amaxv[0], 1)),
-                MAX(vgetq_lane_f32(amaxv[0], 2), vgetq_lane_f32(amaxv[0], 3)));
+        const float amax = vmaxvq_f32(amaxv[0]);

        const float d = amax / ((1 << 3) - 1);
        const float id = d ? 1.0f/d : 0.0f;
@@ -935,7 +921,7 @@ static void quantize_row_q4_1(const float * restrict x, void * restrict vy, int
        float32x4_t minv[8];
        float32x4_t maxv[8];

-        for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(x + i*32 + 4*l);
+        for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(x + i*QK + 4*l);

        for (int l = 0; l < 4; l++) minv[2*l] = vminq_f32(srcv[2*l], srcv[2*l + 1]);
        for (int l = 0; l < 2; l++) minv[4*l] = vminq_f32(minv[4*l], minv[4*l + 2]);
@@ -958,7 +944,8 @@ static void quantize_row_q4_1(const float * restrict x, void * restrict vy, int

        for (int l = 0; l < 8; l++) {
            const float32x4_t v  = vmulq_n_f32(vsubq_f32(srcv[l], minv0), id);
-            const int32x4_t   vi = vcvtq_s32_f32(v);
+            const float32x4_t vf = vaddq_f32(v, vdupq_n_f32(0.5f)); // needed to round to nearest
+            const int32x4_t   vi = vcvtq_s32_f32(vf);

            y[i].qs[2*l + 0] = vgetq_lane_s32(vi, 0) | (vgetq_lane_s32(vi, 1) << 4);
            y[i].qs[2*l + 1] = vgetq_lane_s32(vi, 2) | (vgetq_lane_s32(vi, 3) << 4);
@@ -2690,21 +2677,6 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {

 static_assert(GGML_OP_COUNT == 35, "GGML_OP_COUNT != 35");

-//
-// ggml object
-//
-
-struct ggml_object {
-    size_t offs;
-    size_t size;
-
-    struct ggml_object * next;
-
-    char padding[8];
-};
-
-static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
-
 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
 static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");

@@ -2716,7 +2688,6 @@ struct ggml_context {
    size_t mem_size;
    void * mem_buffer;
    bool   mem_buffer_owned;
-    bool   mem_buffer_mlocked;
    bool   no_alloc;

    int    n_objects;
@@ -3003,7 +2974,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
        /*.mem_size           =*/ params.mem_size,
        /*.mem_buffer         =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size),
        /*.mem_buffer_owned   =*/ params.mem_buffer ? false : true,
-        /*.mem_buffer_mlocked =*/ false,
        /*.no_alloc           =*/ params.no_alloc,
        /*.n_objects          =*/ 0,
        /*.objects_begin      =*/ NULL,
@@ -3036,14 +3006,6 @@ void ggml_free(struct ggml_context * ctx) {
            GGML_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n",
                    __func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size);

-#if GGML_MLOCK_SUPPORT
-            if (ctx->mem_buffer_mlocked) {
-                if (munlock(ctx->mem_buffer, ctx->mem_size)) {
-                    fprintf(stderr, "%s: failed to munlock buffer: %s\n", __func__, strerror(errno));
-                }
-            }
-#endif
-
            if (ctx->mem_buffer_owned) {
                free(ctx->mem_buffer);
            }
@@ -3072,48 +3034,6 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch)
    return result;
 }

-#ifdef __APPLE__
-#define MLOCK_SUGGESTION \
-    "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
-    "decreasing 'vm.global_no_user_wire_amount'.  Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
-#else
-#define MLOCK_SUGGESTION \
-    "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
-#endif
-
-bool ggml_mlock_supported(void) {
-    return GGML_MLOCK_SUPPORT;
-}
-
-bool ggml_mlock(
-        struct ggml_context * ctx,
-        const void *opt_extra_addr,
-        size_t opt_extra_len,
-        char **err_p) {
-    // TODO: Use SetProcessWorkingSetSize() + VirtualLock() on WIN32
-#if GGML_MLOCK_SUPPORT
-    if (ctx->mem_buffer_mlocked) {
-        return true;
-    }
-    if (mlock(ctx->mem_buffer, ctx->mem_size) ||
-        (opt_extra_len &&
-         mlock(opt_extra_addr, opt_extra_len))) {
-        if ((*err_p = malloc(1024))) {
-            snprintf(*err_p, 1024,
-                     "failed to mlock %zu-byte buffer: %s\n" MLOCK_SUGGESTION,
-                     ctx->mem_size + opt_extra_len,
-                     strerror(errno));
-        }
-        return false;
-    }
-    ctx->mem_buffer_mlocked = true;
-    return true;
-#else // GGML_MLOCK_SUPPORT
-    *err_p = strdup("can't mlock because it's not supported on this system");
-    return false;
-#endif // GGML_MLOCK_SUPPORT
-}
-
 ////////////////////////////////////////////////////////////////////////////////

 struct ggml_tensor * ggml_new_tensor_impl(
@@ -6564,29 +6484,27 @@ static void ggml_compute_forward_mul_mat_f16_f32(
    //}
 }

-typedef void (*dequantize_row_q_t)(const void * restrict x, float * restrict y, int k);
-typedef void (*quantize_row_q_t)(const float * restrict x, void * restrict y, int k);
-typedef void (*vec_dot_q_t)(const int n, float * restrict s, const void * restrict x, const void * restrict y);
-
-typedef struct {
-    dequantize_row_q_t dequantize_row_q;
-    quantize_row_q_t   quantize_row_q;
-    vec_dot_q_t        vec_dot_q;
-} quantize_fns_t;
-
 static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
    [GGML_TYPE_Q4_0] = {
-        .dequantize_row_q = dequantize_row_q4_0,
-        .quantize_row_q   = quantize_row_q4_0,
-        .vec_dot_q        = ggml_vec_dot_q4_0,
+        .dequantize_row_q         = dequantize_row_q4_0,
+        .quantize_row_q           = quantize_row_q4_0,
+        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0_reference,
+        .vec_dot_q                = ggml_vec_dot_q4_0,
    },
    [GGML_TYPE_Q4_1] = {
-        .dequantize_row_q = dequantize_row_q4_1,
-        .quantize_row_q   = quantize_row_q4_1,
-        .vec_dot_q        = ggml_vec_dot_q4_1,
+        .dequantize_row_q         = dequantize_row_q4_1,
+        .quantize_row_q           = quantize_row_q4_1,
+        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference,
+        .vec_dot_q                = ggml_vec_dot_q4_1,
    },
 };

+// For internal test use
+quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
+    GGML_ASSERT(i < GGML_TYPE_COUNT);
+    return quantize_fns[i];
+}
+
 static void ggml_compute_forward_mul_mat_q_f32(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
--- a/ggml.h
+++ b/ggml.h
@@ -253,6 +253,19 @@ enum ggml_op {
    GGML_OP_COUNT,
 };

+
+// ggml object
+struct ggml_object {
+    size_t offs;
+    size_t size;
+
+    struct ggml_object * next;
+
+    char padding[8];
+};
+
+static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
+
 // n-dimensional tensor
 struct ggml_tensor {
    enum ggml_type type;
@@ -344,13 +357,6 @@ size_t ggml_used_mem(const struct ggml_context * ctx);

 size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);

-bool ggml_mlock_supported(void);
-bool ggml_mlock(
-        struct ggml_context * ctx,
-        const void *opt_extra_addr,
-        size_t opt_extra_len,
-        char **err_p);
-
 struct ggml_tensor * ggml_new_tensor(
        struct ggml_context * ctx,
        enum   ggml_type type,
@@ -783,6 +789,30 @@ int ggml_cpu_has_blas(void);
 int ggml_cpu_has_sse3(void);
 int ggml_cpu_has_vsx(void);

+
+//
+// Internal types and functions exposed for tests and benchmarks
+//
+
+#ifdef  __cplusplus
+// restrict not standard in C++
+#define GGML_RESTRICT
+#else
+#define GGML_RESTRICT restrict
+#endif
+typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+typedef void (*quantize_row_q_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+typedef void (*vec_dot_q_t)(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
+
+typedef struct {
+    dequantize_row_q_t dequantize_row_q;
+    quantize_row_q_t   quantize_row_q;
+    quantize_row_q_t   quantize_row_q_reference;
+    vec_dot_q_t        vec_dot_q;
+} quantize_fns_t;
+
+quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
+
 #ifdef  __cplusplus
 }
 #endif
--- a/llama.cpp
+++ b/llama.cpp
--- a/llama.h
+++ b/llama.h
@@ -55,6 +55,7 @@ extern "C" {
        bool f16_kv;     // use fp16 for KV cache
        bool logits_all; // the llama_eval() call computes all logits, not just the last one
        bool vocab_only; // only load the vocabulary, no weights
+        bool use_mmap;   // use mmap if possible
        bool use_mlock;  // force system to keep model in RAM
        bool embedding;  // embedding mode only

@@ -66,6 +67,9 @@ extern "C" {

    LLAMA_API struct llama_context_params llama_context_default_params();

+    LLAMA_API bool llama_mmap_supported();
+    LLAMA_API bool llama_mlock_supported();
+
    // Various functions for loading a ggml llama model.
    // Allocate (almost) all memory needed for the model.
    // Return NULL on failure
@@ -166,4 +170,4 @@ extern "C" {
 }
 #endif

-#endif
+#endif // LLAMA_H
--- a/llama_internal.h
+++ b/llama_internal.h
@@ -0,0 +1,12 @@
+// Internal header to be included by llama.cpp and tests/benchmarks only.
+
+#ifndef LLAMA_INTERNAL_H
+#define LLAMA_INTERNAL_H
+
+#include <vector>
+#include <string>
+struct ggml_tensor;
+
+std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
+
+#endif // LLAMA_INTERNAL_H
--- a/llama_util.h
+++ b/llama_util.h
@@ -0,0 +1,383 @@
+// Internal header to be included only by llama.cpp.
+// Contains wrappers around OS interfaces.
+
+#ifndef LLAMA_UTIL_H
+#define LLAMA_UTIL_H
+
+#include <cstdio>
+#include <cstdint>
+#include <cerrno>
+#include <cstring>
+#include <cstdarg>
+#include <cstdlib>
+#include <climits>
+
+#include <string>
+#include <vector>
+
+#ifdef __has_include
+    #if __has_include(<unistd.h>)
+        #include <unistd.h>
+        #if defined(_POSIX_MAPPED_FILES)
+            #include <sys/mman.h>
+        #endif
+    #endif
+#endif
+
+#if defined(_WIN32)
+    #define WIN32_LEAN_AND_MEAN
+    #define NOMINMAX
+    #include <windows.h>
+    #include <io.h>
+    #include <stdio.h> // for _fseeki64
+#endif
+
+#define LLAMA_ASSERT(x) \
+    do { \
+        if (!(x)) { \
+            fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
+            abort(); \
+        } \
+    } while (0)
+
+#ifdef __GNUC__
+__attribute__((format(printf, 1, 2)))
+#endif
+static std::string format(const char * fmt, ...) {
+    va_list ap, ap2;
+    va_start(ap, fmt);
+    va_copy(ap2, ap);
+    int size = vsnprintf(NULL, 0, fmt, ap);
+    LLAMA_ASSERT(size >= 0 && size < INT_MAX);
+    std::vector<char> buf(size + 1);
+    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+    LLAMA_ASSERT(size2 == size);
+    va_end(ap2);
+    va_end(ap);
+    return std::string(buf.data(), size);
+};
+
+struct llama_file {
+    // use FILE * so we don't have to re-open the file to mmap
+    FILE * fp;
+    size_t size;
+
+    llama_file(const char * fname, const char * mode) {
+        fp = std::fopen(fname, mode);
+        if (fp == NULL) {
+            throw format("failed to open %s: %s", fname, std::strerror(errno));
+        }
+        seek(0, SEEK_END);
+        size = tell();
+        seek(0, SEEK_SET);
+    }
+
+    size_t tell() const {
+#ifdef _WIN32
+        __int64 ret = _ftelli64(fp);
+#else
+        long ret = std::ftell(fp);
+#endif
+        LLAMA_ASSERT(ret != -1); // this really shouldn't fail
+        return (size_t) ret;
+    }
+
+    void seek(size_t offset, int whence) {
+#ifdef _WIN32
+        int ret = _fseeki64(fp, (__int64) offset, whence);
+#else
+        int ret = std::fseek(fp, (long) offset, whence);
+#endif
+        LLAMA_ASSERT(ret == 0); // same
+    }
+
+    void read_raw(void * ptr, size_t size) {
+        if (size == 0) {
+            return;
+        }
+        errno = 0;
+        std::size_t ret = std::fread(ptr, size, 1, fp);
+        if (ferror(fp)) {
+            throw format("read error: %s", strerror(errno));
+        }
+        if (ret != 1) {
+            throw std::string("unexpectedly reached end of file");
+        }
+    }
+
+    std::uint32_t read_u32() {
+        std::uint32_t ret;
+        read_raw(&ret, sizeof(ret));
+        return ret;
+    }
+
+    std::string read_string(std::uint32_t len) {
+        std::vector<char> chars(len);
+        read_raw(chars.data(), len);
+        return std::string(chars.data(), len);
+    }
+
+    void write_raw(const void * ptr, size_t size) {
+        if (size == 0) {
+            return;
+        }
+        errno = 0;
+        size_t ret = std::fwrite(ptr, size, 1, fp);
+        if (ret != 1) {
+            throw format("write error: %s", strerror(errno));
+        }
+    }
+
+    void write_u32(std::uint32_t val) {
+        write_raw(&val, sizeof(val));
+    }
+
+    ~llama_file() {
+        if (fp) {
+            std::fclose(fp);
+        }
+    }
+};
+
+#if defined(_WIN32)
+static std::string llama_format_win_err(DWORD err) {
+    LPSTR buf;
+    size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
+                                 NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
+    if (!size) {
+        return "FormatMessageA failed";
+    }
+    std::string ret(buf, size);
+    LocalFree(buf);
+    return ret;
+}
+#endif
+
+struct llama_mmap {
+    void * addr;
+    size_t size;
+
+    llama_mmap(const llama_mmap &) = delete;
+
+#ifdef _POSIX_MAPPED_FILES
+    static constexpr bool SUPPORTED = true;
+
+    llama_mmap(struct llama_file * file) {
+        size = file->size;
+        int fd = fileno(file->fp);
+        int flags = MAP_SHARED;
+#ifdef __linux__
+        flags |= MAP_POPULATE;
+#endif
+        addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
+        close(fd);
+        if (addr == MAP_FAILED) {
+            throw format("mmap failed: %s", strerror(errno));
+        }
+
+        // Advise the kernel to preload the mapped memory
+        if (madvise(addr, file->size, MADV_WILLNEED)) {
+            fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
+                    strerror(errno));
+        }
+    }
+
+    ~llama_mmap() {
+        munmap(addr, size);
+    }
+#elif defined(_WIN32)
+    static constexpr bool SUPPORTED = true;
+
+    llama_mmap(struct llama_file * file) {
+        size = file->size;
+
+        HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
+
+        HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
+        DWORD error = GetLastError();
+        CloseHandle(hFile);
+
+        if (hMapping == NULL) {
+            throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
+        }
+
+        addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
+        error = GetLastError();
+        CloseHandle(hMapping);
+
+        if (addr == NULL) {
+            throw format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str());
+        }
+
+        // Advise the kernel to preload the mapped memory
+        WIN32_MEMORY_RANGE_ENTRY range;
+        range.VirtualAddress = addr;
+        range.NumberOfBytes = (SIZE_T)size;
+        if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
+            fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
+                    llama_format_win_err(GetLastError()).c_str());
+        }
+    }
+
+    ~llama_mmap() {
+        if (!UnmapViewOfFile(addr)) {
+            fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
+                    llama_format_win_err(GetLastError()).c_str());
+        }
+    }
+#else
+    static constexpr bool SUPPORTED = false;
+
+    llama_mmap(struct llama_file *) {
+        throw std::string("mmap not supported");
+    }
+#endif
+};
+
+// Represents some region of memory being locked using mlock or VirtualLock;
+// will automatically unlock on destruction.
+struct llama_mlock {
+    void * addr = NULL;
+    size_t size = 0;
+    bool failed_already = false;
+
+    llama_mlock() {}
+    llama_mlock(const llama_mlock &) = delete;
+
+    ~llama_mlock() {
+        if (size) {
+            raw_unlock(addr, size);
+        }
+    }
+
+    void init(void * addr) {
+        LLAMA_ASSERT(this->addr == NULL && this->size == 0);
+        this->addr = addr;
+    }
+
+    void grow_to(size_t target_size) {
+        LLAMA_ASSERT(addr);
+        if (failed_already) {
+            return;
+        }
+        size_t granularity = lock_granularity();
+        target_size = (target_size + granularity - 1) & ~(granularity - 1);
+        if (target_size > size) {
+            if (raw_lock((uint8_t *) addr + size, target_size - size)) {
+                size = target_size;
+            } else {
+                failed_already = true;
+            }
+        }
+    }
+
+#ifdef _POSIX_MEMLOCK_RANGE
+    static constexpr bool SUPPORTED = true;
+
+    size_t lock_granularity() {
+        return (size_t) sysconf(_SC_PAGESIZE);
+    }
+
+    #ifdef __APPLE__
+        #define MLOCK_SUGGESTION \
+            "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
+            "decreasing 'vm.global_no_user_wire_amount'.  Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
+    #else
+        #define MLOCK_SUGGESTION \
+            "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
+    #endif
+
+    bool raw_lock(const void * addr, size_t size) {
+        if (!mlock(addr, size)) {
+            return true;
+        } else {
+            fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n" MLOCK_SUGGESTION,
+                    size, this->size, std::strerror(errno));
+            return false;
+        }
+    }
+
+    #undef MLOCK_SUGGESTION
+
+    void raw_unlock(void * addr, size_t size) {
+        if (munlock(addr, size)) {
+            fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno));
+        }
+    }
+#elif defined(_WIN32)
+    static constexpr bool SUPPORTED = true;
+
+    size_t lock_granularity() {
+        SYSTEM_INFO si;
+        GetSystemInfo(&si);
+        return (size_t) si.dwPageSize;
+    }
+
+    bool raw_lock(void * addr, size_t size) {
+        for (int tries = 1; ; tries++) {
+            if (VirtualLock(addr, size)) {
+                return true;
+            }
+            if (tries == 2) {
+                fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
+                        size, this->size, llama_format_win_err(GetLastError()).c_str());
+                return false;
+            }
+
+            // It failed but this was only the first try; increase the working
+            // set size and try again.
+            SIZE_T min_ws_size, max_ws_size;
+            if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
+                fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n",
+                        llama_format_win_err(GetLastError()).c_str());
+                return false;
+            }
+            // Per MSDN: "The maximum number of pages that a process can lock
+            // is equal to the number of pages in its minimum working set minus
+            // a small overhead."
+            // Hopefully a megabyte is enough overhead:
+            size_t increment = size + 1048576;
+            // The minimum must be <= the maximum, so we need to increase both:
+            min_ws_size += size;
+            max_ws_size += size;
+            if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
+                fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
+                        llama_format_win_err(GetLastError()).c_str());
+                return false;
+            }
+        }
+    }
+
+    void raw_unlock(void * addr, size_t size) {
+        if (!VirtualUnlock(addr, size)) {
+            fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
+                    llama_format_win_err(GetLastError()).c_str());
+        }
+    }
+#else
+    static constexpr bool SUPPORTED = false;
+
+    void raw_lock(const void * addr, size_t size) {
+        fprintf(stderr, "warning: mlock not supported on this system\n");
+    }
+
+    void raw_unlock(const void * addr, size_t size) {}
+#endif
+};
+
+// Replacement for std::vector<uint8_t> that doesn't require zero-initialization.
+struct llama_buffer {
+    uint8_t * addr = NULL;
+    size_t size = 0;
+
+    void resize(size_t size) {
+        delete[] addr;
+        addr = new uint8_t[size];
+        this->size = size;
+    }
+
+    ~llama_buffer() {
+        delete[] addr;
+    }
+};
+#endif
Author	SHA1	Message	Date
Georgi Gerganov	684da25926	ggml : fix quantize_row_q4_1() ARM_NEON (close #876 )	2023-04-10 19:29:48 +03:00
comex	180b693a47	Print model version. Also improve model type printing, and fix indentation of an unrelated switch statement.	2023-04-10 01:10:46 +02:00
comex	f963b63afa	Rewrite loading code to try to satisfy everyone: - Support all three formats (ggml, ggmf, ggjt). (However, I didn't include the hack needed to support GPT4All files without conversion. Those can still be used after converting them with convert.py from my other PR.) - Support both mmap and read (mmap is used by default, but can be disabled with `--no-mmap`, and is automatically disabled for pre-ggjt files or on platforms where mmap is not supported). - Support multi-file models like before, but automatically determine the number of parts rather than requiring `--n_parts`. - Improve validation and error checking. - Stop using the per-file type field (f16) entirely in favor of just relying on the per-tensor type/size fields. This has no immediate benefit, but makes it easier to experiment with different formats, and should make it easier to support the new GPTQ-for-LLaMa models in the future (I have some work in progress on that front). - Support VirtualLock on Windows (using the same `--mlock` option as on Unix). - Indicate loading progress when using mmap + mlock. (Which led me to the interesting observation that on my Linux machine, with a warm file cache, mlock actually takes some time, whereas mmap without mlock starts almost instantly...) - To help implement this, move mlock support from ggml to the loading code. - madvise/PrefetchVirtualMemory support (based on #740) - Switch from ifstream to the `fopen` family of functions to avoid unnecessary copying and, when mmap is enabled, allow reusing the same file descriptor for both metadata reads and mmap (whereas the existing implementation opens the file a second time to mmap). - Quantization now produces a single-file output even with multi-file inputs (not really a feature as much as 'it was easier this way'). Implementation notes: I tried to factor the code into more discrete pieces than before. Regarding code style: I tried to follow the code style, but I'm naughty and used a few advanced C++ features repeatedly: - Destructors to make it easier to ensure everything gets cleaned up. - Exceptions. I don't even usually use exceptions when writing C++, and I can remove them if desired... but here they make the loading code much more succinct while still properly handling a variety of errors, ranging from API calls failing to integer overflow and allocation failure. The exceptions are converted to error codes at the API boundary.) Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)	2023-04-10 01:10:46 +02:00
Tomáš Pazdiora	aaf3b23deb	fix for windows utf-8 input (#840 ) Use UTF-16 as input on Windows, since UTF-8 does not work and reads multibyte characters as zeros	2023-04-08 17:49:39 +02:00
eiery	f2d1c47294	cmake should link openblas properly with -lopenblas like how it's done in the makefile (#839 )	2023-04-08 11:15:17 +00:00
lon	317fb12fbd	Add new binaries to flake.nix (#847 )	2023-04-08 12:04:23 +02:00
unbounded	62cfc54f77	Add quantize-stats command for testing quantization (#728 ) Command that calculates some statistics over the errors introduced by quantization, like mean square error, max error and some percentile errors for layer weights. Should be useful for testing quantization improvements. Exposes some internal state from ggml and llama for testing	2023-04-08 00:09:18 +02:00
bhubbb	698f7b5d63	make : add libllama.so target for llama-cpp-python (#797 ) I was able to get llama-cpp-python working but only when I build libllama.so with make.	2023-04-07 19:11:58 +03:00
iacore	c1950c3431	zig : don't link examples/common.cpp for non-example (#814 )	2023-04-07 19:05:29 +03:00
Ivan Stepanov	4953e9007f	llama : always sort logits before nucleus sampling (#812 ) * Always sort logits before nucleus sampling * remove second normalization - fix windows build - remove normalization since std::discrete_distribution does not require it	2023-04-07 19:02:12 +03:00