Properly free llama_context on failure

additional optimizations for POWER9 (#454 )
Support calling mlock() on loaded model data on Linux and macOS (#453 )
2026-02-26 14:23:22 +02:00 · 2023-03-24 17:21:01 +02:00 · 2023-03-24 17:19:26 +02:00 · 2023-03-24 17:19:05 +02:00 · 2023-03-24 17:05:13 +02:00 · 2023-03-24 09:13:35 +02:00
14 changed files with 303 additions and 125 deletions
--- a/.devops/tools.sh
+++ b/.devops/tools.sh
@@ -16,11 +16,7 @@ elif [[ $arg1 == '--quantize' || $arg1 == '-q' ]]; then
    ./quantize $arg2
 elif [[ $arg1 == '--run' || $arg1 == '-r' ]]; then
    ./main $arg2
-elif [[ $arg1 == '--download' || $arg1 == '-d' ]]; then
-    python3 ./download-pth.py $arg2
 elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then
-    echo "Downloading model..."
-    python3 ./download-pth.py "$1" "$2"
    echo "Converting PTH to GGML..."
    for i in `ls $1/$2/ggml-model-f16.bin*`; do
        if [ -f "${i/f16/q4_0}" ]; then
@@ -39,8 +35,6 @@ else
    echo "              ex: \"/models/7B/\" 1"
    echo "  --quantize (-q): Optimize with quantization process ggml"
    echo "              ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
-    echo "  --download (-d): Download original llama model from CDN: https://agi.gpt4.org/llama/"
-    echo "              ex: \"/models/\" 7B"
-    echo "  --all-in-one (-a): Execute --download, --convert & --quantize"
+    echo "  --all-in-one (-a): Execute --convert & --quantize"
    echo "              ex: \"/models/\" 7B"
 fi
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -218,6 +218,9 @@ add_library(utils OBJECT
 target_include_directories(utils PUBLIC .)
 target_compile_features(utils PUBLIC cxx_std_11) # don't bump
 target_link_libraries(utils PRIVATE ${LLAMA_EXTRA_LIBS})
+if (BUILD_SHARED_LIBS)
+    set_target_properties(utils PROPERTIES POSITION_INDEPENDENT_CODE ON)
+endif()

 add_library(ggml OBJECT
            ggml.c
@@ -226,6 +229,9 @@ add_library(ggml OBJECT
 target_include_directories(ggml PUBLIC .)
 target_compile_features(ggml PUBLIC c_std_11) # don't bump
 target_link_libraries(ggml PRIVATE Threads::Threads ${LLAMA_EXTRA_LIBS})
+if (BUILD_SHARED_LIBS)
+    set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
+endif()

 add_library(llama
            llama.cpp
@@ -234,6 +240,10 @@ add_library(llama
 target_include_directories(llama PUBLIC .)
 target_compile_features(llama PUBLIC cxx_std_11) # don't bump
 target_link_libraries(llama PRIVATE utils ggml ${LLAMA_EXTRA_LIBS})
+if (BUILD_SHARED_LIBS)
+    set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
+    target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD)
+endif()

 #
 # Executables
--- a/Makefile
+++ b/Makefile
@@ -156,7 +156,8 @@ endif
 ifneq ($(filter ppc64%,$(UNAME_M)),)
 	POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
 	ifneq (,$(findstring POWER9,$(POWER9_M)))
-		CFLAGS += -mpower9-vector
+		CFLAGS += -mcpu=power9
+		CXXFLAGS += -mcpu=power9
 	endif
 	# Require c++23's std::byteswap for big-endian support.
 	ifeq ($(UNAME_M),ppc64)
--- a/README.md
+++ b/README.md
@@ -7,8 +7,8 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

 **Hot topics:**

+- [Roadmap (short-term)](https://github.com/ggerganov/llama.cpp/discussions/457)
 - New C-style API is now available: https://github.com/ggerganov/llama.cpp/pull/370
-- [Added Alpaca support](https://github.com/ggerganov/llama.cpp#instruction-mode-with-alpaca)
 - Cache input prompts for faster initialization: https://github.com/ggerganov/llama.cpp/issues/64
 - Create a `llama.cpp` logo: https://github.com/ggerganov/llama.cpp/issues/105

--- a/convert-gptq-to-ggml.py
+++ b/convert-gptq-to-ggml.py
@@ -36,7 +36,8 @@ fname_out = sys.argv[3]

 fout = open(fname_out, "wb")

-fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
+fout.write(struct.pack("i", 0x67676d66)) # magic: ggmf in hex
+fout.write(struct.pack("i", 1)) # file version
 fout.write(struct.pack("i", n_vocab))
 fout.write(struct.pack("i", n_embd))
 fout.write(struct.pack("i", n_mult))
@@ -49,27 +50,21 @@ fout.write(struct.pack("i", 4))
 # This loop unchanged from convert-pth-to-ggml.py:
 for i in range(tokenizer.vocab_size()):
    if tokenizer.is_unknown(i):
-        # "<unk>" token (translated as ??)
        text = " \u2047 ".encode("utf-8")
-        fout.write(struct.pack("i", len(text)))
-        fout.write(text)
    elif tokenizer.is_control(i):
-        # "<s>"/"</s>" tokens
-        fout.write(struct.pack("i", 0))
+        text = b""
    elif tokenizer.is_byte(i):
-        # "<U+XX>" tokens (which may be invalid UTF-8)
        piece = tokenizer.id_to_piece(i)
        if len(piece) != 6:
-            print("Invalid token: " + piece)
+            print(f"Invalid token: {piece}")
            sys.exit(1)
        byte_value = int(piece[3:-1], 16)
-        fout.write(struct.pack("i", 1))
-        fout.write(struct.pack("B", byte_value))
+        text = struct.pack("B", byte_value)
    else:
-        # normal token. Uses U+2581 (LOWER ONE EIGHTH BLOCK) to represent spaces.
        text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
-        fout.write(struct.pack("i", len(text)))
-        fout.write(text)
+    fout.write(struct.pack("i", len(text)))
+    fout.write(text)
+    fout.write(struct.pack("f", tokenizer.get_score(i)))

 def write_header(shape, dst_name, ftype_cur):
    sname = dst_name.encode('utf-8')
--- a/download-pth.py
+++ b/download-pth.py
@@ -1,66 +0,0 @@
-import os
-import sys
-from tqdm import tqdm
-import requests
-
-if len(sys.argv) < 3:
-    print("Usage: download-pth.py dir-model model-type\n")
-    print("  model-type: Available models 7B, 13B, 30B or 65B")
-    sys.exit(1)
-
-modelsDir = sys.argv[1]
-model = sys.argv[2]
-
-num = {
-    "7B": 1,
-    "13B": 2,
-    "30B": 4,
-    "65B": 8,
-}
-
-if model not in num:
-    print(f"Error: model {model} is not valid, provide 7B, 13B, 30B or 65B")
-    sys.exit(1)
-
-print(f"Downloading model {model}")
-
-files = ["checklist.chk", "params.json"]
-
-for i in range(num[model]):
-    files.append(f"consolidated.0{i}.pth")
-
-resolved_path = os.path.abspath(os.path.join(modelsDir, model))
-os.makedirs(resolved_path, exist_ok=True)
-
-for file in files:
-    dest_path = os.path.join(resolved_path, file)
-    
-    if os.path.exists(dest_path):
-        print(f"Skip file download, it already exists: {file}")
-        continue
-
-    url = f"https://agi.gpt4.org/llama/LLaMA/{model}/{file}"
-    response = requests.get(url, stream=True)
-    with open(dest_path, 'wb') as f:
-        with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t:
-            for chunk in response.iter_content(chunk_size=1024):
-                if chunk:
-                    f.write(chunk)
-                    t.update(len(chunk))
-
-files2 = ["tokenizer_checklist.chk", "tokenizer.model"]
-for file in files2:
-    dest_path = os.path.join(modelsDir, file)
-    
-    if os.path.exists(dest_path):
-        print(f"Skip file download, it already exists: {file}")
-        continue
-    
-    url = f"https://agi.gpt4.org/llama/LLaMA/{file}"
-    response = requests.get(url, stream=True)
-    with open(dest_path, 'wb') as f:
-        with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t:
-            for chunk in response.iter_content(chunk_size=1024):
-                if chunk:
-                    f.write(chunk)
-                    t.update(len(chunk))
--- a/ggml.c
+++ b/ggml.c
@@ -1,5 +1,5 @@
-// Defines CLOCK_MONOTONIC on Linux
-#define _POSIX_C_SOURCE 199309L
+// Defines CLOCK_MONOTONIC and asprintf on Linux
+#define _GNU_SOURCE

 #include "ggml.h"

@@ -10,6 +10,7 @@
 #endif

 #include <assert.h>
+#include <errno.h>
 #include <time.h>
 #include <math.h>
 #include <stdlib.h>
@@ -31,7 +32,6 @@
 #else
 // ref: https://github.com/ggerganov/whisper.cpp/issues/168
 #include <windows.h>
-#include <errno.h>
 #endif

 typedef volatile LONG atomic_int;
@@ -83,6 +83,17 @@ typedef void* thread_ret_t;
 #define static_assert(cond, msg) _Static_assert(cond, msg)
 #endif

+#define GGML_MLOCK_SUPPORT 0
+
+#ifdef __has_include
+    #if __has_include(<sys/mman.h>)
+        #undef GGML_MLOCK_SUPPORT
+        #define GGML_MLOCK_SUPPORT 1
+        #include <sys/mman.h>
+    #endif
+#endif
+
+
 /*#define GGML_PERF*/
 #define GGML_DEBUG 0
 #define GGML_GELU_FP16
@@ -164,6 +175,39 @@ typedef double ggml_float;
 #define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
 #define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)

+#elif defined(__POWER9_VECTOR__)
+
+#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
+/* the inline asm below is about 12% faster than the lookup method */
+#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
+#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
+
+static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
+    register float f;
+    register double d;
+    __asm__(
+        "mtfprd %0,%2\n"
+        "xscvhpdp %0,%0\n"
+        "frsp %1,%0\n" :
+        /* temp */ "=d"(d),
+        /* out */  "=f"(f):
+        /* in */   "r"(h));
+    return f;
+}
+
+static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
+    register double d;
+    register ggml_fp16_t r;
+    __asm__( /* xscvdphp can work on double or single precision */
+        "xscvdphp %0,%2\n"
+        "mffprd %1,%0\n" :
+        /* temp */ "=d"(d),
+        /* out */  "=r"(r):
+        /* in */   "f"(f));
+    return r;
+}
+
 #else

 // FP16 <-> FP32
@@ -261,6 +305,7 @@ static float table_f32_f16[1 << 16];

 // On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
 // so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
+// This is also true for POWER9.
 #if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)

 inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
@@ -451,7 +496,7 @@ static void quantize_row_q4_0_reference(const float * restrict x, void * restric
 void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
    assert(k % QK == 0);

-#if __ARM_NEON || defined(__AVX2__) || defined(__wasm_simd128__)
+#if __ARM_NEON || defined(__AVX2__) || defined(__wasm_simd128__) || defined(__POWER9_VECTOR__)
    const int nb = k / QK;
    const size_t bs = sizeof(float) + QK/2;

@@ -461,7 +506,52 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
    uint8_t pp[QK/2];
 #endif

-#if __ARM_NEON
+#if defined(__POWER9_VECTOR__)
+#if QK == 32
+    const vector float v85 = vec_splats(8.5f);
+    for (int i = 0; i < nb; i++) {
+        float amax = 0.0f; // absolute max
+
+        vector float srcv [8];
+        vector float asrcv[8];
+        vector float amaxv[8];
+
+        for (int l = 0; l < 8; l++) srcv[l]  = *(vector float *)(x + i*32 + 4*l);
+        for (int l = 0; l < 8; l++) asrcv[l] = vec_abs(srcv[l]);
+
+        for (int l = 0; l < 4; l++) amaxv[2*l] = vec_max(asrcv[2*l], asrcv[2*l+1]);
+        //for (int l = 0; l < 2; l++) amaxv[4*l] = vec_max(amaxv[4*l], amaxv[4*l+2]);
+        amaxv[0] = vec_max(amaxv[0], amaxv[2]);
+        amaxv[4] = vec_max(amaxv[4], amaxv[6]);
+        //for (int l = 0; l < 1; l++) amaxv[8*l] = vec_max(amaxv[8*l], amaxv[8*l+4]);
+        amaxv[0] = vec_max(amaxv[0], amaxv[4]);
+
+        amax = MAX(
+                MAX(vec_extract(amaxv[0], 0), vec_extract(amaxv[0], 1)),
+                MAX(vec_extract(amaxv[0], 2), vec_extract(amaxv[0], 3)));
+
+        const float d = amax / ((1 << 3) - 1);
+        const float id = d ? 1.0/d : 0.0;
+
+        *(float *)pd = d;
+        pd += bs;
+
+        const vector float vid = vec_splats(id);
+        for (int l = 0; l < 8; l++) {
+            const vector float vf  = vec_madd(srcv[l], vid, v85);
+            const vector signed int vi = vec_signed(vf);
+
+            pb[2*l + 0] = vec_extract(vi, 0) | (vec_extract(vi, 1) << 4);
+            pb[2*l + 1] = vec_extract(vi, 2) | (vec_extract(vi, 3) << 4);
+        }
+
+        //memcpy(pb, pp, sizeof(pp));
+        pb += bs;
+    }
+#else
+#error "not implemented for QK"
+#endif
+#elif __ARM_NEON
 #if QK == 32
    for (int i = 0; i < nb; i++) {
        float amax = 0.0f; // absolute max
@@ -2344,6 +2434,7 @@ struct ggml_context {
    size_t mem_size;
    void * mem_buffer;
    bool   mem_buffer_owned;
+    bool   mem_buffer_mlocked;

    int n_objects;

@@ -2619,16 +2710,19 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
    }

    *ctx = (struct ggml_context) {
-        /*.mem_size         =*/ params.mem_size,
-        /*.mem_buffer       =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size),
-        /*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
-        /*.n_objects        =*/ 0,
-        /*.objects_begin    =*/ NULL,
-        /*.objects_end      =*/ NULL,
-        /*.scratch          =*/ { 0, 0, NULL, },
-        /*.scratch_save     =*/ { 0, 0, NULL, },
+        /*.mem_size           =*/ params.mem_size,
+        /*.mem_buffer         =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size),
+        /*.mem_buffer_owned   =*/ params.mem_buffer ? false : true,
+        /*.mem_buffer_mlocked =*/ false,
+        /*.n_objects          =*/ 0,
+        /*.objects_begin      =*/ NULL,
+        /*.objects_end        =*/ NULL,
+        /*.scratch            =*/ { 0, 0, NULL, },
+        /*.scratch_save       =*/ { 0, 0, NULL, },
    };

+    GGML_ASSERT(ctx->mem_buffer != NULL); // check for allocation failure
+
    ggml_assert_aligned(ctx->mem_buffer);

    GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
@@ -2651,6 +2745,14 @@ void ggml_free(struct ggml_context * ctx) {
            GGML_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n",
                    __func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size);

+#if GGML_MLOCK_SUPPORT
+            if (ctx->mem_buffer_mlocked) {
+                if (munlock(ctx->mem_buffer, ctx->mem_size)) {
+                    fprintf(stderr, "%s: failed to munlock buffer: %s\n", __func__, strerror(errno));
+                }
+            }
+#endif
+
            if (ctx->mem_buffer_owned) {
                free(ctx->mem_buffer);
            }
@@ -2679,6 +2781,37 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch)
    return result;
 }

+bool ggml_mlock_supported(void) {
+    return GGML_MLOCK_SUPPORT;
+}
+
+#if GGML_MLOCK_SUPPORT
+#ifdef __APPLE__
+    #define MLOCK_SUGGESTION "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or\n" \
+                             "decreasing 'vm.global_no_user_wire_amount'.  Also try increasing RLIMIT_MLOCK (ulimit -l)."
+#else
+    #define MLOCK_SUGGESTION "Try increasing RLIMIT_MLOCK (ulimit -l)."
+#endif
+bool ggml_mlock(struct ggml_context * ctx, char ** err_p) {
+    if (ctx->mem_buffer_mlocked) {
+        return true;
+    }
+    if (mlock(ctx->mem_buffer, ctx->mem_size)) {
+        int ret = asprintf(err_p, "failed to mlock %zu-byte buffer: %s\n" MLOCK_SUGGESTION,
+                           ctx->mem_size, strerror(errno));
+        GGML_ASSERT(ret >= 0);
+        return false;
+    }
+    ctx->mem_buffer_mlocked = true;
+    return true;
+}
+#else // GGML_MLOCK_SUPPORT
+bool ggml_mlock(struct ggml_context * ctx, char ** err_p) {
+    *err_p = strdup("can't mlock because it's not supported on this system");
+    return false;
+}
+#endif // GGML_MLOCK_SUPPORT
+
 ////////////////////////////////////////////////////////////////////////////////

 struct ggml_tensor * ggml_new_tensor_impl(
--- a/ggml.h
+++ b/ggml.h
@@ -343,6 +343,9 @@ size_t ggml_used_mem(const struct ggml_context * ctx);

 size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);

+bool ggml_mlock_supported(void);
+bool ggml_mlock(struct ggml_context * ctx, char ** err_p);
+
 struct ggml_tensor * ggml_new_tensor(
        struct ggml_context * ctx,
        enum   ggml_type type,
--- a/llama.cpp
+++ b/llama.cpp
@@ -102,6 +102,9 @@ struct llama_context {
    // decode output (2-dimensional array: [n_tokens][n_vocab])
    std::vector<float> logits;
    bool logits_all = false;
+
+    // input embedding (1-dimensional array: [n_embd])
+    std::vector<float> embedding;
 };

 struct llama_context_params llama_context_default_params() {
@@ -112,6 +115,8 @@ struct llama_context_params llama_context_default_params() {
        /*.f16_kv     =*/ false,
        /*.logits_all =*/ false,
        /*.vocab_only =*/ false,
+        /*.use_mlock  =*/ false,
+        /*.embedding  =*/ false,
    };

    return result;
@@ -592,8 +597,6 @@ static bool llama_model_load(
        fin.close();
    }

-    lctx.logits.reserve(lctx.model.hparams.n_ctx);
-
    lctx.t_load_us = ggml_time_us() - t_start_us;

    return true;
@@ -727,11 +730,13 @@ static bool llama_eval_internal(

            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
            struct ggml_tensor * V_trans =
-                ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
-                            n_embd/n_head, n_head, n_past + N),
-                        1, 2, 0, 3);
+                ggml_cpy(ctx0,
+                    ggml_permute(ctx0,
+                            ggml_reshape_3d(ctx0,
+                                ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
+                                n_embd/n_head, n_head, n_past + N),
+                            1, 2, 0, 3),
+                    ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head));

            // KQV = transpose(V) * KQ_soft_max
            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
@@ -789,6 +794,9 @@ static bool llama_eval_internal(
        inpL = cur;
    }

+    // used at the end to optionally extract the embeddings
+    struct ggml_tensor * embeddings = NULL;
+
    // norm
    {
        inpL = ggml_rms_norm(ctx0, inpL);
@@ -797,6 +805,8 @@ static bool llama_eval_internal(
        inpL = ggml_mul(ctx0,
                    ggml_repeat(ctx0, model.norm, inpL),
                    inpL);
+
+        embeddings = inpL;
    }

    // lm_head
@@ -819,15 +829,26 @@ static bool llama_eval_internal(
    //embd_w.resize(n_vocab*N);
    //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);

-    auto & logits_out = lctx.logits;
+    // extract logits
+    {
+        auto & logits_out = lctx.logits;

-    if (lctx.logits_all) {
-        logits_out.resize(n_vocab * N);
-        memcpy(logits_out.data(), (float *) ggml_get_data(inpL), sizeof(float)*n_vocab*N);
-    } else {
-        // return result for just the last token
-        logits_out.resize(n_vocab);
-        memcpy(logits_out.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
+        if (lctx.logits_all) {
+            logits_out.resize(n_vocab * N);
+            memcpy(logits_out.data(), (float *) ggml_get_data(inpL), sizeof(float)*n_vocab*N);
+        } else {
+            // return result for just the last token
+            logits_out.resize(n_vocab);
+            memcpy(logits_out.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
+        }
+    }
+
+    // extract embeddings
+    if (lctx.embedding.size()) {
+        auto & embedding_out = lctx.embedding;
+
+        embedding_out.resize(n_embd);
+        memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
    }

    if (mem_per_token == 0) {
@@ -1408,17 +1429,44 @@ struct llama_context * llama_init_from_file(

    ggml_type type_memory = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

-    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_parts, type_memory, params.vocab_only)) {
+    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_parts, type_memory,
+                          params.vocab_only)) {
        fprintf(stderr, "%s: failed to load model\n", __func__);
-        delete ctx;
+        llama_free(ctx);
        return nullptr;
    }

+    if (params.use_mlock) {
+        char *err;
+        if (!ggml_mlock(ctx->model.ctx, &err)) {
+            fprintf(stderr, "%s\n", err);
+            free(err);
+            llama_free(ctx);
+            return nullptr;
+        }
+    }
+
+    // reserve memory for context buffers
+    {
+        const auto & hparams = ctx->model.hparams;
+        if (params.logits_all) {
+            ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
+        } else {
+            ctx->logits.reserve(hparams.n_ctx);
+        }
+
+        if (params.embedding){
+            ctx->embedding.reserve(hparams.n_embd);
+        }
+    }
+
    return ctx;
 }

 void llama_free(struct llama_context * ctx) {
-    ggml_free(ctx->model.ctx);
+    if (ctx->model.ctx) {
+        ggml_free(ctx->model.ctx);
+    }

    delete ctx;
 }
@@ -1482,6 +1530,10 @@ float * llama_get_logits(struct llama_context * ctx) {
    return ctx->logits.data();
 }

+float * llama_get_embeddings(struct llama_context * ctx) {
+    return ctx->embedding.data();
+}
+
 const char * llama_token_to_str(struct llama_context * ctx, llama_token token) {
    if (token >= llama_n_vocab(ctx)) {
        return nullptr;
--- a/llama.h
+++ b/llama.h
@@ -53,6 +53,8 @@ extern "C" {
        bool f16_kv;     // use fp16 for KV cache
        bool logits_all; // the llama_eval() call computes all logits, not just the last one
        bool vocab_only; // only load the vocabulary, no weights
+        bool use_mlock;  // force system to keep model in RAM
+        bool embedding;  // embedding mode only
    };

    LLAMA_API struct llama_context_params llama_context_default_params();
@@ -108,6 +110,10 @@ extern "C" {
    // Cols: n_vocab
    LLAMA_API float * llama_get_logits(struct llama_context * ctx);

+    // Get the embeddings for the input
+    // shape: [n_embd] (1-dimensional)
+    LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
+
    // Token Id -> String. Uses the vocabulary in the provided context
    LLAMA_API const char * llama_token_to_str(struct llama_context * ctx, llama_token token);

--- a/main.cpp
+++ b/main.cpp
@@ -199,6 +199,8 @@ int main(int argc, char ** argv) {
        lparams.seed       = params.seed;
        lparams.f16_kv     = params.memory_f16;
        lparams.logits_all = params.perplexity;
+        lparams.use_mlock  = params.use_mlock;
+        lparams.embedding  = params.embedding;

        ctx = llama_init_from_file(params.model.c_str(), lparams);

@@ -258,6 +260,9 @@ int main(int argc, char ** argv) {
        params.interactive = true;
    }

+    // determine newline token
+    auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
+
    fprintf(stderr, "\n");
    fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
    fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
@@ -289,6 +294,7 @@ int main(int argc, char ** argv) {

    std::vector<llama_token> embd;

+
    int last_n_size = params.repeat_last_n;
    std::vector<llama_token> last_n_tokens(last_n_size);
    std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
@@ -321,6 +327,27 @@ int main(int argc, char ** argv) {
    // the first thing we will do is to output the prompt, so set color accordingly
    set_console_state(CONSOLE_STATE_PROMPT);

+    if (params.embedding){
+        embd = embd_inp;
+
+        if (embd.size() > 0) {
+            if (llama_eval(ctx, embd.data(), embd.size(), n_past, params.n_threads)) {
+                fprintf(stderr, "%s : failed to eval\n", __func__);
+                return 1;
+            }
+        }
+
+        const auto embeddings = llama_get_embeddings(ctx);
+
+        // TODO: print / use the embeddings
+
+        if (params.use_color) {
+            printf(ANSI_COLOR_RESET);
+        }
+
+        return 0;
+    }
+
    while (remaining_tokens > 0 || params.interactive) {
        // predict
        if (embd.size() > 0) {
@@ -359,6 +386,16 @@ int main(int argc, char ** argv) {
                last_n_tokens.push_back(id);
            }

+            // replace end of text token with newline token when in interactive mode
+            if (id == llama_token_eos() && params.interactive) {
+                id = llama_token_newline.front();
+                if (params.antiprompt.size() != 0) {
+                    // tokenize and inject first reverse prompt
+                    const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
+                    embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
+                }
+            }
+
            // add it to the context
            embd.push_back(id);

@@ -451,12 +488,8 @@ int main(int argc, char ** argv) {

        // end of text token
        if (embd.back() == llama_token_eos()) {
-            if (params.interactive) {
-                is_interacting = true;
-            } else {
-                fprintf(stderr, " [end of text]\n");
-                break;
-            }
+            fprintf(stderr, " [end of text]\n");
+            break;
        }

        // In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
--- a/quantize.py
+++ b/quantize.py
@@ -57,6 +57,7 @@ def main():
    # )

    args = parser.parse_args()
+    args.models_path = os.path.abspath(args.models_path)

    if not os.path.isfile(args.quantize_script_path):
        print(
--- a/utils.cpp
+++ b/utils.cpp
@@ -1,3 +1,5 @@
+#include "ggml.h"
+
 #include "utils.h"

 #include <cassert>
@@ -117,12 +119,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
            params.model = argv[i];
        } else if (arg == "-i" || arg == "--interactive") {
            params.interactive = true;
+        } else if (arg == "--embedding") {
+            params.embedding = true;
+        } else if (arg == "--interactive-start") {
+            params.interactive = true;
        } else if (arg == "--interactive-first") {
            params.interactive_start = true;
        } else if (arg == "-ins" || arg == "--instruct") {
            params.instruct = true;
        } else if (arg == "--color") {
            params.use_color = true;
+        } else if (arg == "--mlock") {
+            params.use_mlock = true;
        } else if (arg == "-r" || arg == "--reverse-prompt") {
            if (++i >= argc) {
                invalid_param = true;
@@ -190,6 +198,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    fprintf(stderr, "  --n_parts N           number of model parts (default: -1 = determine from dimensions)\n");
    fprintf(stderr, "  -b N, --batch_size N  batch size for prompt processing (default: %d)\n", params.n_batch);
    fprintf(stderr, "  --perplexity          compute perplexity over the prompt\n");
+    if (ggml_mlock_supported()) {
+        fprintf(stderr, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
+    }
    fprintf(stderr, "  -m FNAME, --model FNAME\n");
    fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
    fprintf(stderr, "\n");
--- a/utils.h
+++ b/utils.h
@@ -32,16 +32,21 @@ struct gpt_params {
    std::string model  = "models/lamma-7B/ggml-model.bin"; // model path
    std::string prompt = "";

+
    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted

    bool memory_f16        = false; // use f16 instead of f32 for memory kv
    bool random_prompt     = false; // do not randomize prompt if none provided
    bool use_color         = false; // use color to distinguish generations and inputs
    bool interactive       = false; // interactive mode
+
+    bool embedding         = false; // get only sentence embedding
    bool interactive_start = false; // wait for user input immediately
+
    bool instruct          = false; // instruction mode (used for Alpaca models)
    bool ignore_eos        = false; // do not stop generating after eos
    bool perplexity        = false; // compute perplexity over the prompt
+    bool use_mlock         = false; // use mlock to keep model in memory
 };

 bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
Author	SHA1	Message	Date
Georgi Gerganov	afd220d9c6	Properly free llama_context on failure	2023-03-24 17:21:01 +02:00
Cameron Kaiser	481044d50c	additional optimizations for POWER9 (#454 )	2023-03-24 17:19:26 +02:00
comex	563cdc391d	Support calling mlock() on loaded model data on Linux and macOS (#453 ) * Support calling mlock() on loaded model data on Linux and macOS This is enabled by a new --mlock command line option. Using mlock() disables swapping and memory compression for the model data. Doing so can be useful on systems where the model takes up a large fraction of system RAM. In my experience, macOS is quite eager to start compressing llama.cpp's memory, which then makes it halt for a few seconds while it decompresses, even with a model that uses "only" 25GB out of 32GB. Of course, this comes at the cost of forcing the system to swap or compress other processes' memory instead, so it needs to be used with care and shouldn't be enabled by default. In theory it should be possible to support this on Windows as well using VirtualLock(), but I'm not much of a Windows user. * Update llama.cpp --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2023-03-24 17:19:05 +02:00
Luciano	8d4a855c24	Add embedding mode with arg flag. Currently working (#282 ) * working but ugly * add arg flag, not working on embedding mode * typo * Working! Thanks to @nullhook * make params argument instead of hardcoded boolean. remove useless time check * start doing the instructions but not finished. This probably doesnt compile * Embeddings extraction support --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2023-03-24 17:05:13 +02:00
Georgi Gerganov	b6b268d441	Add link to Roadmap discussion	2023-03-24 09:13:35 +02:00
Georgi Gerganov	3cd8dde0d1	Revert "Fix memory allocation issues and seg faults" This reverts commit `4870e455b3`. Will provide the correct fix later	2023-03-24 06:22:28 +02:00
Georgi Gerganov	4870e455b3	Fix memory allocation issues and seg faults	2023-03-24 00:11:53 +02:00
Georgi Gerganov	483bab2e3d	Avoid the transposed X branch in the Z = X * Y matrix multiplication (#439 ) Should make results reproducible for different number of threads and batch sizes	2023-03-23 23:22:01 +02:00
Jed Fox	404e1da38e	Fix quantize script not finding models in parent directory (#428 )	2023-03-23 22:42:52 +02:00
Georgi Gerganov	4cc053b6d5	Remove obsolete command from Docker script	2023-03-23 22:39:44 +02:00
Georgi Gerganov	0ba5a3a9a5	Obsolete	2023-03-23 22:32:21 +02:00
rabidcopy	2e17dfd80a	Replace EOS with newline to prevent context/memory being flushed by EOS in interactive mode (#333 ) * Improve interactive mode's coherence after EOS Aims to improve coherence and ability to resume the interactive session when the user is given input back after an end of text token is reached. Not sure what token 13 is or why it seems to help. See conversation for examples. * Make newline token a constant * dynamically determine newline token * relocate previous newline token const * cleanup whitespace * print a new line on end of text in interactive this may need to be looked into further when not using a reverse prompt * only print manual newline with reverse prompt fix formatting of reverse prompts so they don't end up at the end of the current line while not introducing unnecessary new lines otherwise * alternate approach to replace end of text tokens * Inject the reverse prompt again after eos in interactive mode * tokenize reverse prompt when needed makes this PR compatible with https://github.com/ggerganov/llama.cpp/pull/330 * tokenize and inject only first reverse prompt thanks to tjohnman * tokenize first reverse prompt once * add newline token * add newline token * tokenize/inject reverse prompt for refactor this doesn't seem right though * tokenize nothing for antiprompt if no reverse * Update main.cpp * Update main.cpp * tokenize and inject reverse prompt as needed this doesn't seem to work if the reverse prompt is tokenized outside earlier on * not needed * remove newline token * remove newline token * tokenize newline token * add space to comment * Update main.cpp Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> --------- Co-authored-by: Slaren <2141330+slaren@users.noreply.github.com> Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2023-03-23 22:22:47 +02:00
Timmy Knight	20a1a4e09c	Fix GPTQ converter (#423 ) * Fix GPTQ converter * Fix comment --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2023-03-23 22:18:13 +02:00
nusu-github	ad072fc5ad	Generate library with CMake (#430 ) * Generate library with CMake BUILD_SHARED_LIBS to allow llama library to be generated. * Turn ON PIC when BUILD_SHARED_LIBS is ON	2023-03-23 21:16:48 +01:00