Define non-positive temperature behavior (#720 )

Remove torch GPU dependencies from the Docker.full image (#665 )
By using `pip install torch --index-url https://download.pytorch.org/whl/cpu` instead of `pip install torch` we can specify we want to install a CPU-only version of PyTorch without any GPU dependencies. This reduces the size of the Docker image from 7.32 GB to 1.62 GB
2026-04-23 16:37:33 +03:00 · 2023-04-03 02:19:04 +02:00 · 2023-04-03 00:13:03 +02:00 · 2023-04-02 12:48:57 +02:00 · 2023-04-02 12:23:04 +02:00 · 2023-04-02 13:21:31 +03:00
10 changed files with 465 additions and 473 deletions
--- a/.devops/full.Dockerfile
+++ b/.devops/full.Dockerfile
@@ -6,7 +6,8 @@ RUN apt-get update && \
    apt-get install -y build-essential python3 python3-pip

 RUN pip install --upgrade pip setuptools wheel \
-    && pip install numpy requests sentencepiece torch tqdm
+    && pip install numpy requests sentencepiece tqdm \
+    && pip install torch --index-url https://download.pytorch.org/whl/cpu

 WORKDIR /app

--- a/91
+++ b/91
@@ -70,95 +70,8 @@ endif
 # TODO: probably these flags need to be tweaked on some architectures
 #       feel free to update the Makefile for your architecture and send a pull request or issue
 ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
-	ifeq ($(UNAME_S),Darwin)
-		F16C_M := $(shell sysctl machdep.cpu.features)
-		ifneq (,$(findstring F16C,$(F16C_M)))
-		    CFLAGS += -mf16c
-		endif
-		AVX1_M := $(shell sysctl machdep.cpu.features)
-		ifneq (,$(findstring FMA,$(AVX1_M)))
-			CFLAGS += -mfma
-		endif
-		ifneq (,$(findstring AVX1.0,$(AVX1_M)))
-			CFLAGS += -mavx
-		endif
-		AVX2_M := $(shell sysctl machdep.cpu.leaf7_features)
-		ifneq (,$(findstring AVX2,$(AVX2_M)))
-			CFLAGS += -mavx2
-		endif
-	else ifeq ($(UNAME_S),Linux)
-		AVX1_M := $(shell grep "avx " /proc/cpuinfo)
-		ifneq (,$(findstring avx,$(AVX1_M)))
-			CFLAGS += -mavx
-		endif
-		AVX2_M := $(shell grep "avx2 " /proc/cpuinfo)
-		ifneq (,$(findstring avx2,$(AVX2_M)))
-			CFLAGS += -mavx2
-		endif
-		FMA_M := $(shell grep "fma " /proc/cpuinfo)
-		ifneq (,$(findstring fma,$(FMA_M)))
-			CFLAGS += -mfma
-		endif
-		F16C_M := $(shell grep "f16c " /proc/cpuinfo)
-		ifneq (,$(findstring f16c,$(F16C_M)))
-			CFLAGS += -mf16c
-		endif
-		SSE3_M := $(shell grep "sse3 " /proc/cpuinfo)
-		ifneq (,$(findstring sse3,$(SSE3_M)))
-			CFLAGS += -msse3
-		endif
-		AVX512F_M := $(shell grep "avx512f " /proc/cpuinfo)
-		ifneq (,$(findstring avx512f,$(AVX512F_M)))
-			CFLAGS += -mavx512f
-		endif
-		AVX512BW_M := $(shell grep "avx512bw " /proc/cpuinfo)
-		ifneq (,$(findstring avx512bw,$(AVX512BW_M)))
-			CFLAGS += -mavx512bw
-		endif
-		AVX512DQ_M := $(shell grep "avx512dq " /proc/cpuinfo)
-		ifneq (,$(findstring avx512dq,$(AVX512DQ_M)))
-			CFLAGS += -mavx512dq
-		endif
-		AVX512VL_M := $(shell grep "avx512vl " /proc/cpuinfo)
-		ifneq (,$(findstring avx512vl,$(AVX512VL_M)))
-			CFLAGS += -mavx512vl
-		endif
-		AVX512CD_M := $(shell grep "avx512cd " /proc/cpuinfo)
-		ifneq (,$(findstring avx512cd,$(AVX512CD_M)))
-			CFLAGS += -mavx512cd
-		endif
-		AVX512ER_M := $(shell grep "avx512er " /proc/cpuinfo)
-		ifneq (,$(findstring avx512er,$(AVX512ER_M)))
-			CFLAGS += -mavx512er
-		endif
-		AVX512IFMA_M := $(shell grep "avx512ifma " /proc/cpuinfo)
-		ifneq (,$(findstring avx512ifma,$(AVX512IFMA_M)))
-			CFLAGS += -mavx512ifma
-		endif
-		AVX512PF_M := $(shell grep "avx512pf " /proc/cpuinfo)
-		ifneq (,$(findstring avx512pf,$(AVX512PF_M)))
-			CFLAGS += -mavx512pf
-		endif
-	else ifeq ($(UNAME_S),Haiku)
-		AVX1_M := $(shell sysinfo -cpu | grep -w "AVX")
-		ifneq (,$(findstring AVX,$(AVX1_M)))
-			CFLAGS += -mavx
-		endif
-		AVX2_M := $(shell sysinfo -cpu | grep -w "AVX2")
-		ifneq (,$(findstring AVX2,$(AVX2_M)))
-			CFLAGS += -mavx2
-		endif
-		FMA_M := $(shell sysinfo -cpu | grep -w "FMA")
-		ifneq (,$(findstring FMA,$(FMA_M)))
-			CFLAGS += -mfma
-		endif
-		F16C_M := $(shell sysinfo -cpu | grep -w "F16C")
-		ifneq (,$(findstring F16C,$(F16C_M)))
-			CFLAGS += -mf16c
-		endif
-	else
-		CFLAGS += -mfma -mf16c -mavx -mavx2
-	endif
+	# Use all CPU extensions that are available:
+	CFLAGS += -march=native -mtune=native
 endif
 ifneq ($(filter ppc64%,$(UNAME_M)),)
 	POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
--- a/README.md
+++ b/README.md
@@ -232,13 +232,15 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.

 - Obtain the `gpt4all-lora-quantized.bin` model
 - It is distributed in the old `ggml` format which is now obsoleted
- You have to convert it to the new format using [./convert-gpt4all-to-ggml.py](./convert-gpt4all-to-ggml.py):
+- You have to convert it to the new format using [./convert-gpt4all-to-ggml.py](./convert-gpt4all-to-ggml.py). You may also need to
+convert the model from the old format to the new format with [./migrate-ggml-2023-03-30-pr613.py](./migrate-ggml-2023-03-30-pr613.py):

  ```bash
  python3 convert-gpt4all-to-ggml.py models/gpt4all-7B/gpt4all-lora-quantized.bin ./models/tokenizer.model 
+  python3 migrate-ggml-2023-03-30-pr613.py models/gpt4all-7B/gpt4all-lora-quantized.bin models/gpt4all-7B/gpt4all-lora-quantized-new.bin
  ```
  
- You can now use the newly generated `gpt4all-lora-quantized.bin` model in exactly the same way as all other models
+- You can now use the newly generated `gpt4all-lora-quantized-new.bin` model in exactly the same way as all other models
 - The original model is saved in the same folder with a suffix `.orig`

 ### Obtaining and verifying the Facebook LLaMA original model and Stanford Alpaca model data
@@ -301,7 +303,7 @@ And after 4.45 hours, you will have the final perplexity.

 ### Android

-You can easily run `llama.cpp` on Android device with [termux](https://play.google.com/store/apps/details?id=com.termux).
+You can easily run `llama.cpp` on Android device with [termux](https://termux.dev/).
 First, obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake:
 ```
 $ mkdir build-android
@@ -310,7 +312,7 @@ $ export NDK=<your_ndk_directory>
 $ cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
 $ make
 ```
-Install [termux](https://play.google.com/store/apps/details?id=com.termux) on your device and run `termux-setup-storage` to get access to your SD card.
+Install [termux](https://termux.dev/) on your device and run `termux-setup-storage` to get access to your SD card.
 Finally, copy the `llama` binary and the model files to your device storage. Here is a demo of an interactive session running on Pixel 5 phone:

 https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4
--- a/convert-ggml-to-pth.py
+++ b/convert-ggml-to-pth.py
@@ -254,7 +254,7 @@ def main():
    parser.add_argument(
        "--hf",
        action="store_true",
-        help="Whether to save the model in the huggingface format. (default: False)",
+        help="Whether to save the model in the Hugging Face format. (default: False)",
    )
    parser.add_argument(
        "--chat", "-c", action="store_true", help="Whether to open a chat with the model. (default: False)"
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -39,6 +39,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {

    bool invalid_param = false;
    std::string arg;
+    gpt_params default_params;
+
    for (int i = 1; i < argc; i++) {
        arg = argv[i];

@@ -173,7 +175,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
            }
            params.n_parts = std::stoi(argv[i]);
        } else if (arg == "-h" || arg == "--help") {
-            gpt_print_usage(argc, argv, params);
+            gpt_print_usage(argc, argv, default_params);
            exit(0);
        } else if (arg == "--random-prompt") {
            params.random_prompt = true;
@@ -185,13 +187,13 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
            params.input_prefix = argv[i];
        } else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            gpt_print_usage(argc, argv, params);
+            gpt_print_usage(argc, argv, default_params);
            exit(1);
        }
    }
    if (invalid_param) {
        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
-        gpt_print_usage(argc, argv, params);
+        gpt_print_usage(argc, argv, default_params);
        exit(1);
    }

--- a/examples/gpt4all.sh
+++ b/examples/gpt4all.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+#
+# Temporary script - will be removed in the future
+#
+
+cd `dirname $0`
+cd ..
+
+./main --color --instruct --threads 4 \
+       --model ./models/gpt4all-7B/gpt4all-lora-quantized.bin \
+       --file ./prompts/alpaca.txt \
+       --batch_size 8 --ctx_size 2048 \
+       --repeat_last_n 64 --repeat_penalty 1.3 \
+       --n_predict 128 --temp 0.1 --top_k 40 --top_p 0.95
--- a/ggml.c
+++ b/ggml.c
--- a/ggml.h
+++ b/ggml.h
@@ -258,11 +258,11 @@ struct ggml_tensor {
    enum ggml_type type;

    int    n_dims;
-    int    ne[GGML_MAX_DIMS]; // number of elements
-    size_t nb[GGML_MAX_DIMS]; // stride in bytes:
-                              // nb[0] = sizeof(type)
-                              // nb[1] = nb[0]   * ne[0] + padding
-                              // nb[i] = nb[i-1] * ne[i-1]
+    int64_t ne[GGML_MAX_DIMS]; // number of elements
+    size_t  nb[GGML_MAX_DIMS]; // stride in bytes:
+                               // nb[0] = sizeof(type)
+                               // nb[1] = nb[0]   * ne[0] + padding
+                               // nb[i] = nb[i-1] * ne[i-1]

    // compute data
    enum ggml_op op;
@@ -328,8 +328,8 @@ int64_t ggml_cycles_per_ms(void);
 void ggml_print_object (const struct ggml_object * obj);
 void ggml_print_objects(const struct ggml_context * ctx);

-int    ggml_nelements(const struct ggml_tensor * tensor);
-size_t ggml_nbytes   (const struct ggml_tensor * tensor);
+int64_t ggml_nelements(const struct ggml_tensor * tensor);
+size_t  ggml_nbytes   (const struct ggml_tensor * tensor);

 int    ggml_blck_size (enum ggml_type type);
 size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
@@ -355,33 +355,33 @@ struct ggml_tensor * ggml_new_tensor(
        struct ggml_context * ctx,
        enum   ggml_type type,
        int    n_dims,
-        const int *ne);
+        const int64_t *ne);

 struct ggml_tensor * ggml_new_tensor_1d(
        struct ggml_context * ctx,
        enum   ggml_type type,
-        int    ne0);
+        int64_t ne0);

 struct ggml_tensor * ggml_new_tensor_2d(
        struct ggml_context * ctx,
        enum   ggml_type type,
-        int    ne0,
-        int    ne1);
+        int64_t ne0,
+        int64_t ne1);

 struct ggml_tensor * ggml_new_tensor_3d(
        struct ggml_context * ctx,
        enum   ggml_type type,
-        int    ne0,
-        int    ne1,
-        int    ne2);
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2);

 struct ggml_tensor * ggml_new_tensor_4d(
        struct ggml_context * ctx,
        enum   ggml_type type,
-        int    ne0,
-        int    ne1,
-        int    ne2,
-        int    ne3);
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2,
+        int64_t ne3);

 struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
 struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
@@ -531,30 +531,30 @@ struct ggml_tensor * ggml_reshape(
 struct ggml_tensor * ggml_reshape_2d(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
-        int                   ne0,
-        int                   ne1);
+        int64_t               ne0,
+        int64_t               ne1);

 // return view(a)
 // TODO: when we start computing gradient, make a copy instead of view
 struct ggml_tensor * ggml_reshape_3d(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
-        int                   ne0,
-        int                   ne1,
-        int                   ne2);
+        int64_t               ne0,
+        int64_t               ne1,
+        int64_t               ne2);

 // offset in bytes
 struct ggml_tensor * ggml_view_1d(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
-        int                   ne0,
+        int64_t               ne0,
        size_t                offset);

 struct ggml_tensor * ggml_view_2d(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
-        int                   ne0,
-        int                   ne1,
+        int64_t               ne0,
+        int64_t               ne1,
        size_t                nb1, // row stride in bytes
        size_t                offset);

--- a/llama.cpp
+++ b/llama.cpp
@@ -256,8 +256,8 @@ static bool kv_cache_init(
    const int n_embd  = hparams.n_embd;
    const int n_layer = hparams.n_layer;

-    const int n_mem      = n_layer*n_ctx;
-    const int n_elements = n_embd*n_mem;
+    const int64_t n_mem      = (int64_t)n_layer*n_ctx;
+    const int64_t n_elements = n_embd*n_mem;

    cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);

@@ -679,7 +679,7 @@ static bool llama_model_load(
                return false;
            }
            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
-                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
+                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%" PRId64 ", %" PRId64 "], expected [%d, %d]\n",
                        __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
                return false;
            }
@@ -1194,6 +1194,20 @@ static llama_vocab::id llama_sample_top_p_top_k(
    const auto & logits = lctx.logits;
    const auto * plogits = logits.data() + logits.size() - n_logits;

+    if (temp <= 0) {
+        // select the token with the highest logit directly
+        float max_logit = plogits[0];
+        llama_vocab::id max_id = 0;
+
+        for (int i = 1; i < n_logits; ++i) {
+            if (plogits[i] > max_logit) {
+                max_logit = plogits[i];
+                max_id = i;
+            }
+        }
+        return max_id;
+    }
+
    std::vector<std::pair<float, llama_vocab::id>> logits_id;
    logits_id.reserve(n_logits);

@@ -1608,7 +1622,7 @@ struct llama_context * llama_init_from_file(
    }

    // reserve memory for context buffers
-    {
+    if (!params.vocab_only) {
        if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
            fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
            llama_free(ctx);
@@ -1668,6 +1682,33 @@ int llama_model_quantize(
    return 0;
 }

+// Returns the KV cache that will contain the context for the
+// ongoing prediction with the model.
+const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
+    return ctx->model.kv_self.buf.data();
+}
+
+// Returns the size of the KV cache
+size_t llama_get_kv_cache_size(struct llama_context * ctx) {
+    return ctx->model.kv_self.buf.size();
+}
+
+int llama_get_kv_cache_token_count(struct llama_context * ctx) {
+    return ctx->model.kv_self.n;
+}
+
+// Sets the KV cache containing the current context for the model
+void llama_set_kv_cache(
+        struct llama_context * ctx,
+               const uint8_t * kv_cache,
+                      size_t   n_size,
+                         int   n_token_count) {
+    // Make sure we have the same kv cache setup
+    LLAMA_ASSERT(ctx->model.kv_self.buf.size() == n_size);
+    memcpy(ctx->model.kv_self.buf.data(), kv_cache, n_size);
+    ctx->model.kv_self.n = n_token_count;
+}
+
 int llama_eval(
        struct llama_context * ctx,
           const llama_token * tokens,
--- a/llama.h
+++ b/llama.h
@@ -83,6 +83,23 @@ extern "C" {
            const char * fname_out,
                   int   itype);

+    // Returns the KV cache that will contain the context for the
+    // ongoing prediction with the model.
+    LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);
+
+    // Returns the size of the KV cache
+    LLAMA_API size_t llama_get_kv_cache_size(struct llama_context * ctx);
+
+    // Returns the number of tokens in the KV cache
+    LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);
+
+    // Sets the KV cache containing the current context for the model
+    LLAMA_API void llama_set_kv_cache(
+            struct llama_context * ctx,
+                   const uint8_t * kv_cache,
+                          size_t   n_size,
+                             int   n_token_count);
+
    // Run the llama inference to obtain the logits and probabilities for the next token.
    // tokens + n_tokens is the provided batch of new tokens to process
    // n_past is the number of tokens to use from previous eval calls
Author	SHA1	Message	Date
Ivan Stepanov	cd7fa95690	Define non-positive temperature behavior (#720 )	2023-04-03 02:19:04 +02:00
bsilvereagle	a0c0516416	Remove torch GPU dependencies from the Docker.full image (#665 ) By using `pip install torch --index-url https://download.pytorch.org/whl/cpu` instead of `pip install torch` we can specify we want to install a CPU-only version of PyTorch without any GPU dependencies. This reduces the size of the Docker image from 7.32 GB to 1.62 GB	2023-04-03 00:13:03 +02:00
Thatcher Chamberlin	d8d4e865cd	Add a missing step to the gpt4all instructions (#690 ) `migrate-ggml-2023-03-30-pr613.py` is needed to get gpt4all running.	2023-04-02 12:48:57 +02:00
Christian Falch	e986f94829	Added api for getting/setting the kv_cache (#685 ) The api provides access methods for retrieving the current memory buffer for the kv_cache and its token number. It also contains a method for setting the kv_cache from a memory buffer. This makes it possible to load/save history - maybe support --cache-prompt paramater as well? Co-authored-by: Pavol Rusnak <pavol@rusnak.io>	2023-04-02 12:23:04 +02:00
Marian Cepok	c0bb1d3ce2	ggml : change ne to int64_t (#626 )	2023-04-02 13:21:31 +03:00
Leonardo Neumann	6e7801d08d	examples : add gpt4all script (#658 )	2023-04-02 10:56:20 +03:00
Stephan Walter	81040f10aa	llama : do not allocate KV cache for "vocab_only == true" (#682 ) Fixes sanitizer CI	2023-04-02 10:18:53 +03:00
Fabian	c4f89d8d73	make : use -march=native -mtune=native on x86 (#609 )	2023-04-02 10:17:05 +03:00
Murilo Santana	5b70e7de4c	fix default params for examples/main (#697 )	2023-04-02 04:41:12 +02:00
Ikko Eltociear Ashimine	a717cba844	py: huggingface -> Hugging Face (#686 )	2023-04-01 18:38:18 +02:00
rimoliga	d0a7f742e7	readme: replace termux links with homepage, play store is deprecated (#680 )	2023-04-01 16:57:30 +02:00