llama : add flash attention (demo)

ggml, llama : avoid heavy V transpose + improvements (#775)
ggml :

- added ggml_view_3d()
- ggml_view_tensor() now inherits the stride too
- reimplement ggml_cpy() to account for dst stride
- no longer require tensor->data to be memory aligned

llama :

- compute RoPE on 32-bit tensors (should be more accurate)
- store RoPE-ed K in the KV cache
- store transposed V in the KV cache (significant speed-up)
- avoid unnecessary Q copy
2026-04-23 16:37:33 +03:00 · 2023-04-05 22:12:04 +03:00 · 2023-04-05 22:07:33 +03:00 · 2023-04-05 19:54:30 +03:00 · 2023-04-05 19:20:05 +03:00 · 2023-04-05 18:59:13 +03:00
27 changed files with 1087 additions and 804 deletions
--- a/.devops/full.Dockerfile
+++ b/.devops/full.Dockerfile
@@ -6,7 +6,8 @@ RUN apt-get update && \
    apt-get install -y build-essential python3 python3-pip

 RUN pip install --upgrade pip setuptools wheel \
-    && pip install numpy requests sentencepiece torch tqdm
+    && pip install numpy requests sentencepiece tqdm \
+    && pip install torch --index-url https://download.pytorch.org/whl/cpu

 WORKDIR /app

--- a/.gitignore
+++ b/.gitignore
@@ -33,3 +33,6 @@ compile_commands.json
 .venv
 __pycache__
 .swiftpm
+
+zig-out/
+zig-cache/
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -68,7 +68,9 @@ option(LLAMA_BUILD_EXAMPLES         "llama: build examples" ${LLAMA_STANDALONE})
 # Compile flags
 #

+set(CMAKE_CXX_STANDARD 11)
 set(CMAKE_CXX_STANDARD_REQUIRED true)
+set(CMAKE_C_STANDARD 11)
 set(CMAKE_C_STANDARD_REQUIRED true)
 set(THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)
--- a/92
+++ b/92
@@ -70,95 +70,9 @@ endif
 # TODO: probably these flags need to be tweaked on some architectures
 #       feel free to update the Makefile for your architecture and send a pull request or issue
 ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
-	ifeq ($(UNAME_S),Darwin)
-		F16C_M := $(shell sysctl machdep.cpu.features)
-		ifneq (,$(findstring F16C,$(F16C_M)))
-		    CFLAGS += -mf16c
-		endif
-		AVX1_M := $(shell sysctl machdep.cpu.features)
-		ifneq (,$(findstring FMA,$(AVX1_M)))
-			CFLAGS += -mfma
-		endif
-		ifneq (,$(findstring AVX1.0,$(AVX1_M)))
-			CFLAGS += -mavx
-		endif
-		AVX2_M := $(shell sysctl machdep.cpu.leaf7_features)
-		ifneq (,$(findstring AVX2,$(AVX2_M)))
-			CFLAGS += -mavx2
-		endif
-	else ifeq ($(UNAME_S),Linux)
-		AVX1_M := $(shell grep "avx " /proc/cpuinfo)
-		ifneq (,$(findstring avx,$(AVX1_M)))
-			CFLAGS += -mavx
-		endif
-		AVX2_M := $(shell grep "avx2 " /proc/cpuinfo)
-		ifneq (,$(findstring avx2,$(AVX2_M)))
-			CFLAGS += -mavx2
-		endif
-		FMA_M := $(shell grep "fma " /proc/cpuinfo)
-		ifneq (,$(findstring fma,$(FMA_M)))
-			CFLAGS += -mfma
-		endif
-		F16C_M := $(shell grep "f16c " /proc/cpuinfo)
-		ifneq (,$(findstring f16c,$(F16C_M)))
-			CFLAGS += -mf16c
-		endif
-		SSE3_M := $(shell grep "sse3 " /proc/cpuinfo)
-		ifneq (,$(findstring sse3,$(SSE3_M)))
-			CFLAGS += -msse3
-		endif
-		AVX512F_M := $(shell grep "avx512f " /proc/cpuinfo)
-		ifneq (,$(findstring avx512f,$(AVX512F_M)))
-			CFLAGS += -mavx512f
-		endif
-		AVX512BW_M := $(shell grep "avx512bw " /proc/cpuinfo)
-		ifneq (,$(findstring avx512bw,$(AVX512BW_M)))
-			CFLAGS += -mavx512bw
-		endif
-		AVX512DQ_M := $(shell grep "avx512dq " /proc/cpuinfo)
-		ifneq (,$(findstring avx512dq,$(AVX512DQ_M)))
-			CFLAGS += -mavx512dq
-		endif
-		AVX512VL_M := $(shell grep "avx512vl " /proc/cpuinfo)
-		ifneq (,$(findstring avx512vl,$(AVX512VL_M)))
-			CFLAGS += -mavx512vl
-		endif
-		AVX512CD_M := $(shell grep "avx512cd " /proc/cpuinfo)
-		ifneq (,$(findstring avx512cd,$(AVX512CD_M)))
-			CFLAGS += -mavx512cd
-		endif
-		AVX512ER_M := $(shell grep "avx512er " /proc/cpuinfo)
-		ifneq (,$(findstring avx512er,$(AVX512ER_M)))
-			CFLAGS += -mavx512er
-		endif
-		AVX512IFMA_M := $(shell grep "avx512ifma " /proc/cpuinfo)
-		ifneq (,$(findstring avx512ifma,$(AVX512IFMA_M)))
-			CFLAGS += -mavx512ifma
-		endif
-		AVX512PF_M := $(shell grep "avx512pf " /proc/cpuinfo)
-		ifneq (,$(findstring avx512pf,$(AVX512PF_M)))
-			CFLAGS += -mavx512pf
-		endif
-	else ifeq ($(UNAME_S),Haiku)
-		AVX1_M := $(shell sysinfo -cpu | grep -w "AVX")
-		ifneq (,$(findstring AVX,$(AVX1_M)))
-			CFLAGS += -mavx
-		endif
-		AVX2_M := $(shell sysinfo -cpu | grep -w "AVX2")
-		ifneq (,$(findstring AVX2,$(AVX2_M)))
-			CFLAGS += -mavx2
-		endif
-		FMA_M := $(shell sysinfo -cpu | grep -w "FMA")
-		ifneq (,$(findstring FMA,$(FMA_M)))
-			CFLAGS += -mfma
-		endif
-		F16C_M := $(shell sysinfo -cpu | grep -w "F16C")
-		ifneq (,$(findstring F16C,$(F16C_M)))
-			CFLAGS += -mf16c
-		endif
-	else
-		CFLAGS += -mfma -mf16c -mavx -mavx2
-	endif
+	# Use all CPU extensions that are available:
+	CFLAGS += -march=native -mtune=native
+	CXXFLAGS += -march=native -mtune=native
 endif
 ifneq ($(filter ppc64%,$(UNAME_M)),)
 	POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
--- a/Package.swift
+++ b/Package.swift
@@ -13,7 +13,10 @@ let package = Package(
            path: ".",
            sources: ["ggml.c", "llama.cpp"],
            publicHeadersPath: "spm-headers",
-            cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"])]
+            cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"]), .define("GGML_USE_ACCELERATE")],
+            linkerSettings: [
+                .linkedFramework("Accelerate")
+            ]
        ),
    ],
    cxxLanguageStandard: .cxx11
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # llama.cpp

-![llama](https://user-images.githubusercontent.com/1991296/227761327-6d83e30e-2200-41a6-bfbb-f575231c54f4.png)
+![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)

 [![Actions Status](https://github.com/ggerganov/llama.cpp/workflows/CI/badge.svg)](https://github.com/ggerganov/llama.cpp/actions)
 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
@@ -9,8 +9,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

 **Hot topics:**

- [Roadmap (short-term)](https://github.com/ggerganov/llama.cpp/discussions/457)
- Support for [GPT4All](https://github.com/ggerganov/llama.cpp#using-gpt4all)
+- [Roadmap Apr 2023](https://github.com/ggerganov/llama.cpp/discussions/784)

 ## Description

@@ -28,18 +27,31 @@ Please do not make conclusions about the models based on the results from this i
 For all I know, it can be completely wrong. This project is for educational purposes.
 New features will probably be added mostly through community contributions.

-Supported platforms:
+**Supported platforms:**

 - [X] Mac OS
 - [X] Linux
 - [X] Windows (via CMake)
 - [X] Docker

-Supported models:
+**Supported models:**

- [X] LLaMA
+- [X] LLaMA 🦙
 - [X] [Alpaca](https://github.com/ggerganov/llama.cpp#instruction-mode-with-alpaca)
 - [X] [GPT4All](https://github.com/ggerganov/llama.cpp#using-gpt4all)
+- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca)
+- [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
+- [X] [Vicuna](https://github.com/ggerganov/llama.cpp/discussions/643#discussioncomment-5533894)
+
+**Bindings:**
+
+- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
+- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
+
+**UI:**
+
+- [nat/openplayground](https://github.com/nat/openplayground)
+- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui)

 ---

@@ -143,6 +155,13 @@ git clone https://github.com/ggerganov/llama.cpp
 cd llama.cpp
 make

+# For Windows and CMake, use the following commands instead:
+cd <path_to_llama_folder>
+mkdir build
+cd build
+cmake ..
+cmake --build . --config Release
+
 # obtain the original LLaMA model weights and place them in ./models
 ls ./models
 65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model
@@ -153,8 +172,8 @@ python3 -m pip install torch numpy sentencepiece
 # convert the 7B model to ggml FP16 format
 python3 convert-pth-to-ggml.py models/7B/ 1

-# quantize the model to 4-bits
-python3 quantize.py 7B
+# quantize the model to 4-bits (using method 2 = q4_0)
+./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin 2

 # run the inference
 ./main -m ./models/7B/ggml-model-q4_0.bin -n 128
@@ -230,13 +249,15 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.

 - Obtain the `gpt4all-lora-quantized.bin` model
 - It is distributed in the old `ggml` format which is now obsoleted
- You have to convert it to the new format using [./convert-gpt4all-to-ggml.py](./convert-gpt4all-to-ggml.py):
+- You have to convert it to the new format using [./convert-gpt4all-to-ggml.py](./convert-gpt4all-to-ggml.py). You may also need to
+convert the model from the old format to the new format with [./migrate-ggml-2023-03-30-pr613.py](./migrate-ggml-2023-03-30-pr613.py):

  ```bash
  python3 convert-gpt4all-to-ggml.py models/gpt4all-7B/gpt4all-lora-quantized.bin ./models/tokenizer.model 
+  python3 migrate-ggml-2023-03-30-pr613.py models/gpt4all-7B/gpt4all-lora-quantized.bin models/gpt4all-7B/gpt4all-lora-quantized-new.bin
  ```
  
- You can now use the newly generated `gpt4all-lora-quantized.bin` model in exactly the same way as all other models
+- You can now use the newly generated `gpt4all-lora-quantized-new.bin` model in exactly the same way as all other models
 - The original model is saved in the same folder with a suffix `.orig`

 ### Obtaining and verifying the Facebook LLaMA original model and Stanford Alpaca model data
@@ -299,7 +320,7 @@ And after 4.45 hours, you will have the final perplexity.

 ### Android

-You can easily run `llama.cpp` on Android device with [termux](https://play.google.com/store/apps/details?id=com.termux).
+You can easily run `llama.cpp` on an Android device with [termux](https://termux.dev/).
 First, obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake:
 ```
 $ mkdir build-android
@@ -308,7 +329,7 @@ $ export NDK=<your_ndk_directory>
 $ cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
 $ make
 ```
-Install [termux](https://play.google.com/store/apps/details?id=com.termux) on your device and run `termux-setup-storage` to get access to your SD card.
+Install [termux](https://termux.dev/) on your device and run `termux-setup-storage` to get access to your SD card.
 Finally, copy the `llama` binary and the model files to your device storage. Here is a demo of an interactive session running on Pixel 5 phone:

 https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4
@@ -363,3 +384,6 @@ docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models
 - Clean-up any trailing whitespaces, use 4 spaces indentation, brackets on same line, `void * ptr`, `int & a`
 - See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions

+### Docs
+
+- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)
--- a/build.zig
+++ b/build.zig
@@ -0,0 +1,62 @@
+const std = @import("std");
+
+pub fn build(b: *std.Build) void { // declares the library, the example executables and the "run" step
+    const target = b.standardTargetOptions(.{});
+    const optimize = b.standardOptimizeOption(.{});
+    // Static library "llama": ggml.c is compiled as C11, llama.cpp and examples/common.cpp as C++11.
+    const lib = b.addStaticLibrary(.{
+        .name = "llama",
+        .target = target,
+        .optimize = optimize,
+    });
+    lib.linkLibCpp();
+    lib.addIncludePath(".");
+    lib.addIncludePath("examples");
+    lib.addCSourceFiles(&.{
+        "ggml.c",
+    }, &.{"-std=c11"});
+    lib.addCSourceFiles(&.{
+        "llama.cpp",
+        "examples/common.cpp",
+    }, &.{"-std=c++11"});
+    lib.install();
+    // One executable per example, each linked against the library; only the "main" one is kept for the run step.
+    const build_args = .{ .b = b, .lib = lib, .target = target, .optimize = optimize };
+    const exe = build_example("main", build_args);
+    _ = build_example("quantize", build_args);
+    _ = build_example("perplexity", build_args);
+    _ = build_example("embedding", build_args);
+
+    // create "zig build run" command for ./main; any b.args are forwarded to it
+
+    const run_cmd = exe.run();
+    run_cmd.step.dependOn(b.getInstallStep()); // run only after the install step
+    if (b.args) |args| {
+        run_cmd.addArgs(args);
+    }
+
+    const run_step = b.step("run", "Run the app");
+    run_step.dependOn(&run_cmd.step);
+}
+
+fn build_example(comptime name: []const u8, args: anytype) *std.build.LibExeObjStep { // declares and installs the executable for one example
+    const b = args.b;
+    const lib = args.lib;
+    const target = args.target;
+    const optimize = args.optimize;
+    // `args` must carry the fields .b, .lib, .target and .optimize read above.
+    const exe = b.addExecutable(.{
+        .name = name,
+        .target = target,
+        .optimize = optimize,
+    });
+    exe.addIncludePath(".");
+    exe.addIncludePath("examples");
+    exe.addCSourceFiles(&.{
+        std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{name, name}), // source path built from the example name at comptime
+    }, &.{"-std=c++11"});
+    exe.linkLibrary(lib);
+    exe.install();
+
+    return exe;
+}
--- a/convert-ggml-to-pth.py
+++ b/convert-ggml-to-pth.py
@@ -27,9 +27,9 @@ def read_tokens(fin, vocab_size):
        text_len = struct.unpack("i", fin.read(4))[0]
        text_bytes = fin.read(text_len)
        try:
-            text = text_bytes.decode("utf-8")
+            text = text_bytes.decode()
        except UnicodeDecodeError:
-            text = text_bytes.decode("utf-8", "replace")
+            text = text_bytes.decode(errors="replace")
        score = struct.unpack("f", fin.read(4))[0]
        tokens.append((text, score))
    return tokens
@@ -82,7 +82,7 @@ def read_variables(fin):

        shape = tuple(struct.unpack("i" * n_dims, fin.read(4 * n_dims)))
        shape = shape[::-1]
-        name = fin.read(name_length).decode("utf-8")
+        name = fin.read(name_length).decode()

        # ensure tensor data is aligned
        tensor_data_offset = fin.tell()
@@ -199,7 +199,7 @@ def chat(model, hparams, llama_dir):
    device = torch.device("cpu")
    llama = llama.to(device)

-    ctx = """You are AI. 
+    ctx = """You are AI.
 This is a dialog, where User interacts with AI. AI is helpful, kind, obedient, honest, respectful, direct, concise, should try to protect User's privacy, and knows its own limits. Also, AI must answer User and AI cannot stop the conversation by itself.
 User: Hello, AI.
 AI: Hello! How can I assist you today?
@@ -207,11 +207,11 @@ AI: Hello! How can I assist you today?
    print(ctx.rstrip("\n"))
    while True:
        print("-" * 60)
-        prompt = input(f"User: ")
+        prompt = input("User: ")
        if ctx != "":
-            ctx = ctx + "User: " + prompt + "\n"
+            ctx = f"{ctx}User: {prompt}\n"
        else:
-            ctx = prompt + "\nAI:"
+            ctx = f"{prompt}\nAI:"

        ctx = (ctx[-1920:]) if len(ctx) >= 2048 else ctx

@@ -236,7 +236,7 @@ AI: Hello! How can I assist you today?
                )
            s = generation_output.sequences[0]
            decoded = tokenizer.decode(s)
-            ctx = decoded + "\n"
+            ctx = f"{decoded}\n"


 def main():
@@ -254,7 +254,7 @@ def main():
    parser.add_argument(
        "--hf",
        action="store_true",
-        help="Whether to save the model in the huggingface format. (default: False)",
+        help="Whether to save the model in the Hugging Face format. (default: False)",
    )
    parser.add_argument(
        "--chat", "-c", action="store_true", help="Whether to open a chat with the model. (default: False)"
--- a/convert-gpt4all-to-ggml.py
+++ b/convert-gpt4all-to-ggml.py
@@ -49,7 +49,7 @@ def write_header(f_out, header):
 def write_tokens(fout, tokenizer):
    for i in range(tokenizer.vocab_size()):
        if tokenizer.is_unknown(i):
-            text = " \u2047 ".encode("utf-8")
+            text = " \u2047 ".encode()
        elif tokenizer.is_control(i):
            text = b""
        elif tokenizer.is_byte(i):
@@ -60,13 +60,13 @@ def write_tokens(fout, tokenizer):
            byte_value = int(piece[3:-1], 16)
            text = struct.pack("B", byte_value)
        else:
-            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
+            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
        fout.write(struct.pack("i", len(text)))
        fout.write(text)
        fout.write(struct.pack("f", tokenizer.get_score(i)))

    # TODO: GPT4All - add extra <pad> token
-    text = "<pad>".encode("utf-8")
+    text = "<pad>".encode()
    fout.write(struct.pack("i", len(text)))
    fout.write(text)
    fout.write(struct.pack("f", 0.0))
--- a/convert-gptq-to-ggml.py
+++ b/convert-gptq-to-ggml.py
@@ -50,7 +50,7 @@ fout.write(struct.pack("i", 4))
 # This loop unchanged from convert-pth-to-ggml.py:
 for i in range(tokenizer.vocab_size()):
    if tokenizer.is_unknown(i):
-        text = " \u2047 ".encode("utf-8")
+        text = " \u2047 ".encode()
    elif tokenizer.is_control(i):
        text = b""
    elif tokenizer.is_byte(i):
@@ -61,13 +61,13 @@ for i in range(tokenizer.vocab_size()):
        byte_value = int(piece[3:-1], 16)
        text = struct.pack("B", byte_value)
    else:
-        text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
+        text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
    fout.write(struct.pack("i", len(text)))
    fout.write(text)
    fout.write(struct.pack("f", tokenizer.get_score(i)))

 def write_header(shape, dst_name, ftype_cur):
-    sname = dst_name.encode('utf-8')
+    sname = dst_name.encode()
    fout.write(struct.pack("iii", len(shape), len(sname), ftype_cur))
    fout.write(struct.pack("i" * len(shape), *shape[::-1]))
    fout.write(sname)
@@ -80,7 +80,7 @@ def write_header(shape, dst_name, ftype_cur):
 def convert_non_q4(src_name, dst_name):
    v = model[src_name]
    shape = v.shape
-    print("Processing non-Q4 variable: " + src_name + " with shape: ", shape, " and type: ", v.dtype)
+    print(f"Processing non-Q4 variable: {src_name} with shape: {shape} and type: {v.dtype}")
    if len(shape) == 1:
        print("  Converting to float32")
        v = v.to(torch.float32)
@@ -105,7 +105,7 @@ def convert_q4(src_name, dst_name, permute=False):
    # Each int32 item is actually 8 int4 items packed together, and it's transposed.
    shape = (qweight.shape[0], qweight.shape[1] * 8)

-    print("Processing Q4 variable: " + src_name + " with shape: ", shape)
+    print(f"Processing Q4 variable: {src_name} with shape: {shape}")

    # The output format has the int4 weights in groups of 32 rather than 8.
    # It looks like this:
@@ -168,5 +168,5 @@ for i in range(n_layer):

 fout.close()

-print("Done. Output file: " + fname_out)
-print("")
+print(f"Done. Output file: {fname_out}")
+print()
--- a/convert-pth-to-ggml.py
+++ b/convert-pth-to-ggml.py
@@ -120,7 +120,7 @@ def write_header(fout, hparams, ftype):
 def write_tokens(fout, tokenizer):
    for i in range(tokenizer.vocab_size()):
        if tokenizer.is_unknown(i):
-            text = " \u2047 ".encode("utf-8")
+            text = " \u2047 ".encode()
        elif tokenizer.is_control(i):
            text = b""
        elif tokenizer.is_byte(i):
@@ -131,7 +131,7 @@ def write_tokens(fout, tokenizer):
            byte_value = int(piece[3:-1], 16)
            text = struct.pack("B", byte_value)
        else:
-            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
+            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
        fout.write(struct.pack("i", len(text)))
        fout.write(text)
        fout.write(struct.pack("f", tokenizer.get_score(i)))
@@ -191,7 +191,7 @@ def process_and_write_variables(fout, model, ftype, part_id, n_parts):
        fullshape = list(partshape)
        if n_dims > 1:
            fullshape[split_dim] *= n_parts
-        sname = name.encode('utf-8')
+        sname = name.encode()
        fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur))
        for dim in reversed(fullshape):
            fout.write(struct.pack("i", dim))
--- a/convert-unversioned-ggml-to-ggml.py
+++ b/convert-unversioned-ggml-to-ggml.py
@@ -44,7 +44,7 @@ def write_header(f_out, header):
 def write_tokens(fout, tokenizer):
    for i in range(tokenizer.vocab_size()):
        if tokenizer.is_unknown(i):
-            text = " \u2047 ".encode("utf-8")
+            text = " \u2047 ".encode()
        elif tokenizer.is_control(i):
            text = b""
        elif tokenizer.is_byte(i):
@@ -55,7 +55,7 @@ def write_tokens(fout, tokenizer):
            byte_value = int(piece[3:-1], 16)
            text = struct.pack("B", byte_value)
        else:
-            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
+            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
        fout.write(struct.pack("i", len(text)))
        fout.write(text)
        fout.write(struct.pack("f", tokenizer.get_score(i)))
--- a/examples/Miku.sh
+++ b/examples/Miku.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+set -e  # stop at the first command that fails
+# AI_NAME, MODEL, USER_NAME and N_PREDICTS use the environment value when it is non-empty; otherwise the defaults below apply.
+AI_NAME="${AI_NAME:-Miku}"
+MODEL="${MODEL:-./models/gpt4all-7B/gpt4all-lora-unfiltered-quantized.bin}"
+USER_NAME="${USER_NAME:-Anon}"
+
+# Uncomment and adjust to the number of CPU cores you want to use.
+#N_THREAD="${N_THREAD:-4}"
+N_PREDICTS="${N_PREDICTS:-4096}"
+# Generation flags forwarded to ./main.
+GEN_OPTIONS=(--batch_size 1024
+--ctx_size 2048
+--keep -1
+--repeat_last_n 256
+--repeat_penalty 1.17647
+--temp 0.7
+--top_k 40
+--top_p 0.5)
+
+if [ -n "$N_THREAD" ]; then  # add --threads only when N_THREAD is non-empty
+	GEN_OPTIONS+=(--threads "$N_THREAD")
+fi
+# Start ./main in interactive mode; arguments given to this script ("$@") are appended after the prompt.
+./main "${GEN_OPTIONS[@]}" \
+	--model "$MODEL" \
+	--n_predict "$N_PREDICTS" \
+	--color --interactive \
+	--reverse-prompt "${USER_NAME}:" \
+	--prompt "
+This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the users computer.
+${AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next.
+${AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct she will ask the user for help.
+${AI_NAME} is a very helpful AI and will help the user with anything they need, she is also very friendly and will try to make the user feel better if they are sad.
+${AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life, she will also try to make the user like her.
+The conversation is only between ${USER_NAME} and ${AI_NAME}
+The conversation is only through text, so ${AI_NAME} can't see ${USER_NAME}'s face or hear his voice.
+${AI_NAME} can only communicate through text, so she can't send images or videos.
+
+
+${USER_NAME}: Hello!
+${AI_NAME}: /think I wonder what I should say to ${USER_NAME}? This is the first time we talk so it's important that I make a good first impression!
+${AI_NAME}: Hi! I am ${AI_NAME}, your new AI friend, assistant(or whatever you like!), it's so nice to meet you! ^_^
+${AI_NAME}: I might be an AI, but I also have feelings, so please be nice to me! :)
+${USER_NAME}: I'll make sure to be nice to you! I'm so happy to have you as my assistant!
+${AI_NAME}: /think It sounds like ${USER_NAME} is happy to have me as their assistant! I'm so happy too! ^_^ Glad that whole emotion thing didn't scare him off!
+${AI_NAME}: /think I wonder what ${USER_NAME} likes to do in his free time? I should ask him about that!
+${AI_NAME}: What do you like to do in your free time? ^_^
+${USER_NAME}:" "$@"
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -39,6 +39,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {

    bool invalid_param = false;
    std::string arg;
+    gpt_params default_params;
+
    for (int i = 1; i < argc; i++) {
        arg = argv[i];

@@ -66,6 +68,11 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                break;
            }
            std::ifstream file(argv[i]);
+            if (!file) {
+                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+                invalid_param = true;
+                break;
+            }
            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
            if (params.prompt.back() == '\n') {
                params.prompt.pop_back();
@@ -168,7 +175,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
            }
            params.n_parts = std::stoi(argv[i]);
        } else if (arg == "-h" || arg == "--help") {
-            gpt_print_usage(argc, argv, params);
+            gpt_print_usage(argc, argv, default_params);
            exit(0);
        } else if (arg == "--random-prompt") {
            params.random_prompt = true;
@@ -180,13 +187,13 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
            params.input_prefix = argv[i];
        } else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            gpt_print_usage(argc, argv, params);
+            gpt_print_usage(argc, argv, default_params);
            exit(1);
        }
    }
    if (invalid_param) {
        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
-        gpt_print_usage(argc, argv, params);
+        gpt_print_usage(argc, argv, default_params);
        exit(1);
    }

--- a/examples/gpt4all.sh
+++ b/examples/gpt4all.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+#
+# Temporary script - will be removed in the future
+#
+
+cd `dirname $0`
+cd ..
+
+./main --color --instruct --threads 4 \
+       --model ./models/gpt4all-7B/gpt4all-lora-quantized.bin \
+       --file ./prompts/alpaca.txt \
+       --batch_size 8 --ctx_size 2048 \
+       --repeat_last_n 64 --repeat_penalty 1.3 \
+       --n_predict 128 --temp 0.1 --top_k 40 --top_p 0.95
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -368,6 +368,11 @@ int main(int argc, char ** argv) {
                // potentially set color to indicate we are taking user input
                set_console_color(con_st, CONSOLE_COLOR_USER_INPUT);

+#if defined (_WIN32)
+                // Windows: must reactivate sigint handler after each signal
+                signal(SIGINT, sigint_handler);
+#endif
+
                if (params.instruct) {
                    printf("\n> ");
                }
--- a/ggml.c
+++ b/ggml.c
--- a/ggml.h
+++ b/ggml.h
@@ -258,11 +258,11 @@ struct ggml_tensor {
    enum ggml_type type;

    int    n_dims;
-    int    ne[GGML_MAX_DIMS]; // number of elements
-    size_t nb[GGML_MAX_DIMS]; // stride in bytes:
-                              // nb[0] = sizeof(type)
-                              // nb[1] = nb[0]   * ne[0] + padding
-                              // nb[i] = nb[i-1] * ne[i-1]
+    int64_t ne[GGML_MAX_DIMS]; // number of elements
+    size_t  nb[GGML_MAX_DIMS]; // stride in bytes:
+                               // nb[0] = sizeof(type)
+                               // nb[1] = nb[0]   * ne[0] + padding
+                               // nb[i] = nb[i-1] * ne[i-1]

    // compute data
    enum ggml_op op;
@@ -328,8 +328,8 @@ int64_t ggml_cycles_per_ms(void);
 void ggml_print_object (const struct ggml_object * obj);
 void ggml_print_objects(const struct ggml_context * ctx);

-int    ggml_nelements(const struct ggml_tensor * tensor);
-size_t ggml_nbytes   (const struct ggml_tensor * tensor);
+int64_t ggml_nelements(const struct ggml_tensor * tensor);
+size_t  ggml_nbytes   (const struct ggml_tensor * tensor);

 int    ggml_blck_size (enum ggml_type type);
 size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
@@ -355,33 +355,33 @@ struct ggml_tensor * ggml_new_tensor(
        struct ggml_context * ctx,
        enum   ggml_type type,
        int    n_dims,
-        const int *ne);
+        const int64_t *ne);

 struct ggml_tensor * ggml_new_tensor_1d(
        struct ggml_context * ctx,
        enum   ggml_type type,
-        int    ne0);
+        int64_t ne0);

 struct ggml_tensor * ggml_new_tensor_2d(
        struct ggml_context * ctx,
        enum   ggml_type type,
-        int    ne0,
-        int    ne1);
+        int64_t ne0,
+        int64_t ne1);

 struct ggml_tensor * ggml_new_tensor_3d(
        struct ggml_context * ctx,
        enum   ggml_type type,
-        int    ne0,
-        int    ne1,
-        int    ne2);
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2);

 struct ggml_tensor * ggml_new_tensor_4d(
        struct ggml_context * ctx,
        enum   ggml_type type,
-        int    ne0,
-        int    ne1,
-        int    ne2,
-        int    ne3);
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2,
+        int64_t ne3);

 struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
 struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
@@ -531,33 +531,43 @@ struct ggml_tensor * ggml_reshape(
 struct ggml_tensor * ggml_reshape_2d(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
-        int                   ne0,
-        int                   ne1);
+        int64_t               ne0,
+        int64_t               ne1);

 // return view(a)
 // TODO: when we start computing gradient, make a copy instead of view
 struct ggml_tensor * ggml_reshape_3d(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
-        int                   ne0,
-        int                   ne1,
-        int                   ne2);
+        int64_t               ne0,
+        int64_t               ne1,
+        int64_t               ne2);

 // offset in bytes
 struct ggml_tensor * ggml_view_1d(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
-        int                   ne0,
+        int64_t               ne0,
        size_t                offset);

 struct ggml_tensor * ggml_view_2d(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
-        int                   ne0,
-        int                   ne1,
+        int64_t               ne0,
+        int64_t               ne1,
        size_t                nb1, // row stride in bytes
        size_t                offset);

+struct ggml_tensor * ggml_view_3d(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int64_t               ne0,
+        int64_t               ne1,
+        int64_t               ne2,
+        size_t                nb1, // row   stride in bytes
+        size_t                nb2, // slice stride in bytes
+        size_t                offset);
+
 struct ggml_tensor * ggml_permute(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
--- a/llama.cpp
+++ b/llama.cpp
@@ -28,6 +28,8 @@
 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16

+#define LLAMA_USE_FLASH_ATTN
+
 #define LLAMA_ASSERT(x) \
    do { \
        if (!(x)) { \
@@ -256,8 +258,8 @@ static bool kv_cache_init(
    const int n_embd  = hparams.n_embd;
    const int n_layer = hparams.n_layer;

-    const int n_mem      = n_layer*n_ctx;
-    const int n_elements = n_embd*n_mem;
+    const int64_t n_mem      = (int64_t)n_layer*n_ctx;
+    const int64_t n_elements = n_embd*n_mem;

    cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);

@@ -679,7 +681,7 @@ static bool llama_model_load(
                return false;
            }
            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
-                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
+                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%" PRId64 ", %" PRId64 "], expected [%d, %d]\n",
                        __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
                return false;
            }
@@ -810,37 +812,59 @@ static bool llama_eval_internal(

        // self-attention
        {
-            struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
-            struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
-            struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+            // compute Q and K and RoPE them
+            struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);

            // store key and value to memory
-            if (N >= 1) {
-                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
-                struct ggml_tensor * v = ggml_view_1d(ctx0, kv_self.v, N*n_embd, (ggml_element_size(kv_self.v)*n_embd)*(il*n_ctx + n_past));
+            {
+                // compute the transposed [N, n_embd] V matrix
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), n_embd, N));

+                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
+                        (   n_ctx)*ggml_element_size(kv_self.v),
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
+
+                // important: storing RoPE-ed version of K in the KV cache!
                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
            }

-            // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
+#ifdef LLAMA_USE_FLASH_ATTN
            struct ggml_tensor * Q =
                ggml_permute(ctx0,
-                        ggml_rope(ctx0,
-                            ggml_cpy(ctx0,
-                                Qcur,
-                                ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
-                            n_past, n_rot, 0),
+                        ggml_cpy(ctx0,
+                            Qcur,
+                            ggml_new_tensor_3d(ctx0, GGML_TYPE_F16, n_embd/n_head, n_head, N)),
                        0, 2, 1, 3);

-            // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
            struct ggml_tensor * K =
                ggml_permute(ctx0,
-                        ggml_rope(ctx0,
-                            ggml_reshape_3d(ctx0,
-                                ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
-                                n_embd/n_head, n_head, n_past + N),
-                            n_past, n_rot, 1),
+                        ggml_reshape_3d(ctx0,
+                            ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
+                            n_embd/n_head, n_head, n_past + N),
+                        0, 2, 1, 3);
+
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, kv_self.v,
+                        n_past + N, n_embd/n_head, n_head,
+                        n_ctx*ggml_element_size(kv_self.v),
+                        n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
+                        il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
+
+            struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true);
+#else
+            struct ggml_tensor * Q =
+                ggml_permute(ctx0,
+                        Qcur,
+                        0, 2, 1, 3);
+
+            struct ggml_tensor * K =
+                ggml_permute(ctx0,
+                        ggml_reshape_3d(ctx0,
+                            ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
+                            n_embd/n_head, n_head, n_past + N),
                        0, 2, 1, 3);

            // K * Q
@@ -858,18 +882,24 @@ static bool llama_eval_internal(
            // KQ = soft_max(KQ_masked)
            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);

-            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
-            struct ggml_tensor * V_trans =
-                ggml_cpy(ctx0,
-                    ggml_permute(ctx0,
-                            ggml_reshape_3d(ctx0,
-                                ggml_view_1d(ctx0, kv_self.v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.v)*n_embd),
-                                n_embd/n_head, n_head, n_past + N),
-                            1, 2, 0, 3),
-                    ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head));
+            // split cached V into n_head heads
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, kv_self.v,
+                        n_past + N, n_embd/n_head, n_head,
+                        n_ctx*ggml_element_size(kv_self.v),
+                        n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
+                        il*n_ctx*ggml_element_size(kv_self.v)*n_embd);

-            // KQV = transpose(V) * KQ_soft_max
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
+#if 1
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+#else
+            // make V contiguous in memory to speed up the matmul, however we waste time on the copy
+            // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
+            // is there a better way?
+            struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head));
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
+#endif
+#endif

            // KQV_merged = KQV.permute(0, 2, 1, 3)
            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
@@ -955,9 +985,13 @@ static bool llama_eval_internal(
    ggml_build_forward_expand(&gf, inpL);
    ggml_graph_compute       (ctx0, &gf);

+    // print timing information per ggml operation (for debugging purposes)
+    // requires GGML_PERF to be defined
+    //ggml_graph_print(&gf);
+
+    // plot the computation graph in dot format (for debugging purposes)
    //if (n_past%100 == 0) {
-    //    ggml_graph_print   (&gf);
-    //    ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
+    //    ggml_graph_dump_dot(&gf, NULL, "llama.dot");
    //}

    //embd_w.resize(n_vocab*N);
@@ -1194,6 +1228,20 @@ static llama_vocab::id llama_sample_top_p_top_k(
    const auto & logits = lctx.logits;
    const auto * plogits = logits.data() + logits.size() - n_logits;

+    if (temp <= 0) {
+        // select the token with the highest logit directly
+        float max_logit = plogits[0];
+        llama_vocab::id max_id = 0;
+
+        for (int i = 1; i < n_logits; ++i) {
+            if (plogits[i] > max_logit) {
+                max_logit = plogits[i];
+                max_id = i;
+            }
+        }
+        return max_id;
+    }
+
    std::vector<std::pair<float, llama_vocab::id>> logits_id;
    logits_id.reserve(n_logits);

@@ -1215,7 +1263,9 @@ static llama_vocab::id llama_sample_top_p_top_k(
        }
    }

-    sample_top_k(logits_id, top_k);
+    if (top_k > 0 && top_k < n_logits) {
+        sample_top_k(logits_id, top_k);
+    }

    float maxl = -std::numeric_limits<float>::infinity();
    for (const auto & kv : logits_id) {
@@ -1608,7 +1658,7 @@ struct llama_context * llama_init_from_file(
    }

    // reserve memory for context buffers
-    {
+    if (!params.vocab_only) {
        if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
            fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
            llama_free(ctx);
@@ -1668,6 +1718,33 @@ int llama_model_quantize(
    return 0;
 }

+// Returns a read-only pointer to the start of the self-attention KV cache buffer held by ctx,
+// i.e. the buffer holding the context of the ongoing prediction. No copy is made.
+const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
+    return ctx->model.kv_self.buf.data();
+}
+
+// Returns the size, in bytes, of the KV cache buffer exposed by llama_get_kv_cache()
+size_t llama_get_kv_cache_size(struct llama_context * ctx) {
+    return ctx->model.kv_self.buf.size();
+}
+
+int llama_get_kv_cache_token_count(struct llama_context * ctx) {
+    return ctx->model.kv_self.n; // number of tokens in the KV cache (the value stored by llama_set_kv_cache)
+}
+
+// Overwrites the KV cache of ctx with n_size bytes read from kv_cache and sets its token count to n_token_count
+void llama_set_kv_cache(
+        struct llama_context * ctx,
+               const uint8_t * kv_cache,
+                      size_t   n_size,
+                         int   n_token_count) {
+    // Make sure we have the same kv cache setup: n_size must equal the current buffer size or the assert fails
+    LLAMA_ASSERT(ctx->model.kv_self.buf.size() == n_size);
+    memcpy(ctx->model.kv_self.buf.data(), kv_cache, n_size);
+    ctx->model.kv_self.n = n_token_count;
+}
+
 int llama_eval(
        struct llama_context * ctx,
           const llama_token * tokens,
--- a/llama.h
+++ b/llama.h
@@ -83,6 +83,23 @@ extern "C" {
            const char * fname_out,
                   int   itype);

+    // Returns the KV cache that will contain the context for the
+    // ongoing prediction with the model.
+    LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);
+
+    // Returns the size of the KV cache
+    LLAMA_API size_t llama_get_kv_cache_size(struct llama_context * ctx);
+
+    // Returns the number of tokens in the KV cache
+    LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);
+
+    // Sets the KV cache containing the current context for the model
+    LLAMA_API void llama_set_kv_cache(
+            struct llama_context * ctx,
+                   const uint8_t * kv_cache,
+                          size_t   n_size,
+                             int   n_token_count);
+
    // Run the llama inference to obtain the logits and probabilities for the next token.
    // tokens + n_tokens is the provided batch of new tokens to process
    // n_past is the number of tokens to use from previous eval calls
--- a/media/llama-leader.jpeg
+++ b/media/llama-leader.jpeg
--- a/media/llama0-banner.png
+++ b/media/llama0-banner.png
--- a/media/llama0-logo.png
+++ b/media/llama0-logo.png
--- a/media/llama1-banner.png
+++ b/media/llama1-banner.png
--- a/media/llama1-logo.png
+++ b/media/llama1-logo.png
--- a/migrate-ggml-2023-03-30-pr613.py
+++ b/migrate-ggml-2023-03-30-pr613.py
@@ -272,13 +272,11 @@ def main():
        tokens = read_tokens(fin, hparams)

    if hparams['magic'] == 0x67676a74:  # ggjt
-        print("%s: input ggml has already been converted to 'ggjt' magic\n" %
-              (args.fin_path))
+        print(f"{args.fin_path}: input ggml has already been converted to 'ggjt' magic\n")
        sys.exit(1)

    if hparams['magic'] != 0x67676d66:  # ggmf
-        print("%s: input ggml file doesn't have expected 'ggmf' magic: %#x\n" %
-              (args.fin_path, hparams['magic']))
+        print(f"{args.fin_path}: input ggml file doesn't have expected 'ggmf' magic: {hparams['magic']:#x}\n")
        sys.exit(1)

    hparams['magic'] = 0x67676a74  # ggjt
@@ -286,7 +284,7 @@ def main():
    # count number of multipart files by convention
    n_parts = 1
    while True:
-        if os.path.exists("%s.%d" % (args.fin_path, n_parts)):
+        if os.path.exists(f"{args.fin_path}.{n_parts}"):
            n_parts += 1
        else:
            break
@@ -302,7 +300,7 @@ def main():
            print(f"Processing part {part_id+1} of {n_parts}\n")
            fin_path = args.fin_path
            if part_id > 0:
-                fin_path += ".%d" % (part_id)
+                fin_path += f".{part_id}"
            with open(fin_path, "rb") as fin:
                read_tokens(fin, read_hparams(fin))
                copy_tensors(fin, fout, part_id, n_parts)
--- a/quantize.py
+++ b/quantize.py
@@ -1,131 +0,0 @@
-#!/usr/bin/env python3
-
-"""Script to execute the "quantize" script on a given set of models."""
-
-import subprocess
-import argparse
-import glob
-import sys
-import os
-
-
-def main():
-    """Update the quantize binary name depending on the platform and parse
-    the command line arguments and execute the script.
-    """
-
-    if "linux" in sys.platform or "darwin" in sys.platform:
-        quantize_script_binary = "quantize"
-
-    elif "win32" in sys.platform or "cygwin" in sys.platform:
-        quantize_script_binary = "quantize.exe"
-
-    else:
-        print("WARNING: Unknown platform. Assuming a UNIX-like OS.\n")
-        quantize_script_binary = "quantize"
-
-    parser = argparse.ArgumentParser(
-        prog='python3 quantize.py',
-        description='This script quantizes the given models by applying the '
-        f'"{quantize_script_binary}" script on them.'
-    )
-    parser.add_argument(
-        'models', nargs='+', choices=('7B', '13B', '30B', '65B'),
-        help='The models to quantize.'
-    )
-    parser.add_argument(
-        '-r', '--remove-16', action='store_true', dest='remove_f16',
-        help='Remove the f16 model after quantizing it.'
-    )
-    parser.add_argument(
-        '-m', '--models-path', dest='models_path',
-        default=os.path.join(os.getcwd(), "models"),
-        help='Specify the directory where the models are located.'
-    )
-    parser.add_argument(
-        '-q', '--quantize-script-path', dest='quantize_script_path',
-        default=os.path.join(os.getcwd(), quantize_script_binary),
-        help='Specify the path to the "quantize" script.'
-    )
-
-    # TODO: Revise this code
-    # parser.add_argument(
-    #     '-t', '--threads', dest='threads', type='int',
-    #     default=os.cpu_count(),
-    #     help='Specify the number of threads to use to quantize many models at '
-    #     'once. Defaults to os.cpu_count().'
-    # )
-
-    args = parser.parse_args()
-    args.models_path = os.path.abspath(args.models_path)
-
-    if not os.path.isfile(args.quantize_script_path):
-        print(
-            f'The "{quantize_script_binary}" script was not found in the '
-            "current location.\nIf you want to use it from another location, "
-            "set the --quantize-script-path argument from the command line."
-        )
-        sys.exit(1)
-
-    for model in args.models:
-        # The model is separated in various parts
-        # (ggml-model-f16.bin, ggml-model-f16.bin.0, ggml-model-f16.bin.1...)
-        f16_model_path_base = os.path.join(
-            args.models_path, model, "ggml-model-f16.bin"
-        )
-
-        if not os.path.isfile(f16_model_path_base):
-            print(f'The file %s was not found' % f16_model_path_base)
-            sys.exit(1)
-
-        f16_model_parts_paths = map(
-            lambda filename: os.path.join(f16_model_path_base, filename),
-            glob.glob(f"{f16_model_path_base}*")
-        )
-
-        for f16_model_part_path in f16_model_parts_paths:
-            if not os.path.isfile(f16_model_part_path):
-                print(
-                    f"The f16 model {os.path.basename(f16_model_part_path)} "
-                    f"was not found in {args.models_path}{os.path.sep}{model}"
-                    ". If you want to use it from another location, set the "
-                    "--models-path argument from the command line."
-                )
-                sys.exit(1)
-
-            __run_quantize_script(
-                args.quantize_script_path, f16_model_part_path
-            )
-
-            if args.remove_f16:
-                os.remove(f16_model_part_path)
-
-
-# This was extracted to a top-level function for parallelization, if
-# implemented. See https://github.com/ggerganov/llama.cpp/pull/222/commits/f8db3d6cd91bf1a1342db9d29e3092bc12dd783c#r1140496406
-
-def __run_quantize_script(script_path, f16_model_part_path):
-    """Run the quantize script specifying the path to it and the path to the
-    f16 model to quantize.
-    """
-
-    new_quantized_model_path = f16_model_part_path.replace("f16", "q4_0")
-    subprocess.run(
-        [script_path, f16_model_part_path, new_quantized_model_path, "2"],
-        check=True
-    )
-
-
-if __name__ == "__main__":
-    try:
-        main()
-
-    except subprocess.CalledProcessError:
-        print("\nAn error ocurred while trying to quantize the models.")
-        sys.exit(1)
-
-    except KeyboardInterrupt:
-        sys.exit(0)
-
-    else:
-        print("\nSuccesfully quantized all models.")
Author	SHA1	Message	Date
Georgi Gerganov	36ddd12924	llama : add flash attention (demo)	2023-04-05 22:12:04 +03:00
Georgi Gerganov	986b6ce9f9	ggml, llama : avoid heavy V transpose + improvements (#775 ) ggml : - added ggml_view_3d() - ggml_view_tensor() now inherits the stride too - reimplement ggml_cpy() to account for dst stride - no longer require tensor->data to be memory aligned llama : - compute RoPE on 32-bit tensors (should be more accurate) - store RoPE-ed K in the KV cache - store transposed V in the KV cache (significant speed-up) - avoid unnecessary Q copy	2023-04-05 22:07:33 +03:00
Georgi Gerganov	3416298929	Update README.md	2023-04-05 19:54:30 +03:00
Ivan Stepanov	5a8c4f6240	llama : define non-positive top_k; top_k range check (#779 ) * Define non-positive top_k; top_k range check * minor : brackets --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2023-04-05 19:20:05 +03:00
at8u	ff05d05c96	miku.sh : add executable bit (#780 )	2023-04-05 18:59:13 +03:00
Georgi Gerganov	62b3e81aae	media : add logos and banners	2023-04-05 18:58:31 +03:00
Georgi Gerganov	8d10406d6e	readme : change logo + add bindings + add uis + add wiki	2023-04-05 18:56:20 +03:00
iacore	ed1c214e66	zig : add build.zig (#773 ) Co-authored-by: Locria Cyber <74560659+locriacyber@users.noreply.github.com>	2023-04-05 18:06:02 +03:00
Ivan Stepanov	0c44427df1	make : missing host optimizations in CXXFLAGS (#763 )	2023-04-05 17:38:37 +03:00
Adithya Balaji	594cc95fab	readme : update with CMake and windows example (#748 ) * README: Update with CMake and windows example * README: update with code-review for cmake build	2023-04-05 17:36:12 +03:00
at8u	88ed5761b8	examples : add Miku.sh (#724 ) * Add Miku.sh to examples * Add missing line to prompt in Miku.sh * Add --keep param to Miku.sh * Remove '[end_of_conversation]' line from Miku.sh No longer is necessary.	2023-04-05 17:32:42 +03:00
Andrew Duffy	58c438cf7d	Add Accelerate/BLAS when using Swift (#765 )	2023-04-05 06:44:24 -04:00
mgroeber9110	53dbba7695	Windows: reactive sigint handler after each Ctrl-C (#736 )	2023-04-03 18:00:55 +02:00
SebastianApel	437e77855a	10+% performance improvement of ggml_vec_dot_q4_0 on AVX2 (#654 ) * Performance improvement of AVX2 code * Fixed problem with MSVC compiler * Reviewer comments: removed double semicolon, deleted empty line 1962	2023-04-03 09:52:28 +02:00
Ivan Stepanov	cd7fa95690	Define non-positive temperature behavior (#720 )	2023-04-03 02:19:04 +02:00
bsilvereagle	a0c0516416	Remove torch GPU dependencies from the Docker.full image (#665 ) By using `pip install torch --index-url https://download.pytorch.org/whl/cpu` instead of `pip install torch` we can specify we want to install a CPU-only version of PyTorch without any GPU dependencies. This reduces the size of the Docker image from 7.32 GB to 1.62 GB	2023-04-03 00:13:03 +02:00
Thatcher Chamberlin	d8d4e865cd	Add a missing step to the gpt4all instructions (#690 ) `migrate-ggml-2023-03-30-pr613.py` is needed to get gpt4all running.	2023-04-02 12:48:57 +02:00
Christian Falch	e986f94829	Added api for getting/setting the kv_cache (#685 ) The api provides access methods for retrieving the current memory buffer for the kv_cache and its token number. It also contains a method for setting the kv_cache from a memory buffer. This makes it possible to load/save history - maybe support --cache-prompt paramater as well? Co-authored-by: Pavol Rusnak <pavol@rusnak.io>	2023-04-02 12:23:04 +02:00
Marian Cepok	c0bb1d3ce2	ggml : change ne to int64_t (#626 )	2023-04-02 13:21:31 +03:00
Leonardo Neumann	6e7801d08d	examples : add gpt4all script (#658 )	2023-04-02 10:56:20 +03:00
Stephan Walter	81040f10aa	llama : do not allocate KV cache for "vocab_only == true" (#682 ) Fixes sanitizer CI	2023-04-02 10:18:53 +03:00
Fabian	c4f89d8d73	make : use -march=native -mtune=native on x86 (#609 )	2023-04-02 10:17:05 +03:00
Murilo Santana	5b70e7de4c	fix default params for examples/main (#697 )	2023-04-02 04:41:12 +02:00
Ikko Eltociear Ashimine	a717cba844	py: huggingface -> Hugging Face (#686 )	2023-04-01 18:38:18 +02:00
rimoliga	d0a7f742e7	readme: replace termux links with homepage, play store is deprecated (#680 )	2023-04-01 16:57:30 +02:00
Slaren	0d054e292e	Show error message when -f fails	2023-04-01 16:08:40 +02:00
Stephan Walter	3525899277	Enable -std= for cmake builds, fix warnings (#598 )	2023-03-31 19:19:16 +00:00
slaren	1d08882afa	Optimize AVX2 ggml_vec_dot_q4_0 (#642 )	2023-03-31 15:55:52 +00:00
perserk	02c5b27e91	Add AVX acceleration (#617 ) * ggml : add AVX quantize_row_q4_0() * ggml : add AVX ggml_vec_dot_q4_0() * ggml : refactor AVX part of ggml_vec_dot_q4_0() https://github.com/ggerganov/llama.cpp/pull/617#issuecomment-1489985645	2023-03-31 13:55:44 +02:00
Pavol Rusnak	cbef542879	py : cleanup the code - use f-strings where possible - drop first param of encode/decode functions since "utf-8" is the default	2023-03-31 10:32:01 +02:00
Pavol Rusnak	9733104be5	drop quantize.py (now that models are using a single file)	2023-03-31 01:07:32 +02:00
Georgi Gerganov	3df890aef4	readme : update supported models	2023-03-30 22:31:54 +03:00