Adding a simple program to measure speed of dot products (#1041 )

On my Mac, the direct Q4_1 product is marginally slower (~69 vs ~55 us for Q4_0). The SIMD-ified ggml version is now almost 2X slower (~121 us). On a Ryzen 7950X CPU, the direct product for Q4_1 quantization is faster than the AVX2 implementation (~60 vs ~62 us). --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
readme : update hot topics about new LoRA functionality
2026-02-26 14:23:22 +02:00 · 2023-04-18 19:00:14 +00:00 · 2023-04-18 20:10:26 +03:00 · 2023-04-18 19:57:06 +03:00 · 2023-04-18 03:15:50 +02:00 · 2023-04-17 22:34:35 +03:00
33 changed files with 2074 additions and 952 deletions
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -8,6 +8,8 @@ on:
        required: true
        type: boolean
  push:
+    branches:
+      - master
    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp']
  pull_request:
    types: [opened, synchronize, edited, reopened, review_requested, ready_for_review]
@@ -18,6 +20,8 @@ env:

 jobs:
  ubuntu-latest-make:
+    if: github.event.pull_request.draft == false
+
    runs-on: ubuntu-latest

    steps:
@@ -37,6 +41,8 @@ jobs:
          make

  ubuntu-latest-cmake:
+    if: github.event.pull_request.draft == false
+
    runs-on: ubuntu-latest

    steps:
@@ -65,6 +71,8 @@ jobs:
          ctest --verbose

  ubuntu-latest-cmake-sanitizer:
+    if: github.event.pull_request.draft == false
+
    runs-on: ubuntu-latest

    continue-on-error: true
@@ -101,6 +109,8 @@ jobs:
          ctest --verbose

  macOS-latest-make:
+    if: github.event.pull_request.draft == false
+
    runs-on: macos-latest

    steps:
@@ -119,6 +129,8 @@ jobs:
          make

  macOS-latest-cmake:
+    if: github.event.pull_request.draft == false
+
    runs-on: macOS-latest

    steps:
@@ -146,6 +158,8 @@ jobs:
          ctest --verbose

  windows-latest-cmake:
+    if: github.event.pull_request.draft == false
+
    runs-on: windows-latest

    strategy:
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -18,6 +18,8 @@ on:
 jobs:
  push_to_registry:
    name: Push Docker image to Docker Hub
+    if: github.event.pull_request.draft == false
+
    runs-on: ubuntu-latest
    env:
      COMMIT_SHA: ${{ github.sha }}
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -55,6 +55,8 @@ option(LLAMA_SANITIZE_UNDEFINED     "llama: enable undefined sanitizer"
 option(LLAMA_AVX                    "llama: enable AVX"                                     ON)
 option(LLAMA_AVX2                   "llama: enable AVX2"                                    ON)
 option(LLAMA_AVX512                 "llama: enable AVX512"                                  OFF)
+option(LLAMA_AVX512_VBMI            "llama: enable AVX512-VBMI"                             OFF)
+option(LLAMA_AVX512_VNNI            "llama: enable AVX512-VNNI"                             OFF)
 option(LLAMA_FMA                    "llama: enable FMA"                                     ON)
 # in MSVC F16C is implied with AVX2/AVX512
 if (NOT MSVC)
@@ -120,6 +122,21 @@ if (LLAMA_OPENBLAS)
        add_compile_definitions(GGML_USE_OPENBLAS)
        add_link_options(${BLAS_LIBRARIES})
        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} openblas)
+
+        # find header file
+        set(OPENBLAS_INCLUDE_SEARCH_PATHS
+            /usr/include
+            /usr/include/openblas
+            /usr/include/openblas-base
+            /usr/local/include
+            /usr/local/include/openblas
+            /usr/local/include/openblas-base
+            /opt/OpenBLAS/include
+            $ENV{OpenBLAS_HOME}
+            $ENV{OpenBLAS_HOME}/include
+            )
+        find_path(OPENBLAS_INC NAMES cblas.h PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
+        add_compile_options(-I${OPENBLAS_INC})
    else()
        message(WARNING "OpenBLAS not found")
    endif()
@@ -205,6 +222,16 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
    if (MSVC)
        if (LLAMA_AVX512)
            add_compile_options(/arch:AVX512)
+            # MSVC has no compile-time flags enabling specific
+            # AVX512 extensions, neither it defines the
+            # macros corresponding to the extensions.
+            # Do it manually.
+            if (LLAMA_AVX512_VBMI)
+                add_compile_definitions(__AVX512VBMI__)
+            endif()
+            if (LLAMA_AVX512_VNNI)
+                add_compile_definitions(__AVX512VNNI__)
+            endif()
        elseif (LLAMA_AVX2)
            add_compile_options(/arch:AVX2)
        elseif (LLAMA_AVX)
@@ -225,9 +252,13 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
        endif()
        if (LLAMA_AVX512)
            add_compile_options(-mavx512f)
-            # add_compile_options(-mavx512cd)
-            # add_compile_options(-mavx512dq)
-            # add_compile_options(-mavx512bw)
+            add_compile_options(-mavx512bw)
+        endif()
+        if (LLAMA_AVX512_VBMI)
+            add_compile_options(-mavx512vbmi)
+        endif()
+        if (LLAMA_AVX512_VNNI)
+            add_compile_options(-mavx512vnni)
        endif()
    endif()
 else()
@@ -274,4 +305,5 @@ endif ()

 if (LLAMA_BUILD_EXAMPLES)
    add_subdirectory(examples)
+    add_subdirectory(pocs)
 endif()
--- a/5
+++ b/5
@@ -133,7 +133,7 @@ $(info I CC:       $(CCV))
 $(info I CXX:      $(CXXV))
 $(info )

-default: main quantize perplexity embedding
+default: main quantize quantize-stats perplexity embedding vdot

 #
 # Build library
@@ -169,6 +169,9 @@ perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o
 embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

+vdot: pocs/vdot/vdot.cpp ggml.o
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
+
 libllama.so: llama.o ggml.o
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

--- a/README.md
+++ b/README.md
@@ -9,6 +9,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

 **Hot topics:**

+- [Added LoRA support](https://github.com/ggerganov/llama.cpp/pull/820)
 - [Add GPU support to ggml](https://github.com/ggerganov/llama.cpp/discussions/915)
 - [Roadmap Apr 2023](https://github.com/ggerganov/llama.cpp/discussions/784)

@@ -50,6 +51,7 @@ New features will probably be added mostly through community contributions.
 - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
 - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
 - Node.js: [hlhr202/llama-node](https://github.com/hlhr202/llama-node)
+- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)

 **UI:**

--- a/configs/alpaca-native-enhanced.txt
+++ b/configs/alpaca-native-enhanced.txt
@@ -1,21 +0,0 @@
--ctx_size 2048
--batch_size 16
--repeat_penalty 1.15
--temp 0.4
--top_k 30
--top_p 0.18
-
--interactive-first
--keep -1
-
--ins-prefix-bos
--ins-prefix "\n\nUser: "
--ins-suffix "\n\nAssistant: "
--reverse-prompt "User: "
-
-p "You are an AI language model designed to assist the User by answering their questions, offering advice, and engaging in casual conversation in a friendly, helpful, and informative manner. You respond clearly, coherently, and you consider the conversation history.
-
-User: Hey, how's it going?
-
-Assistant: Hey there! I'm doing great, thank you. What can I help you with today? Let's have a fun chat!"
-
--- a/configs/alpaca.txt
+++ b/configs/alpaca.txt
@@ -1,9 +0,0 @@
--clean-interface
--interactive-first
--keep -1
--ins-prefix-bos
--ins-prefix "\n\n### Instruction:\n\n"
--ins-suffix "\n\n### Response:\n\n"
--reverse-prompt "### Instruction:\n\n"
-
-p "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n"
--- a/configs/chat-with-bob.txt
+++ b/configs/chat-with-bob.txt
@@ -1,15 +0,0 @@
--interactive-first
--keep -1
--ins-prefix-bos
--ins-prefix "\nUser: "
--ins-suffix "\nBob: "
--reverse-prompt "User: "
--rm-trailing-space-workaround
-
-p "Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
-
-User: Hello, Bob.
-Bob: Hello. How may I help you today?
-User: Please tell me the largest city in Europe.
-Bob: Sure. The largest city in Europe is Moscow, the capital of Russia."
-
--- a/configs/llama.txt
+++ b/configs/llama.txt
@@ -1,3 +0,0 @@
--interactive-first
--keep -1
--temp 0.1
--- a/configs/vicuna-simple.txt
+++ b/configs/vicuna-simple.txt
@@ -1,7 +0,0 @@
--interactive-first
--keep -1
--ins-prefix-bos
--ins-prefix "\n### Human: "
--ins-suffix "\n### Assistant: "
--reverse-prompt "### Human: "
--rm-trailing-space-workaround
--- a/configs/vicuna-stop.txt
+++ b/configs/vicuna-stop.txt
@@ -1,8 +0,0 @@
--interactive-first
--keep -1
--ins-prefix-bos
--ins-prefix "\n### Human: "
--ins-suffix "\n### Assistant: "
--reverse-prompt "### Human: "
--stop-prompt "### Assistant: "
--rm-trailing-space-workaround
--- a/configs/vicuna.txt
+++ b/configs/vicuna.txt
@@ -1,9 +0,0 @@
--interactive-first
--keep -1
--ins-prefix-bos
--ins-prefix "\n### Human: "
--ins-suffix "\n### Assistant: "
--reverse-prompt "### Human: "
--rm-trailing-space-workaround
-
-p "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."
--- a/convert-lora-to-ggml.py
+++ b/convert-lora-to-ggml.py
@@ -0,0 +1,124 @@
+import json
+import os
+import re
+import struct
+import sys
+from typing import Any, Dict, Sequence, TextIO
+
+import torch
+
+from convert import DATA_TYPE_TO_FTYPE, NUMPY_TYPE_TO_DATA_TYPE, DataType
+
+HF_SUBLAYER_TO_GGML = {
+    "self_attn.q_proj": "attention.wq",
+    "self_attn.k_proj": "attention.wk",
+    "self_attn.v_proj": "attention.wv",
+    "self_attn.o_proj": "attention.wo",
+    "mlp.gate_proj": "feed_forward.w1",
+    "mlp.down_proj": "feed_forward.w2",
+    "mlp.up_proj": "feed_forward.w3",
+    "input_layernorm": "attention_norm",
+    "post_attention_layernorm": "ffn_norm",
+    # "norm": "norm",
+    # "embed_tokens": "tok_embeddings",
+    # "lm_head": "output",
+}
+
+
+def translate_tensor_name(t: str) -> str:
+    match = re.match(r".*layers\.(\d+)\.(\w+\.\w+)\.lora_(A|B)\.weight", t)
+    if match:
+        nn = match.group(1)
+        sub_layer = match.group(2)
+        lora_type = match.group(3)
+
+        sub_layer_renamed = HF_SUBLAYER_TO_GGML.get(sub_layer)
+        if sub_layer_renamed is None:
+            print(f"Error: unrecognized sub-layer {sub_layer} in tensor {t}")
+            sys.exit(1)
+
+        output_string = (
+            f"layers.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}"
+        )
+        return output_string
+    else:
+        print(f"Error: unrecognized tensor {t}")
+        sys.exit(1)
+
+
+def write_file_header(fout: TextIO, params: Dict[str, Any]) -> None:
+    fout.write(b"ggla"[::-1])  # magic (ggml lora)
+    fout.write(struct.pack("i", 1))  # file version
+    fout.write(struct.pack("ii", params["r"], params["lora_alpha"]))
+
+
+def write_tensor_header(
+    self, name: str, shape: Sequence[int], data_type: DataType
+) -> None:
+    sname = name.encode("utf-8")
+    fout.write(
+        struct.pack(
+            "iii",
+            len(shape),
+            len(sname),
+            DATA_TYPE_TO_FTYPE[NUMPY_TYPE_TO_DATA_TYPE[data_type]],
+        )
+    )
+    fout.write(struct.pack("i" * len(shape), *shape[::-1]))
+    fout.write(sname)
+    fout.seek((fout.tell() + 31) & -32)
+
+
+if len(sys.argv) != 2:
+    print(f"Usage: python {sys.argv[0]} <path>")
+    print(
+        "Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'"
+    )
+    sys.exit(1)
+
+input_json = os.path.join(sys.argv[1], "adapter_config.json")
+input_model = os.path.join(sys.argv[1], "adapter_model.bin")
+output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")
+
+model = torch.load(input_model, map_location="cpu")
+
+with open(input_json, "r") as f:
+    params = json.load(f)
+
+if params["peft_type"] != "LORA":
+    print(f"Error: unsupported adapter type {params['peft_type']}, expected LORA")
+    sys.exit(1)
+
+if params["fan_in_fan_out"] == True:
+    print("Error: param fan_in_fan_out is not supported")
+    sys.exit(1)
+
+if params["bias"] is not None and params["bias"] != "none":
+    print("Error: param bias is not supported")
+    sys.exit(1)
+
+# TODO: these seem to be layers that have been trained but without lora.
+# doesn't seem widely used but eventually should be supported
+if params["modules_to_save"] is not None and len(params["modules_to_save"]) > 0:
+    print("Error: param modules_to_save is not supported")
+    sys.exit(1)
+
+with open(output_path, "wb") as fout:
+    fout.truncate()
+
+    write_file_header(fout, params)
+    for k, v in model.items():
+        if k.endswith("lora_A.weight"):
+            if v.dtype != torch.float16 and v.dtype != torch.float32:
+                v = v.float()
+            v = v.T
+        else:
+            v = v.float()
+
+        t = v.numpy()
+        tname = translate_tensor_name(k)
+        print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
+        write_tensor_header(fout, tname, t.shape, t.dtype)
+        t.tofile(fout)
+
+print(f"Converted {input_json} and {input_model} to {output_path}")
--- a/convert.py
+++ b/convert.py
@@ -735,7 +735,7 @@ def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus:
    header: Dict[str, Dict[str, Any]] = json.loads(fp.read(header_size))
    # Use mmap for the actual data to avoid race conditions with the file offset.
    mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ))
-    byte_buf = mapped[fp.tell():]
+    byte_buf = mapped[8 + header_size:]

    def convert(info: Dict[str, Any]) -> LazyTensor:
        data_type = SAFETENSORS_DATA_TYPES[info['dtype']]
@@ -761,7 +761,7 @@ def must_read(fp: IO[bytes], length: int) -> bytes:
    return ret


-def lazy_load_ggml_file(fp: IO[bytes], path: Path) -> ModelPlus:
+def lazy_load_ggml_file(fp: io.BufferedReader, path: Path) -> ModelPlus:
    magic = must_read(fp, 4)[::-1]
    if magic in (b'ggmf', b'ggjt'):
        version, = struct.unpack("i", must_read(fp, 4))
@@ -795,7 +795,9 @@ def lazy_load_ggml_file(fp: IO[bytes], path: Path) -> ModelPlus:

    model: LazyModel = {}
    # Use mmap for the actual data to avoid race conditions with the file offset.
+    off = fp.raw.tell()
    mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ))
+    fp.raw.seek(off) # needed on Windows

    def read_tensor() -> None:  # this is a function so that variables captured in `load` don't change
        shape_len, name_len, ftype = struct.unpack("iii", must_read(fp, 12))
@@ -949,8 +951,9 @@ class OutputFile:

        ndarrays = bounded_parallel_map(do_item, model.items(), concurrency=8)
        for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
-            size = ' x '.join(map(str, lazy_tensor.shape))
-            print(f"[{i+1}/{len(model)}] Writing tensor {name}, size {size}...")
+            size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
+            padi = len(str(len(model)))
+            print(f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type}")
            of.write_tensor_header(name, lazy_tensor.shape, lazy_tensor.data_type)
            ndarray.tofile(of.fout)
        of.fout.close()
@@ -1082,6 +1085,7 @@ def default_outfile(model_paths: List[Path], params: Params) -> Path:
    namestr = {
        GGMLFileType.AllF32: "f32",
        GGMLFileType.MostlyF16: "f16",
+        GGMLFileType.MostlyQ4_0: "q4_0",
        GGMLFileType.MostlyQ4_1: "q4_1",
        GGMLFileType.PerLayerIsQ4_1: "q4_1",
    }[params.file_type]
@@ -1105,7 +1109,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
    parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
    parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
    parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
-    parser.add_argument("--outtype", choices=["f32", "f16", "q4_1"], help="output format (default: based on input)")
+    parser.add_argument("--outtype", choices=["f32", "f16", "q4_1", "q4_0"], help="output format (default: based on input)")
    parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
    parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
    parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
--- a/examples/benchmark/benchmark-q4_0-matmult.c
+++ b/examples/benchmark/benchmark-q4_0-matmult.c
@@ -24,7 +24,7 @@

 float tensor_sum_elements(struct ggml_tensor * tensor) {
    float sum = 0;
-    if (tensor->type==6) {
+    if (tensor->type==GGML_TYPE_F32) {
        for (int j = 0; j < tensor->ne[1]; j++) {
            for (int k = 0; k < tensor->ne[0]; k++) {
                sum +=  ((float *) tensor->data)[j*tensor->ne[0]+k];
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -2,13 +2,10 @@

 #include <cassert>
 #include <cstring>
-#include <iostream>
 #include <fstream>
-#include <sstream>
 #include <string>
 #include <iterator>
 #include <algorithm>
-#include <regex>

 #if defined (_WIN32)
 #include <fcntl.h>
@@ -26,43 +23,6 @@ extern "C" __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int
 #define CP_UTF8 65001
 #endif

-void split_args(const std::string & args_string, std::vector<std::string> & output_args)
-{
-    std::string current_arg = "";
-    bool in_quotes = false;
-    char quote_type;
-
-    for (char c : args_string) {
-        if (c == '"' || c == '\'') {
-            if (!in_quotes) {
-                in_quotes = true;
-                quote_type = c;
-            } else if (quote_type == c) {
-                in_quotes = false;
-            } else {
-                current_arg += c;
-            }
-        } else if (in_quotes) {
-            current_arg += c;
-        } else if (std::isspace(c)) {
-            if (current_arg != "") {
-                output_args.push_back(current_arg);
-                current_arg = "";
-            }
-        } else {
-            current_arg += c;
-        }
-    }
-
-    if (current_arg != "") {
-        output_args.push_back(current_arg);
-    }
-}
-
-std::string unescape(const std::string & str) {
-    return std::regex_replace(str, std::regex("\\\\n"), "\n");
-}
-
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
    // determine sensible default number of threads.
    // std::thread::hardware_concurrency may not be equal to the number of cores, or may return 0.
@@ -80,11 +40,28 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
    std::string arg;
    gpt_params default_params;

-    // get additional arguments from config files
-    std::vector<std::string> args;
    for (int i = 1; i < argc; i++) {
        arg = argv[i];
-        if (arg == "--config") {
+
+        if (arg == "-s" || arg == "--seed") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.seed = std::stoi(argv[i]);
+        } else if (arg == "-t" || arg == "--threads") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_threads = std::stoi(argv[i]);
+        } else if (arg == "-p" || arg == "--prompt") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.prompt = argv[i];
+        } else if (arg == "-f" || arg == "--file") {
            if (++i >= argc) {
                invalid_param = true;
                break;
@@ -95,153 +72,98 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                invalid_param = true;
                break;
            }
-            std::string args_string;
-            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(args_string));
-            if (args_string.back() == '\n') {
-                args_string.pop_back();
-            }
-            split_args(args_string, args);
-            for (int j = 0; j < args.size(); j++) {
-                args[j] = unescape(args[j]);
-            }
-        } else {
-            args.emplace_back(argv[i]);
-        }
-    }
-
-    // parse args
-    int args_c = static_cast<int>(args.size());
-    for (int i = 0; i < args_c && !invalid_param; i++) {
-        arg = args[i];
-
-        if (arg == "-s" || arg == "--seed") {
-            if (++i >= args_c) {
-                invalid_param = true;
-                break;
-            }
-            params.seed = std::stoi(args[i]);
-        } else if (arg == "-t" || arg == "--threads") {
-            if (++i >= args_c) {
-                invalid_param = true;
-                break;
-            }
-            params.n_threads = std::stoi(args[i]);
-        } else if (arg == "-p" || arg == "--prompt") {
-            if (++i >= args_c) {
-                invalid_param = true;
-                break;
-            }
-            params.prompt = args[i];
-        } else if (arg == "-f" || arg == "--file") {
-            if (++i >= args_c) {
-                invalid_param = true;
-                break;
-            }
-            std::ifstream file(args[i]);
-            if (!file) {
-                fprintf(stderr, "error: failed to open file '%s'\n", args[i].c_str());
-                invalid_param = true;
-                break;
-            }
            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
            if (params.prompt.back() == '\n') {
                params.prompt.pop_back();
            }
        } else if (arg == "-n" || arg == "--n_predict") {
-            if (++i >= args_c) {
+            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            params.n_predict = std::stoi(args[i]);
+            params.n_predict = std::stoi(argv[i]);
        } else if (arg == "--top_k") {
-            if (++i >= args_c) {
+            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            params.top_k = std::stoi(args[i]);
+            params.top_k = std::stoi(argv[i]);
        } else if (arg == "-c" || arg == "--ctx_size") {
-            if (++i >= args_c) {
+            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            params.n_ctx = std::stoi(args[i]);
+            params.n_ctx = std::stoi(argv[i]);
        } else if (arg == "--memory_f32") {
            params.memory_f16 = false;
        } else if (arg == "--top_p") {
-            if (++i >= args_c) {
+            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            params.top_p = std::stof(args[i]);
+            params.top_p = std::stof(argv[i]);
        } else if (arg == "--temp") {
-            if (++i >= args_c) {
+            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            params.temp = std::stof(args[i]);
+            params.temp = std::stof(argv[i]);
        } else if (arg == "--repeat_last_n") {
-            if (++i >= args_c) {
+            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            params.repeat_last_n = std::stoi(args[i]);
+            params.repeat_last_n = std::stoi(argv[i]);
        } else if (arg == "--repeat_penalty") {
-            if (++i >= args_c) {
+            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            params.repeat_penalty = std::stof(args[i]);
+            params.repeat_penalty = std::stof(argv[i]);
        } else if (arg == "-b" || arg == "--batch_size") {
-            if (++i >= args_c) {
+            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            params.n_batch = std::stoi(args[i]);
+            params.n_batch = std::stoi(argv[i]);
            params.n_batch = std::min(512, params.n_batch);
        } else if (arg == "--keep") {
-            if (++i >= args_c) {
+            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            params.n_keep = std::stoi(args[i]);
+            params.n_keep = std::stoi(argv[i]);
        } else if (arg == "-m" || arg == "--model") {
-            if (++i >= args_c) {
+            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            params.model = args[i];
+            params.model = argv[i];
+        } else if (arg == "--lora") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.lora_adapter = argv[i];
+            params.use_mmap = false;
+        } else if (arg == "--lora-base") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.lora_base = argv[i];
        } else if (arg == "-i" || arg == "--interactive") {
            params.interactive = true;
        } else if (arg == "--embedding") {
            params.embedding = true;
-        } else if (arg == "--clean-interface") {
-            params.clean_interface = true;
        } else if (arg == "--interactive-start") {
            params.interactive = true;
        } else if (arg == "--interactive-first") {
            params.interactive_start = true;
        } else if (arg == "-ins" || arg == "--instruct") {
-            fprintf(stderr, "\n\nWarning: instruct mode is deprecated! Use: \n"
-                "--clean-interface "
-                "--interactive-first "
-                "--keep -1 "
-                "--ins-prefix-bos "
-                "--ins-prefix \"\\n\\n### Instruction:\\n\\n\" "
-                "--ins-suffix \"\\n\\n### Response:\\n\\n\" "
-                "-r \"### Instruction:\\n\\n\" "
-            "\n\n");
-            // params.instruct = true;
-            params.clean_interface = true;
-            params.interactive_start = true;
-            params.n_keep = -1;
-            params.instruct_prefix_bos = true;
-            params.instruct_prefix = "\n\n### Instruction:\n\n";
-            params.instruct_suffix = "\n\n### Response:\n\n";
-            params.antiprompt.push_back("### Instruction:\n\n");
+            params.instruct = true;
        } else if (arg == "--color") {
            params.use_color = true;
-        } else if (arg == "--disable-multiline") {
-            params.multiline_mode = false;
        } else if (arg == "--mlock") {
            params.use_mlock = true;
        } else if (arg == "--no-mmap") {
@@ -251,94 +173,65 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
        } else if (arg == "--verbose-prompt") {
            params.verbose_prompt = true;
        } else if (arg == "-r" || arg == "--reverse-prompt") {
-            if (++i >= args_c) {
+            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            params.antiprompt.push_back(args[i]);
-        } else if (arg == "--stop-prompt") {
-            if (++i >= args_c) {
-                invalid_param = true;
-                break;
-            }
-            params.stopprompt.push_back(args[i]);
-        } else if (arg == "--rm-trailing-space-workaround") {
-            params.rm_trailing_space_workaround = true;
+            params.antiprompt.push_back(argv[i]);
        } else if (arg == "--perplexity") {
            params.perplexity = true;
        } else if (arg == "--ignore-eos") {
            params.ignore_eos = true;
        } else if (arg == "--n_parts") {
-            if (++i >= args_c) {
+            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            params.n_parts = std::stoi(args[i]);
+            params.n_parts = std::stoi(argv[i]);
        } else if (arg == "-h" || arg == "--help") {
-            gpt_print_usage(argv[0], default_params);
+            gpt_print_usage(argc, argv, default_params);
            exit(0);
        } else if (arg == "--random-prompt") {
            params.random_prompt = true;
        } else if (arg == "--in-prefix") {
-            if (++i >= args_c) {
+            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            params.input_prefix = args[i];
-        } else if (arg == "--ins-prefix-bos") {
-            params.instruct_prefix_bos = true;
-        } else if (arg == "--ins-prefix") {
-            if (++i >= args_c) {
-                invalid_param = true;
-                break;
-            }
-            params.instruct_prefix = args[i];
-        } else if (arg == "--ins-suffix-bos") {
-            params.instruct_suffix_bos = true;
-        } else if (arg == "--ins-suffix") {
-            if (++i >= args_c) {
-                invalid_param = true;
-                break;
-            }
-            params.instruct_suffix = args[i];
+            params.input_prefix = argv[i];
        } else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            gpt_print_usage(argv[0], default_params);
+            gpt_print_usage(argc, argv, default_params);
            exit(1);
        }
    }
    if (invalid_param) {
        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
-        gpt_print_usage(argv[0], default_params);
+        gpt_print_usage(argc, argv, default_params);
        exit(1);
    }

    return true;
 }

-void gpt_print_usage(char * argv_0, const gpt_params & params) {
-    fprintf(stderr, "usage: %s [options]\n", argv_0);
+void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
+    fprintf(stderr, "usage: %s [options]\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "options:\n");
    fprintf(stderr, "  -h, --help            show this help message and exit\n");
    fprintf(stderr, "  -i, --interactive     run in interactive mode\n");
    fprintf(stderr, "  --interactive-first   run in interactive mode and wait for input right away\n");
-    fprintf(stderr, "  --clean-interface     hides input prefix & suffix and displays '>' instead\n");
+    fprintf(stderr, "  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
    fprintf(stderr, "  -r PROMPT, --reverse-prompt PROMPT\n");
    fprintf(stderr, "                        run in interactive mode and poll user input upon seeing PROMPT (can be\n");
    fprintf(stderr, "                        specified more than once for multiple prompts).\n");
    fprintf(stderr, "  --color               colorise output to distinguish prompt and user input from generations\n");
-    fprintf(stderr, "  --disable-multiline   disable multiline mode (use Ctrl+D on Linux/Mac and Ctrl+Z then Return on Windows to toggle multiline)\n");
    fprintf(stderr, "  -s SEED, --seed SEED  RNG seed (default: -1, use random seed for <= 0)\n");
    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
    fprintf(stderr, "  -p PROMPT, --prompt PROMPT\n");
    fprintf(stderr, "                        prompt to start generation with (default: empty)\n");
    fprintf(stderr, "  --random-prompt       start with a randomized prompt.\n");
    fprintf(stderr, "  --in-prefix STRING    string to prefix user inputs with (default: empty)\n");
-    fprintf(stderr, "  --ins-prefix STRING   (instruct) prefix user inputs with tokenized string (default: empty)\n");
-    fprintf(stderr, "  --ins-prefix-bos      (instruct) prepend bos token to instruct prefix.\n");
-    fprintf(stderr, "  --ins-suffix STRING   (instruct) suffix user inputs with tokenized string (default: empty)\n");
-    fprintf(stderr, "  --ins-suffix-bos      (instruct) prepend bos token to instruct suffix.\n");
    fprintf(stderr, "  -f FNAME, --file FNAME\n");
    fprintf(stderr, "                        prompt file to start generation.\n");
    fprintf(stderr, "  -n N, --n_predict N   number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
@@ -362,6 +255,8 @@ void gpt_print_usage(char * argv_0, const gpt_params & params) {
    }
    fprintf(stderr, "  --mtest               compute maximum memory usage\n");
    fprintf(stderr, "  --verbose-prompt      print prompt before generation\n");
+    fprintf(stderr, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
+    fprintf(stderr, "  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
    fprintf(stderr, "  -m FNAME, --model FNAME\n");
    fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
    fprintf(stderr, "\n");
@@ -448,61 +343,3 @@ void win32_utf8_encode(const std::wstring & wstr, std::string & str) {
    str = strTo;
 }
 #endif
-
-bool get_input_text(std::string & input_text, bool eof_toggled_multiline_mode) {
-    bool another_line = true;
-    bool is_eof_multiline_toggled = false;
-    do {
-        std::string line;
-#if defined(_WIN32)
-        auto & stdcin = std::wcin;
-        std::wstring wline;
-        if (!std::getline(stdcin, wline)) {
-            // input stream is bad or EOF received
-            if (stdcin.bad()) {
-                fprintf(stderr, "%s: error: input stream bad\n", __func__);
-                return 1;
-            }
-        }
-        win32_utf8_encode(wline, line);
-#else
-        auto & stdcin = std::cin;
-        if (!std::getline(stdcin, line)) {
-            // input stream is bad or EOF received
-            if (stdcin.bad()) {
-                fprintf(stderr, "%s: error: input stream bad\n", __func__);
-                return 1;
-            }
-        }
-#endif
-        if (stdcin.eof()) {
-            stdcin.clear();
-            stdcin.seekg(0, std::ios::beg);
-            if (!eof_toggled_multiline_mode) {
-                another_line = false;
-            } else {
-                is_eof_multiline_toggled = !is_eof_multiline_toggled;
-                if (is_eof_multiline_toggled) {
-                    input_text += line;
-                    continue;
-                }
-            }
-        }
-        if (!eof_toggled_multiline_mode) {
-            if (line.empty() || line.back() != '\\') {
-                another_line = false;
-            } else {
-                line.pop_back(); // Remove the continue character
-            }
-        } else {
-            if (!is_eof_multiline_toggled) {
-                another_line = false;
-            }
-        }
-        input_text += line;
-        if (another_line) {
-            input_text += '\n'; // Append the line to the result
-        }
-    } while (another_line);
-    return true;
-}
--- a/examples/common.h
+++ b/examples/common.h
@@ -14,14 +14,14 @@
 //

 struct gpt_params {
-    int32_t seed          = -1;    // RNG seed
-    int32_t n_threads     = std::min(4, (int32_t) std::thread::hardware_concurrency()); // max 4 threads (default)
-    int32_t n_predict     = 128;   // new tokens to predict
-    int32_t repeat_last_n = 64;    // last n tokens to penalize
-    int32_t n_parts       = -1;    // amount of model parts (-1 = determine from model dimensions)
-    int32_t n_ctx         = 512;   // context size
-    int32_t n_batch       = 8;     // batch size for prompt processing
-    int32_t n_keep        = 0;     // number of tokens to keep from initial prompt (-1 for all)
+    int32_t seed          = -1;   // RNG seed
+    int32_t n_threads     = std::min(4, (int32_t) std::thread::hardware_concurrency());
+    int32_t n_predict     = 128;  // new tokens to predict
+    int32_t repeat_last_n = 64;   // last n tokens to penalize
+    int32_t n_parts       = -1;   // amount of model parts (-1 = determine from model dimensions)
+    int32_t n_ctx         = 512;  // context size
+    int32_t n_batch       = 8;    // batch size for prompt processing
+    int32_t n_keep        = 0;    // number of tokens to keep from initial prompt

    // sampling parameters
    int32_t top_k = 40;
@@ -31,17 +31,11 @@ struct gpt_params {

    std::string model  = "models/lamma-7B/ggml-model.bin"; // model path
    std::string prompt = "";
-    std::string input_prefix = ""; // string to prefix user inputs with
-
-    std::string instruct_prefix = ""; // prefix user inputs with tokenized string
-    bool instruct_prefix_bos = false; // prepend bos token to instruct prefix
-    std::string instruct_suffix = ""; // suffix user inputs with tokenized string
-    bool instruct_suffix_bos = false; // prepend bos token to instruct suffix
-
+    std::string input_prefix = "";       // string to prefix user inputs with
    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
-    std::vector<std::string> stopprompt; // string upon seeing which more user input is prompted (without adding instruct prefixes and suffixes)

-    bool rm_trailing_space_workaround = false; // workaround for removing trailing space from reverse/stop prompts
+    std::string lora_adapter = "";  // lora adapter path
+    std::string lora_base = "";     // base model path for the lora adapter

    bool memory_f16        = true;  // use f16 instead of f32 for memory kv
    bool random_prompt     = false; // do not randomize prompt if none provided
@@ -58,14 +52,11 @@ struct gpt_params {
    bool use_mlock         = false; // use mlock to keep model in memory
    bool mem_test          = false; // compute maximum memory usage
    bool verbose_prompt    = false; // print prompt tokens before generation
-
-    bool clean_interface   = false; // hides input prefix & suffix and displays '>'
-    bool multiline_mode    = true; // enables multi-line mode, to send input press CTRL+D on Linux/Max, Ctrl+Z then Return on Windows
 };

 bool gpt_params_parse(int argc, char ** argv, gpt_params & params);

-void gpt_print_usage(char * argv_0, const gpt_params & params);
+void gpt_print_usage(int argc, char ** argv, const gpt_params & params);

 std::string gpt_random_prompt(std::mt19937 & rng);

@@ -105,5 +96,3 @@ void set_console_color(console_state & con_st, console_color_t color);
 void win32_console_init(bool enable_color);
 void win32_utf8_encode(const std::wstring & wstr, std::string & str);
 #endif
-
-bool get_input_text(std::string & input_text, bool escape_newline_mode);
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -1,6 +1,8 @@
 #include "common.h"
 #include "llama.h"

+#include <ctime>
+
 int main(int argc, char ** argv) {
    gpt_params params;
    params.model = "models/llama-7B/ggml-model.bin";
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -11,6 +11,7 @@
 #include <cmath>
 #include <cstdio>
 #include <cstring>
+#include <ctime>
 #include <fstream>
 #include <iostream>
 #include <string>
@@ -30,8 +31,7 @@ static bool is_interacting = false;
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
 void sigint_handler(int signo) {
    set_console_color(con_st, CONSOLE_COLOR_DEFAULT);
-    fflush(stdout);
-    fflush(stderr);
+    printf("\n"); // this also force flush stdout.
    if (signo == SIGINT) {
        if (!is_interacting) {
            is_interacting=true;
@@ -90,8 +90,6 @@ int main(int argc, char ** argv) {
        params.prompt = gpt_random_prompt(rng);
    }

-    bool instruct_mode = !params.instruct_prefix.empty() || !params.instruct_suffix.empty();
-
 //    params.prompt = R"(// this function checks if the number n is prime
 //bool is_prime(int n) {)";

@@ -116,6 +114,17 @@ int main(int argc, char ** argv) {
        }
    }

+    if (!params.lora_adapter.empty()) {
+        int err = llama_apply_lora_from_file(ctx,
+                                             params.lora_adapter.c_str(),
+                                             params.lora_base.empty() ? NULL : params.lora_base.c_str(),
+                                             params.n_threads);
+        if (err != 0) {
+            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+            return 1;
+        }
+    }
+
    // print system information
    {
        fprintf(stderr, "\n");
@@ -156,20 +165,22 @@ int main(int argc, char ** argv) {
    }

    // number of tokens to keep when resetting context
-    if (params.n_keep < 0 || params.n_keep > (int)embd_inp.size()) {
+    if (params.n_keep < 0 || params.n_keep > (int)embd_inp.size() || params.instruct) {
        params.n_keep = (int)embd_inp.size();
    }

    // prefix & suffix for instruct mode
-    const auto inp_pfx = ::llama_tokenize(ctx, params.instruct_prefix, params.instruct_prefix_bos);
-    std::string instruct_suffix = params.instruct_suffix;
-    if (params.rm_trailing_space_workaround) {
-        if (instruct_suffix.back() == ' ') { instruct_suffix.pop_back(); }
+    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true);
+    const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false);
+
+    // in instruct mode, we inject a prefix and a suffix to each input by the user
+    if (params.instruct) {
+        params.interactive_start = true;
+        params.antiprompt.push_back("### Instruction:\n\n");
    }
-    const auto inp_sfx = ::llama_tokenize(ctx, instruct_suffix, params.instruct_suffix_bos);

    // enable interactive mode if reverse prompt or interactive start is specified
-    if (params.antiprompt.size() != 0 || params.stopprompt.size() != 0 || params.interactive_start) {
+    if (params.antiprompt.size() != 0 || params.interactive_start) {
        params.interactive = true;
    }

@@ -211,21 +222,10 @@ int main(int argc, char ** argv) {
                fprintf(stderr, "Reverse prompt: '%s'\n", antiprompt.c_str());
            }
        }
-        if (params.stopprompt.size()) {
-            for (auto stopprompt : params.stopprompt) {
-                fprintf(stderr, "Stop prompt: '%s'\n", stopprompt.c_str());
-            }
-        }

        if (!params.input_prefix.empty()) {
            fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str());
        }
-        if (!params.instruct_prefix.empty()) {
-            fprintf(stderr, "Instruct prefix %s: '%s'\n", params.instruct_prefix_bos ? "(with bos token)" : "", params.instruct_prefix.c_str());
-        }
-        if (!params.instruct_suffix.empty()) {
-            fprintf(stderr, "Instruct suffix %s: '%s'\n", params.instruct_suffix_bos ? "(with bos token)" : "", params.instruct_suffix.c_str());
-        }
    }
    fprintf(stderr, "sampling: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n",
        params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
@@ -241,29 +241,12 @@ int main(int argc, char ** argv) {
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
               " - Press Ctrl+C to interject at any time.\n"
 #endif
-        );
-        if (params.multiline_mode) {
-            fprintf(stderr, " - Press Return to return control to LLaMa.\n"
-#if defined (_WIN32)
-                            " - [MULTILINE MODE] Press Ctrl+Z then Return (EOF) to toggle.\n\n");
-#else
-                            " - [MULTILINE MODE] Press Ctrl+D (EOF) to toggle.\n\n");
-#endif
-        }
-        else {
-            fprintf(stderr, " - Press Return to return control to LLaMa.\n"
-                            " - If you want to submit another line, end your input in '\\'.\n\n");
-        }
+               " - Press Return to return control to LLaMa.\n"
+               " - If you want to submit another line, end your input in '\\'.\n\n");
        is_interacting = params.interactive_start;
    }

-    struct Antiprompt {
-        bool any = false;
-        bool trailing_space = false;
-        size_t len;
-        bool is_stop_prompt = false;
-    } antiprompt;
-
+    bool is_antiprompt = false;
    bool input_noecho  = false;

    int n_past     = 0;
@@ -333,7 +316,7 @@ int main(int argc, char ** argv) {
            }

            // replace end of text token with newline token when in interactive mode
-            if (id == llama_token_eos() && params.interactive && !instruct_mode) {
+            if (id == llama_token_eos() && params.interactive && !params.instruct) {
                id = llama_token_newline.front();
                if (params.antiprompt.size() != 0) {
                    // tokenize and inject first reverse prompt
@@ -379,72 +362,27 @@ int main(int argc, char ** argv) {
        // check if we should prompt the user for more
        if (params.interactive && (int) embd_inp.size() <= n_consumed) {

-            // check for reverse prompt or stop prompt
-            if (params.antiprompt.size() || params.stopprompt.size()) {
+            // check for reverse prompt
+            if (params.antiprompt.size()) {
                std::string last_output;
                for (auto id : last_n_tokens) {
                    last_output += llama_token_to_str(ctx, id);
                }

-                antiprompt.any = false;
-                antiprompt.is_stop_prompt = false;
+                is_antiprompt = false;
                // Check if each of the reverse prompts appears at the end of the output.
-                for (std::string & prompt : params.antiprompt) {
-                    if (params.rm_trailing_space_workaround) {
-                        antiprompt.trailing_space = prompt.back() == ' ';
-                        antiprompt.len = prompt.length() - (antiprompt.trailing_space ? 1 : 0);
-                    }
-                    if (last_output.find(prompt.c_str(), last_output.length() - antiprompt.len, antiprompt.len) != std::string::npos) {
+                for (std::string & antiprompt : params.antiprompt) {
+                    if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) {
                        is_interacting = true;
-                        antiprompt.any = true;
+                        is_antiprompt = true;
                        set_console_color(con_st, CONSOLE_COLOR_USER_INPUT);
                        fflush(stdout);
                        break;
                    }
                }
-                if (!antiprompt.any) {
-                    for (std::string & prompt : params.stopprompt) {
-                        if (params.rm_trailing_space_workaround) {
-                            antiprompt.trailing_space = prompt.back() == ' ';
-                            antiprompt.len = prompt.length() - (antiprompt.trailing_space ? 1 : 0);
-                        }
-                        if (last_output.find(prompt.c_str(), last_output.length() - antiprompt.len, antiprompt.len) != std::string::npos) {
-                            is_interacting = true;
-                            antiprompt.any = true;
-                            antiprompt.is_stop_prompt = true;
-                            set_console_color(con_st, CONSOLE_COLOR_USER_INPUT);
-                            fflush(stdout);
-                            break;
-                        }
-                    }
-                }
            }

-            if (n_past > 0 && is_interacting)
-            {
-                std::string buffer;
-                if (!params.clean_interface && !params.instruct_prefix.empty() && !antiprompt.any) {
-                    // avoid printing again user's new line (TODO: try to revert enter press and print newline)
-                    int i = params.instruct_prefix.front() == '\n' ? 1 : 0;
-                    for (; i < inp_pfx.size(); i++) {
-                        printf("%s", llama_token_to_str(ctx, inp_pfx[i]));
-                    }
-                    fflush(stdout);
-                }
-                if (params.rm_trailing_space_workaround) {
-                    // add only if not stopprompt (as stopprompt could be used to pause
-                        //     assistant and then continue without input - adding back trailing
-                        //     space may mess it up.)
-                    if (!antiprompt.is_stop_prompt && antiprompt.any && antiprompt.trailing_space) {
-                        // add back removed trailing space to buffer(workaround)
-                        buffer += ' ';
-                        if (!params.clean_interface) {
-                            printf("%s", buffer.c_str());
-                        }
-                        fflush(stdout);
-                    }
-                }
-
+            if (n_past > 0 && is_interacting) {
                // potentially set color to indicate we are taking user input
                set_console_color(con_st, CONSOLE_COLOR_USER_INPUT);

@@ -453,45 +391,49 @@ int main(int argc, char ** argv) {
                signal(SIGINT, sigint_handler);
 #endif

-                if (params.clean_interface) {
+                if (params.instruct) {
                    printf("\n> ");
                }

+                std::string buffer;
                if (!params.input_prefix.empty()) {
                    buffer += params.input_prefix;
                    printf("%s", buffer.c_str());
                }

-                if (!get_input_text(buffer, params.multiline_mode)) {
-                    // input stream is bad
-                    return 1;
-                }
-                if (!antiprompt.is_stop_prompt) {
-                    buffer += "\n";
-                }
+                std::string line;
+                bool another_line = true;
+                do {
+#if defined(_WIN32)
+                    std::wstring wline;
+                    if (!std::getline(std::wcin, wline)) {
+                        // input stream is bad or EOF received
+                        return 0;
+                    }
+                    win32_utf8_encode(wline, line);
+#else
+                    if (!std::getline(std::cin, line)) {
+                        // input stream is bad or EOF received
+                        return 0;
+                    }
+#endif
+                    if (line.empty() || line.back() != '\\') {
+                        another_line = false;
+                    } else {
+                        line.pop_back(); // Remove the continue character
+                    }
+                    buffer += line + '\n'; // Append the line to the result
+                } while (another_line);

                // done taking input, reset color
                set_console_color(con_st, CONSOLE_COLOR_DEFAULT);

-                if (!params.clean_interface && !params.instruct_suffix.empty() && !antiprompt.is_stop_prompt) {
-                    // avoid printing again user's new line (TODO: try to revert enter press and print newline)
-                    int i = params.instruct_suffix.front() == '\n' ? 1 : 0;
-                    for (; i < inp_sfx.size(); i++) {
-                        printf("%s", llama_token_to_str(ctx, inp_sfx[i]));
-                    }
-                    // if (remove trailing space workaround) {
-                    //     We won't add back removed trailing space here, because assistant continues here,
-                    //         and it may mess up it's output (remove trailing space workaround).
-                    // }
-                    fflush(stdout);
-                }
-
                // Add tokens to embd only if the input buffer is non-empty
                // Entering a empty line lets the user pass control back
                if (buffer.length() > 1) {

-                    // insert input prefix
-                    if (!params.instruct_prefix.empty() && !antiprompt.any) {
+                    // instruct mode: insert instruction prefix
+                    if (params.instruct && !is_antiprompt) {
                        n_consumed = embd_inp.size();
                        embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
                    }
@@ -499,8 +441,8 @@ int main(int argc, char ** argv) {
                    auto line_inp = ::llama_tokenize(ctx, buffer, false);
                    embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());

-                    // insert response suffix
-                    if (!params.instruct_suffix.empty() && !antiprompt.is_stop_prompt) {
+                    // instruct mode: insert response suffix
+                    if (params.instruct) {
                        embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
                    }

@@ -517,7 +459,7 @@ int main(int argc, char ** argv) {

        // end of text token
        if (!embd.empty() && embd.back() == llama_token_eos()) {
-            if (instruct_mode) {
+            if (params.instruct) {
                is_interacting = true;
            } else {
                fprintf(stderr, " [end of text]\n");
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -2,6 +2,7 @@
 #include "llama.h"

 #include <cmath>
+#include <ctime>

 std::vector<float> softmax(const std::vector<float>& logits) {
    std::vector<float> probs(logits.size());
@@ -133,6 +134,17 @@ int main(int argc, char ** argv) {
        }
    }

+    if (!params.lora_adapter.empty()) {
+        int err = llama_apply_lora_from_file(ctx,
+                                             params.lora_adapter.c_str(),
+                                             params.lora_base.empty() ? NULL : params.lora_base.c_str(),
+                                             params.n_threads);
+        if (err != 0) {
+            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+            return 1;
+        }
+    }
+
    // print system information
    {
        fprintf(stderr, "\n");
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -221,7 +221,7 @@ int main(int argc, char ** argv) {
                break;
            }
            int j;
-            for (j = 0; j < GGML_TYPE_COUNT && strcmp(argv[i], ggml_type_name((ggml_type) i)) != 0; j++) {
+            for (j = 0; j < GGML_TYPE_COUNT && strcmp(argv[i], ggml_type_name((ggml_type) j)) != 0; j++) {
                // find match
            }
            if (j < GGML_TYPE_COUNT) {
--- a/ggml.c
+++ b/ggml.c
--- a/ggml.h
+++ b/ggml.h
@@ -204,6 +204,7 @@ enum ggml_type {
    GGML_TYPE_F16  = 1,
    GGML_TYPE_Q4_0 = 2,
    GGML_TYPE_Q4_1 = 3,
+    GGML_TYPE_Q8_0 = 4,
    GGML_TYPE_I8,
    GGML_TYPE_I16,
    GGML_TYPE_I32,
@@ -429,6 +430,12 @@ struct ggml_tensor * ggml_add(
        struct ggml_tensor  * a,
        struct ggml_tensor  * b);

+
+struct ggml_tensor * ggml_add_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b);
+
 struct ggml_tensor * ggml_sub(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
@@ -807,6 +814,8 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t *
 int ggml_cpu_has_avx(void);
 int ggml_cpu_has_avx2(void);
 int ggml_cpu_has_avx512(void);
+int ggml_cpu_has_avx512_vbmi(void);
+int ggml_cpu_has_avx512_vnni(void);
 int ggml_cpu_has_fma(void);
 int ggml_cpu_has_neon(void);
 int ggml_cpu_has_arm_fma(void);
@@ -836,6 +845,7 @@ typedef struct {
    dequantize_row_q_t dequantize_row_q;
    quantize_row_q_t   quantize_row_q;
    quantize_row_q_t   quantize_row_q_reference;
+    quantize_row_q_t   quantize_row_q_dot;
    vec_dot_q_t        vec_dot_q;
 } quantize_fns_t;

--- a/llama.cpp
+++ b/llama.cpp
@@ -1,6 +1,8 @@
 // Defines fileno on msys:
 #ifndef _GNU_SOURCE
 #define _GNU_SOURCE
+#include <cstdint>
+#include <cstdio>
 #endif

 #include "llama_util.h"
@@ -9,6 +11,7 @@
 #include "ggml.h"

 #include <array>
+#include <ctime>
 #include <cinttypes>
 #include <fstream>
 #include <random>
@@ -41,35 +44,51 @@ static const size_t MB = 1024*1024;
 // TODO: dynamically determine these sizes
 //       needs modifications in ggml

-static const std::map<e_model, size_t> MEM_REQ_SCRATCH0 = {
-    { MODEL_7B,    512ull*MB },
-    { MODEL_13B,   512ull*MB },
-    { MODEL_30B,   512ull*MB },
-    { MODEL_65B,   512ull*MB },
-};
+static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
+{
+    static std::map<e_model, size_t> _MEM_REQ_SCRATCH0 = {
+        { MODEL_7B,    512ull * MB },
+        { MODEL_13B,   512ull * MB },
+        { MODEL_30B,   512ull * MB },
+        { MODEL_65B,   512ull * MB },
+    };
+    return _MEM_REQ_SCRATCH0;
+}

-static const std::map<e_model, size_t> MEM_REQ_SCRATCH1 = {
-    { MODEL_7B,    512ull*MB },
-    { MODEL_13B,   512ull*MB },
-    { MODEL_30B,   512ull*MB },
-    { MODEL_65B,   512ull*MB },
+static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
+{
+    static std::map<e_model, size_t> _MEM_REQ_SCRATCH1 = {
+        { MODEL_7B,    512ull * MB },
+        { MODEL_13B,   512ull * MB },
+        { MODEL_30B,   512ull * MB },
+        { MODEL_65B,   512ull * MB },
+    };
+    return _MEM_REQ_SCRATCH1;
 };

 // 2*n_embd*n_ctx*n_layer*sizeof(float16)
-static const std::map<e_model, size_t> MEM_REQ_KV_SELF = {
-    { MODEL_7B,   1026ull*MB },
-    { MODEL_13B,  1608ull*MB },
-    { MODEL_30B,  3124ull*MB },
-    { MODEL_65B,  5120ull*MB },
+static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
+{
+    static std::map<e_model, size_t> _MEM_REQ_KV_SELF = {
+        { MODEL_7B,   1026ull * MB },
+        { MODEL_13B,  1608ull * MB },
+        { MODEL_30B,  3124ull * MB },
+        { MODEL_65B,  5120ull * MB },
+    };
+    return _MEM_REQ_KV_SELF;
 };

 // this is mostly needed for temporary mul_mat buffers to dequantize the data
 // not actually needed if BLAS is disabled
-static const std::map<e_model, size_t> MEM_REQ_EVAL = {
-    { MODEL_7B,   768ull*MB },
-    { MODEL_13B, 1024ull*MB },
-    { MODEL_30B, 1280ull*MB },
-    { MODEL_65B, 1536ull*MB },
+static const std::map<e_model, size_t> & MEM_REQ_EVAL()
+{
+    static std::map<e_model, size_t> _MEM_REQ_EVAL = {
+        { MODEL_7B,   768ull * MB },
+        { MODEL_13B, 1024ull * MB },
+        { MODEL_30B, 1280ull * MB },
+        { MODEL_65B, 1536ull * MB },
+    };
+    return _MEM_REQ_EVAL;
 };

 // default hparams (LLaMA 7B)
@@ -261,12 +280,12 @@ static size_t checked_div(size_t a, size_t b) {
 }

 static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
-    std::string ret = "[" + std::to_string(ne.at(0));
+    char buf[256];
+    snprintf(buf, sizeof(buf), "%5u", ne.at(0));
    for (size_t i = 1; i < ne.size(); i++) {
-        ret += " x " + std::to_string(ne.at(i));
+        snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), " x %5u", ne.at(i));
    }
-    ret += "]";
-    return ret;
+    return buf;
 }

 static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml_type type) {
@@ -616,6 +635,7 @@ struct llama_model_loader {
            throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
                         name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
        }
+
        return get_tensor_for(lt);
    }

@@ -898,13 +918,13 @@ static void llama_model_load_internal(
        const size_t mem_required =
            ctx_size +
            mmapped_size +
-            MEM_REQ_SCRATCH0.at(model.type) +
-            MEM_REQ_SCRATCH1.at(model.type) +
-            MEM_REQ_EVAL.at    (model.type);
+            MEM_REQ_SCRATCH0().at(model.type) +
+            MEM_REQ_SCRATCH1().at(model.type) +
+            MEM_REQ_EVAL().at(model.type);

        // this is the memory required by one llama_state
        const size_t mem_required_state =
-            scale*MEM_REQ_KV_SELF.at(model.type);
+            scale*MEM_REQ_KV_SELF().at(model.type);

        fprintf(stderr, "%s: mem required  = %7.2f MB (+ %7.2f MB per state)\n", __func__,
                mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
@@ -941,8 +961,8 @@ static void llama_model_load_internal(
        ml->ggml_ctx = ctx;

        model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
-        model.norm   = ml->get_tensor("norm.weight", {n_embd});
-        model.output = ml->get_tensor("output.weight", {n_embd, n_vocab});
+        model.norm           = ml->get_tensor("norm.weight",           {n_embd});
+        model.output         = ml->get_tensor("output.weight",         {n_embd, n_vocab});

        model.layers.resize(n_layer);
        for (uint32_t i = 0; i < n_layer; ++i) {
@@ -1569,7 +1589,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        tensor.data = read_data.addr;
        model_loader->load_data_for(tensor);

-        printf("[%zu/%zu] %36s - %s, type = %6s, ",
+        printf("[%4zu/%4zu] %36s - %16s, type = %6s, ",
               ++idx, model_loader->tensors_map.tensors.size(),
               tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(),
               ggml_type_name(tensor.type));
@@ -1731,10 +1751,10 @@ struct llama_context * llama_init_from_file(
            ctx->embedding.resize(hparams.n_embd);
        }

-        ctx->buf_compute.resize(MEM_REQ_EVAL.at(ctx->model.type));
+        ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));

-        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0.at(ctx->model.type));
-        ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1.at(ctx->model.type));
+        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
+        ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
    }

    return ctx;
@@ -1757,6 +1777,254 @@ int llama_model_quantize(
    }
 }

+int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
+    fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
+
+    auto & model = ctx->model;
+
+    const int64_t t_start_lora_us = ggml_time_us();
+
+    auto fin = std::ifstream(path_lora, std::ios::binary);
+    if (!fin) {
+        fprintf(stderr, "%s: failed to open '%s'\n", __func__, path_lora);
+        return 1;
+    }
+
+    // verify magic and version
+    {
+        uint32_t magic;
+        fin.read((char *) &magic, sizeof(magic));
+        if (magic != 'ggla') {
+            fprintf(stderr, "%s: bad file magic\n", __func__);
+            return 1;
+        }
+        uint32_t format_version;
+        fin.read((char *) &format_version, sizeof(format_version));
+
+        if (format_version != 1) {
+            fprintf(stderr, "%s: unsupported file version\n", __func__ );
+            return 1;
+        }
+    }
+
+    int32_t lora_r;
+    int32_t lora_alpha;
+    fin.read((char *) &lora_r, sizeof(lora_r));
+    fin.read((char *) &lora_alpha, sizeof(lora_alpha));
+    float scaling = (float)lora_alpha / (float)lora_r;
+
+    fprintf(stderr, "%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
+
+
+    // create a temporary ggml context to store the lora tensors
+    // todo: calculate size from biggest possible tensor
+    std::vector<uint8_t> lora_buf(1024ull * 1024ull * 1024ull);
+    struct ggml_init_params params;
+    params.mem_size   = lora_buf.size();
+    params.mem_buffer = lora_buf.data();
+    params.no_alloc   = false;
+
+    ggml_context * lora_ctx = ggml_init(params);
+    std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
+
+    // create a name -> tensor map of the model to accelerate lookups
+    std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
+    for (auto & kv: model.tensors_by_name) {
+        model_tensors.insert(kv);
+    }
+
+
+    // load base model
+    std::unique_ptr<llama_model_loader> model_loader;
+    ggml_context * base_ctx = NULL;
+    llama_buffer base_buf;
+    if (path_base_model) {
+        fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
+        model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
+
+        size_t ctx_size, mmapped_size;
+        model_loader->calc_sizes(&ctx_size, &mmapped_size);
+        base_buf.resize(ctx_size);
+
+        ggml_init_params base_params;
+        base_params.mem_size   = base_buf.size;
+        base_params.mem_buffer = base_buf.addr;
+        base_params.no_alloc   = model_loader->use_mmap;
+
+        base_ctx = ggml_init(base_params);
+
+        model_loader->ggml_ctx = base_ctx;
+
+        // maybe this should in llama_model_loader
+        if (model_loader->use_mmap) {
+            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false));
+        }
+    }
+
+    // read tensors and apply
+    bool warned = false;
+    int n_tensors = 0;
+    while (true) {
+        int32_t n_dims;
+        int32_t length;
+        int32_t ftype;
+
+        fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
+        fin.read(reinterpret_cast<char *>(&length), sizeof(length));
+        fin.read(reinterpret_cast<char *>(&ftype),  sizeof(ftype));
+        if (fin.eof()) {
+            break;
+        }
+
+        int32_t ne[2] = { 1, 1 };
+        for (int i = 0; i < n_dims; ++i) {
+            fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
+        }
+
+        std::string name(length, 0);
+        fin.read(&name[0], length);
+
+        // check for lora suffix and get the type of tensor
+        const std::string lora_suffix = ".lora";
+        size_t pos = name.rfind(lora_suffix);
+        if (pos == std::string::npos) {
+            fprintf(stderr, "%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
+            return 1;
+        }
+
+        std::string lora_type = name.substr(pos + lora_suffix.length());
+        std::string base_name = name;
+        base_name.erase(pos);
+        // fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
+
+        if (model_tensors.find(base_name.data()) == model_tensors.end()) {
+            fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
+            return 1;
+        }
+
+        // create ggml tensor
+        ggml_type wtype;
+        switch (ftype) {
+            case 0: wtype = GGML_TYPE_F32;  break;
+            case 1: wtype = GGML_TYPE_F16;  break;
+            default:
+                    {
+                        fprintf(stderr, "%s: invalid tensor data type '%d'\n",
+                                __func__, ftype);
+                        return false;
+                    }
+        }
+        ggml_tensor* lora_tensor;
+        if (n_dims == 2) {
+            lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
+        }
+        else {
+            fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims);
+            return 1;
+        }
+
+        // load tensor data
+        size_t offset = fin.tellg();
+        size_t tensor_data_size = ggml_nbytes(lora_tensor);
+        offset = (offset + 31) & -32;
+        fin.seekg(offset);
+        fin.read((char*)lora_tensor->data, tensor_data_size);
+
+        lora_tensors[name] = lora_tensor;
+
+        // check if we have both A and B tensors and apply
+        if (lora_tensors.find(base_name + ".loraA") != lora_tensors.end() &&
+            lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {
+
+            ggml_tensor * dest_t = model_tensors[base_name];
+            ggml_tensor * base_t;
+            if (model_loader) {
+                // load from base model
+                if (model_loader->tensors_map.name_to_idx.find(base_name) == model_loader->tensors_map.name_to_idx.end()) {
+                    fprintf(stderr, "%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
+                    return 1;
+                }
+                size_t idx = model_loader->tensors_map.name_to_idx[base_name];
+                llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
+                base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] });
+                lt.data = (uint8_t *) lt.ggml_tensor->data;
+                model_loader->load_data_for(lt);
+                lt.ggml_tensor->data = lt.data;
+            }
+            else {
+                base_t = dest_t;
+            }
+
+            if (base_t->type == GGML_TYPE_Q4_0 || base_t->type == GGML_TYPE_Q4_1) {
+                if (!warned) {
+                    fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, "
+                                    "use a f16 or f32 base model with --lora-base\n", __func__);
+                    warned = true;
+                }
+            }
+
+            ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
+            ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
+
+            if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
+                fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
+                               " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
+                return 1;
+            }
+
+            // w = w + BA*s
+            ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
+
+            if (scaling != 1.0f) {
+                ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
+                BA = ggml_scale(lora_ctx, BA, scale_tensor);
+            }
+
+            ggml_tensor * r;
+            if (base_t == dest_t) {
+                r = ggml_add_inplace(lora_ctx, dest_t, BA);
+            }
+            else {
+                r = ggml_add(lora_ctx, base_t, BA);
+                r = ggml_cpy(lora_ctx, r, dest_t);
+            }
+
+            struct ggml_cgraph gf = ggml_build_forward(r);
+            gf.n_threads = n_threads;
+            ggml_graph_compute(lora_ctx, &gf);
+
+            // we won't need these tensors again, reset the context to save memory
+            ggml_free(lora_ctx);
+            lora_ctx = ggml_init(params);
+            lora_tensors.clear();
+
+            n_tensors++;
+            if (n_tensors % 4 == 0)
+                fprintf(stderr, ".");
+        }
+    }
+
+    // TODO: this should be in a destructor, it will leak on failure
+    ggml_free(lora_ctx);
+    if (base_ctx) {
+        ggml_free(base_ctx);
+    }
+
+    const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
+    fprintf(stderr, " done (%.2f ms)\n", t_lora_us / 1000.0);
+
+    return 0;
+}
+
+int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
+    try {
+        return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
+    } catch (const std::string & err) {
+        fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.c_str());
+        return 1;
+    }
+}
+
 // Returns the KV cache that will contain the context for the
 // ongoing prediction with the model.
 const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
@@ -1914,18 +2182,20 @@ const char * llama_print_system_info(void) {
    static std::string s;

    s  = "";
-    s += "AVX = "       + std::to_string(ggml_cpu_has_avx())       + " | ";
-    s += "AVX2 = "      + std::to_string(ggml_cpu_has_avx2())      + " | ";
-    s += "AVX512 = "    + std::to_string(ggml_cpu_has_avx512())    + " | ";
-    s += "FMA = "       + std::to_string(ggml_cpu_has_fma())       + " | ";
-    s += "NEON = "      + std::to_string(ggml_cpu_has_neon())      + " | ";
-    s += "ARM_FMA = "   + std::to_string(ggml_cpu_has_arm_fma())   + " | ";
-    s += "F16C = "      + std::to_string(ggml_cpu_has_f16c())      + " | ";
-    s += "FP16_VA = "   + std::to_string(ggml_cpu_has_fp16_va())   + " | ";
-    s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
-    s += "BLAS = "      + std::to_string(ggml_cpu_has_blas())      + " | ";
-    s += "SSE3 = "      + std::to_string(ggml_cpu_has_sse3())      + " | ";
-    s += "VSX = "       + std::to_string(ggml_cpu_has_vsx())       + " | ";
+    s += "AVX = "         + std::to_string(ggml_cpu_has_avx())         + " | ";
+    s += "AVX2 = "        + std::to_string(ggml_cpu_has_avx2())        + " | ";
+    s += "AVX512 = "      + std::to_string(ggml_cpu_has_avx512())      + " | ";
+    s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
+    s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
+    s += "FMA = "         + std::to_string(ggml_cpu_has_fma())         + " | ";
+    s += "NEON = "        + std::to_string(ggml_cpu_has_neon())        + " | ";
+    s += "ARM_FMA = "     + std::to_string(ggml_cpu_has_arm_fma())     + " | ";
+    s += "F16C = "        + std::to_string(ggml_cpu_has_f16c())        + " | ";
+    s += "FP16_VA = "     + std::to_string(ggml_cpu_has_fp16_va())     + " | ";
+    s += "WASM_SIMD = "   + std::to_string(ggml_cpu_has_wasm_simd())   + " | ";
+    s += "BLAS = "        + std::to_string(ggml_cpu_has_blas())        + " | ";
+    s += "SSE3 = "        + std::to_string(ggml_cpu_has_sse3())        + " | ";
+    s += "VSX = "         + std::to_string(ggml_cpu_has_vsx())         + " | ";

    return s.c_str();
 }
--- a/llama.h
+++ b/llama.h
@@ -96,6 +96,18 @@ extern "C" {
            const char * fname_out,
      enum llama_ftype   ftype);

+    // Apply a LoRA adapter to a loaded model
+    // path_base_model is the path to a higher quality model to use as a base for
+    // the layers modified by the adapter. Can be NULL to use the current loaded model.
+    // The model needs to be reloaded before applying a new adapter, otherwise the adapter
+    // will be applied on top of the previous one
+    // Returns 0 on success
+    LLAMA_API int llama_apply_lora_from_file(
+            struct llama_context * ctx,
+                      const char * path_lora,
+                      const char * path_base_model,
+                             int   n_threads);
+
    // Returns the KV cache that will contain the context for the
    // ongoing prediction with the model.
    LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);
--- a/llama_util.h
+++ b/llama_util.h
@@ -43,8 +43,12 @@
    } while (0)

 #ifdef __GNUC__
+#ifdef __MINGW32__
+__attribute__((format(gnu_printf, 1, 2)))
+#else
 __attribute__((format(printf, 1, 2)))
 #endif
+#endif
 static std::string format(const char * fmt, ...) {
    va_list ap, ap2;
    va_start(ap, fmt);
@@ -57,7 +61,7 @@ static std::string format(const char * fmt, ...) {
    va_end(ap2);
    va_end(ap);
    return std::string(buf.data(), size);
-};
+}

 struct llama_file {
    // use FILE * so we don't have to re-open the file to mmap
@@ -164,7 +168,7 @@ struct llama_mmap {
 #ifdef _POSIX_MAPPED_FILES
    static constexpr bool SUPPORTED = true;

-    llama_mmap(struct llama_file * file) {
+    llama_mmap(struct llama_file * file, bool prefetch = true) {
        size = file->size;
        int fd = fileno(file->fp);
        int flags = MAP_SHARED;
@@ -172,15 +176,16 @@ struct llama_mmap {
        flags |= MAP_POPULATE;
 #endif
        addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
-        close(fd);
        if (addr == MAP_FAILED) {
            throw format("mmap failed: %s", strerror(errno));
        }

-        // Advise the kernel to preload the mapped memory
-        if (madvise(addr, file->size, MADV_WILLNEED)) {
-            fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
-                    strerror(errno));
+        if (prefetch) {
+            // Advise the kernel to preload the mapped memory
+            if (madvise(addr, file->size, MADV_WILLNEED)) {
+                fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
+                        strerror(errno));
+            }
        }
    }

@@ -190,14 +195,13 @@ struct llama_mmap {
 #elif defined(_WIN32)
    static constexpr bool SUPPORTED = true;

-    llama_mmap(struct llama_file * file) {
+    llama_mmap(struct llama_file * file, bool prefetch = true) {
        size = file->size;

        HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));

        HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
        DWORD error = GetLastError();
-        CloseHandle(hFile);

        if (hMapping == NULL) {
            throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
@@ -212,13 +216,15 @@ struct llama_mmap {
        }

        #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
-        // Advise the kernel to preload the mapped memory
-        WIN32_MEMORY_RANGE_ENTRY range;
-        range.VirtualAddress = addr;
-        range.NumberOfBytes = (SIZE_T)size;
-        if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
-            fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
-                    llama_format_win_err(GetLastError()).c_str());
+        if (prefetch) {
+            // Advise the kernel to preload the mapped memory
+            WIN32_MEMORY_RANGE_ENTRY range;
+            range.VirtualAddress = addr;
+            range.NumberOfBytes = (SIZE_T)size;
+            if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
+                fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
+                        llama_format_win_err(GetLastError()).c_str());
+            }
        }
        #else
        #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
--- a/pocs/CMakeLists.txt
+++ b/pocs/CMakeLists.txt
@@ -0,0 +1,12 @@
+# dependencies
+
+find_package(Threads REQUIRED)
+
+# third-party
+
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+
+if (EMSCRIPTEN)
+else()
+    add_subdirectory(vdot)
+endif()
--- a/pocs/vdot/CMakeLists.txt
+++ b/pocs/vdot/CMakeLists.txt
@@ -0,0 +1,4 @@
+set(TARGET vdot)
+add_executable(${TARGET} vdot.cpp)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/pocs/vdot/vdot.cpp
+++ b/pocs/vdot/vdot.cpp
@@ -0,0 +1,305 @@
+#include <cstdio>
+#include <vector>
+#include <random>
+#include <chrono>
+#include <cstdlib>
+#include <cmath>
+#include <cassert>
+#include <cstring>
+#include <array>
+
+#include <ggml.h>
+
+constexpr int kVecSize = 1 << 18;
+
+float drawFromGaussianPdf(std::mt19937& rndm) {
+    constexpr double kScale = 1./(1. + std::mt19937::max());
+    constexpr double kTwoPiTimesScale = 6.28318530717958647692*kScale;
+    static float lastX;
+    static bool haveX = false;
+    if (haveX) { haveX = false; return lastX; }
+    auto r = sqrt(-2*log(1 - kScale*rndm()));
+    auto phi = kTwoPiTimesScale * rndm();
+    lastX = r*sin(phi);
+    haveX = true;
+    return r*cos(phi);
+}
+void fillRandomGaussianFloats(std::vector<float>& values, std::mt19937& rndm, float mean = 0) {
+    for (auto& v : values) v = mean + drawFromGaussianPdf(rndm);
+}
+
+// Copy-pasted from ggml.c
+#define QK4_0 32
+typedef struct {
+    float   d;          // delta
+    uint8_t qs[QK4_0 / 2];  // nibbles / quants
+} block_q4_0;
+static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding");
+
+#define QK4_1 32
+typedef struct {
+    float   d;          // delta
+    float   m;          // min
+    uint8_t qs[QK4_1 / 2];  // nibbles / quants
+} block_q4_1;
+static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
+
+// Copy-pasted from ggml.c
+#define QK8_0 32
+typedef struct {
+    float   d;          // delta
+    int8_t  qs[QK8_0];  // quants
+} block_q8_0;
+static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
+
+// "Scalar" dot product between the quantized vector x and float vector y
+inline double dot(int n, const block_q4_0* x, const float* y) {
+    const static float kValues[16] = {-8.f, -7.f, -6.f, -5.f, -4.f, -3.f, -2.f, -1.f, 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f};
+    constexpr uint32_t kMask1 = 0x0f0f0f0f;
+    uint32_t u1, u2;
+    auto q1 = (const uint8_t*)&u1;
+    auto q2 = (const uint8_t*)&u2;
+    double sum = 0;
+    for (int i=0; i<n; ++i) {
+        float d = x->d;
+        auto u = (const uint32_t*)x->qs;
+        float s = 0;
+        for (int k=0; k<4; ++k) {
+            u1 = u[k] & kMask1;
+            u2 = (u[k] >> 4) & kMask1;
+            s += y[0]*kValues[q1[0]] + y[1]*kValues[q2[0]] +
+                 y[2]*kValues[q1[1]] + y[3]*kValues[q2[1]] +
+                 y[4]*kValues[q1[2]] + y[5]*kValues[q2[2]] +
+                 y[6]*kValues[q1[3]] + y[7]*kValues[q2[3]];
+            y += 8;
+        }
+        sum += s*d;
+        ++x;
+    }
+    return sum;
+}
+// Alternative version of the above. Faster on my Mac (~45 us vs ~55 us per dot product),
+// but about the same on X86_64 (Ryzen 7950X CPU).
+inline double dot3(int n, const block_q4_0* x, const float* y) {
+    const static std::pair<float,float> kValues[256] = {
+        {-8.f, -8.f}, {-7.f, -8.f}, {-6.f, -8.f}, {-5.f, -8.f}, {-4.f, -8.f}, {-3.f, -8.f}, {-2.f, -8.f}, {-1.f, -8.f},
+        { 0.f, -8.f}, { 1.f, -8.f}, { 2.f, -8.f}, { 3.f, -8.f}, { 4.f, -8.f}, { 5.f, -8.f}, { 6.f, -8.f}, { 7.f, -8.f},
+        {-8.f, -7.f}, {-7.f, -7.f}, {-6.f, -7.f}, {-5.f, -7.f}, {-4.f, -7.f}, {-3.f, -7.f}, {-2.f, -7.f}, {-1.f, -7.f},
+        { 0.f, -7.f}, { 1.f, -7.f}, { 2.f, -7.f}, { 3.f, -7.f}, { 4.f, -7.f}, { 5.f, -7.f}, { 6.f, -7.f}, { 7.f, -7.f},
+        {-8.f, -6.f}, {-7.f, -6.f}, {-6.f, -6.f}, {-5.f, -6.f}, {-4.f, -6.f}, {-3.f, -6.f}, {-2.f, -6.f}, {-1.f, -6.f},
+        { 0.f, -6.f}, { 1.f, -6.f}, { 2.f, -6.f}, { 3.f, -6.f}, { 4.f, -6.f}, { 5.f, -6.f}, { 6.f, -6.f}, { 7.f, -6.f},
+        {-8.f, -5.f}, {-7.f, -5.f}, {-6.f, -5.f}, {-5.f, -5.f}, {-4.f, -5.f}, {-3.f, -5.f}, {-2.f, -5.f}, {-1.f, -5.f},
+        { 0.f, -5.f}, { 1.f, -5.f}, { 2.f, -5.f}, { 3.f, -5.f}, { 4.f, -5.f}, { 5.f, -5.f}, { 6.f, -5.f}, { 7.f, -5.f},
+        {-8.f, -4.f}, {-7.f, -4.f}, {-6.f, -4.f}, {-5.f, -4.f}, {-4.f, -4.f}, {-3.f, -4.f}, {-2.f, -4.f}, {-1.f, -4.f},
+        { 0.f, -4.f}, { 1.f, -4.f}, { 2.f, -4.f}, { 3.f, -4.f}, { 4.f, -4.f}, { 5.f, -4.f}, { 6.f, -4.f}, { 7.f, -4.f},
+        {-8.f, -3.f}, {-7.f, -3.f}, {-6.f, -3.f}, {-5.f, -3.f}, {-4.f, -3.f}, {-3.f, -3.f}, {-2.f, -3.f}, {-1.f, -3.f},
+        { 0.f, -3.f}, { 1.f, -3.f}, { 2.f, -3.f}, { 3.f, -3.f}, { 4.f, -3.f}, { 5.f, -3.f}, { 6.f, -3.f}, { 7.f, -3.f},
+        {-8.f, -2.f}, {-7.f, -2.f}, {-6.f, -2.f}, {-5.f, -2.f}, {-4.f, -2.f}, {-3.f, -2.f}, {-2.f, -2.f}, {-1.f, -2.f},
+        { 0.f, -2.f}, { 1.f, -2.f}, { 2.f, -2.f}, { 3.f, -2.f}, { 4.f, -2.f}, { 5.f, -2.f}, { 6.f, -2.f}, { 7.f, -2.f},
+        {-8.f, -1.f}, {-7.f, -1.f}, {-6.f, -1.f}, {-5.f, -1.f}, {-4.f, -1.f}, {-3.f, -1.f}, {-2.f, -1.f}, {-1.f, -1.f},
+        { 0.f, -1.f}, { 1.f, -1.f}, { 2.f, -1.f}, { 3.f, -1.f}, { 4.f, -1.f}, { 5.f, -1.f}, { 6.f, -1.f}, { 7.f, -1.f},
+        {-8.f,  0.f}, {-7.f,  0.f}, {-6.f,  0.f}, {-5.f,  0.f}, {-4.f,  0.f}, {-3.f,  0.f}, {-2.f,  0.f}, {-1.f,  0.f},
+        { 0.f,  0.f}, { 1.f,  0.f}, { 2.f,  0.f}, { 3.f,  0.f}, { 4.f,  0.f}, { 5.f,  0.f}, { 6.f,  0.f}, { 7.f,  0.f},
+        {-8.f,  1.f}, {-7.f,  1.f}, {-6.f,  1.f}, {-5.f,  1.f}, {-4.f,  1.f}, {-3.f,  1.f}, {-2.f,  1.f}, {-1.f,  1.f},
+        { 0.f,  1.f}, { 1.f,  1.f}, { 2.f,  1.f}, { 3.f,  1.f}, { 4.f,  1.f}, { 5.f,  1.f}, { 6.f,  1.f}, { 7.f,  1.f},
+        {-8.f,  2.f}, {-7.f,  2.f}, {-6.f,  2.f}, {-5.f,  2.f}, {-4.f,  2.f}, {-3.f,  2.f}, {-2.f,  2.f}, {-1.f,  2.f},
+        { 0.f,  2.f}, { 1.f,  2.f}, { 2.f,  2.f}, { 3.f,  2.f}, { 4.f,  2.f}, { 5.f,  2.f}, { 6.f,  2.f}, { 7.f,  2.f},
+        {-8.f,  3.f}, {-7.f,  3.f}, {-6.f,  3.f}, {-5.f,  3.f}, {-4.f,  3.f}, {-3.f,  3.f}, {-2.f,  3.f}, {-1.f,  3.f},
+        { 0.f,  3.f}, { 1.f,  3.f}, { 2.f,  3.f}, { 3.f,  3.f}, { 4.f,  3.f}, { 5.f,  3.f}, { 6.f,  3.f}, { 7.f,  3.f},
+        {-8.f,  4.f}, {-7.f,  4.f}, {-6.f,  4.f}, {-5.f,  4.f}, {-4.f,  4.f}, {-3.f,  4.f}, {-2.f,  4.f}, {-1.f,  4.f},
+        { 0.f,  4.f}, { 1.f,  4.f}, { 2.f,  4.f}, { 3.f,  4.f}, { 4.f,  4.f}, { 5.f,  4.f}, { 6.f,  4.f}, { 7.f,  4.f},
+        {-8.f,  5.f}, {-7.f,  5.f}, {-6.f,  5.f}, {-5.f,  5.f}, {-4.f,  5.f}, {-3.f,  5.f}, {-2.f,  5.f}, {-1.f,  5.f},
+        { 0.f,  5.f}, { 1.f,  5.f}, { 2.f,  5.f}, { 3.f,  5.f}, { 4.f,  5.f}, { 5.f,  5.f}, { 6.f,  5.f}, { 7.f,  5.f},
+        {-8.f,  6.f}, {-7.f,  6.f}, {-6.f,  6.f}, {-5.f,  6.f}, {-4.f,  6.f}, {-3.f,  6.f}, {-2.f,  6.f}, {-1.f,  6.f},
+        { 0.f,  6.f}, { 1.f,  6.f}, { 2.f,  6.f}, { 3.f,  6.f}, { 4.f,  6.f}, { 5.f,  6.f}, { 6.f,  6.f}, { 7.f,  6.f},
+        {-8.f,  7.f}, {-7.f,  7.f}, {-6.f,  7.f}, {-5.f,  7.f}, {-4.f,  7.f}, {-3.f,  7.f}, {-2.f,  7.f}, {-1.f,  7.f},
+        { 0.f,  7.f}, { 1.f,  7.f}, { 2.f,  7.f}, { 3.f,  7.f}, { 4.f,  7.f}, { 5.f,  7.f}, { 6.f,  7.f}, { 7.f,  7.f}
+    };
+    double sum = 0;
+    for (int i=0; i<n; ++i) {
+        float d = x->d;
+        auto q = x->qs;
+        float s = 0;
+        for (int k=0; k<4; ++k) {
+            s += y[0]*kValues[q[0]].first + y[1]*kValues[q[0]].second +
+                 y[2]*kValues[q[1]].first + y[3]*kValues[q[1]].second +
+                 y[4]*kValues[q[2]].first + y[5]*kValues[q[2]].second +
+                 y[6]*kValues[q[3]].first + y[7]*kValues[q[3]].second;
+            y += 8; q += 4;
+        }
+        sum += s*d;
+        ++x;
+    }
+    return sum;
+}
+
+inline double dot41(int n, const block_q4_1* x, const float* y) {
+    const static float kValues[16] = {0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f};
+    constexpr uint32_t kMask1 = 0x0f0f0f0f;
+    uint32_t u1, u2;
+    auto q1 = (const uint8_t*)&u1;
+    auto q2 = (const uint8_t*)&u2;
+    double sum = 0;
+    for (int i=0; i<n; ++i) {
+        auto u = (const uint32_t*)x->qs;
+        float s = 0, s1 = 0;
+        for (int k=0; k<4; ++k) {
+            u1 = u[k] & kMask1;
+            u2 = (u[k] >> 4) & kMask1;
+            s += y[0]*kValues[q1[0]] + y[1]*kValues[q2[0]] +
+                 y[2]*kValues[q1[1]] + y[3]*kValues[q2[1]] +
+                 y[4]*kValues[q1[2]] + y[5]*kValues[q2[2]] +
+                 y[6]*kValues[q1[3]] + y[7]*kValues[q2[3]];
+            s1 += y[0] + y[1] + y[2] + y[3] + y[4] + y[5] + y[6] + y[7];
+            y += 8;
+        }
+        sum += s*x->d + s1*x->m;
+        ++x;
+    }
+    return sum;
+}
+
+// Copy-pasted from ggml.c
+static void quantize_row_q8_0_reference(const float *x, block_q8_0 *y, int k) {
+    assert(k % QK8_0 == 0);
+    const int nb = k / QK8_0;
+
+    for (int i = 0; i < nb; i++) {
+        float amax = 0.0f; // absolute max
+
+        for (int l = 0; l < QK8_0; l++) {
+            const float v = x[i*QK8_0 + l];
+            amax = std::max(amax, fabsf(v));
+        }
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        y[i].d = d;
+
+        for (int l = 0; l < QK8_0; ++l) {
+            const float   v  = x[i*QK8_0 + l]*id;
+            y[i].qs[l] = roundf(v);
+        }
+    }
+}
+
+// Copy-pasted from ggml.c
+static void dot_q4_q8(const int n, float* s, const void* vx, const void* vy) {
+    const int nb = n / QK8_0;
+    const block_q4_0* x = (const block_q4_0*)vx;
+    const block_q8_0* y = (const block_q8_0*)vy;
+    float sumf = 0;
+    for (int i = 0; i < nb; i++) {
+        const float d0 = x[i].d;
+        const float d1 = y[i].d;
+
+        const uint8_t * p0 = x[i].qs;
+        const  int8_t * p1 = y[i].qs;
+
+        int sumi = 0;
+        for (int j = 0; j < QK8_0/2; j++) {
+            const uint8_t v0 = p0[j];
+
+            const int i0 = (int8_t) (v0 & 0xf) - 8;
+            const int i1 = (int8_t) (v0 >> 4)  - 8;
+
+            const int i2 = p1[2*j + 0];
+            const int i3 = p1[2*j + 1];
+
+            sumi += i0*i2 + i1*i3;
+        }
+        sumf += d0*d1*sumi;
+    }
+    *s = sumf;
+}
+
+int main(int argc, char** argv) {
+
+    int nloop = argc > 1 ? atoi(argv[1]) : 10;
+    bool scalar = argc > 2 ? atoi(argv[2]) : false;
+    bool useQ4_1 = argc > 3 ? atoi(argv[3]) : false;
+
+    if (scalar && useQ4_1) {
+        printf("It is not possible to use Q4_1 quantization and scalar implementations\n");
+        return 1;
+    }
+
+    std::mt19937 rndm(1234);
+
+    std::vector<float> x1(kVecSize), y1(kVecSize);
+    int n4 = useQ4_1 ? kVecSize / QK4_1 : kVecSize / QK4_0; n4 = 64*((n4 + 63)/64);
+    int n8 = kVecSize / QK8_0; n8 = 64*((n8 + 63)/64);
+
+    auto funcs = useQ4_1 ? ggml_internal_get_quantize_fn(GGML_TYPE_Q4_1) : ggml_internal_get_quantize_fn(GGML_TYPE_Q4_0);
+
+    std::vector<block_q4_0> q40;
+    std::vector<block_q4_1> q41;
+    if (useQ4_1) q41.resize(n4);
+    else q40.resize(n4);
+    std::vector<block_q8_0> q8(n8);
+    std::vector<int64_t> H(16, 0);
+    double sumt = 0, sumt2 = 0, maxt = 0;
+    double sumqt = 0, sumqt2 = 0, maxqt = 0;
+    double sum = 0, sumq = 0, exactSum = 0;
+    for (int iloop=0; iloop<nloop; ++iloop) {
+
+        // Fill vector x with random numbers
+        fillRandomGaussianFloats(x1, rndm);
+
+        // Fill vector y with random numbers
+        fillRandomGaussianFloats(y1, rndm);
+
+        // Compute the exact dot product
+        for (int k=0; k<kVecSize; ++k) exactSum += x1[k]*y1[k];
+
+        // quantize x.
+        // Note, we do not include this in the timing as in practical application
+        // we already have the quantized model weights.
+        if (useQ4_1) {
+            funcs.quantize_row_q(x1.data(), q41.data(), kVecSize);
+        } else {
+            funcs.quantize_row_q(x1.data(), q40.data(), kVecSize);
+        }
+
+        // Now measure time the dot product needs using the "scalar" version above
+        auto t1 = std::chrono::high_resolution_clock::now();
+        if (useQ4_1) sum += dot41(kVecSize / QK4_1, q41.data(), y1.data());
+        else sum += dot(kVecSize / QK4_0, q40.data(), y1.data());
+        auto t2 = std::chrono::high_resolution_clock::now();
+        auto t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count();
+        sumt += t; sumt2 += t*t; maxt = std::max(maxt, t);
+
+        // And now measure the time needed to quantize y and perform the dot product with the quantized y
+        t1 = std::chrono::high_resolution_clock::now();
+        float result;
+        if (scalar) {
+            quantize_row_q8_0_reference(y1.data(), q8.data(), kVecSize);
+            dot_q4_q8(kVecSize, &result, q40.data(), q8.data());
+        }
+        else {
+            funcs.quantize_row_q_dot(y1.data(), q8.data(), kVecSize);
+            if (useQ4_1) funcs.vec_dot_q(kVecSize, &result, q41.data(), q8.data());
+            else funcs.vec_dot_q(kVecSize, &result, q40.data(), q8.data());
+        }
+        sumq += result;
+        t2 = std::chrono::high_resolution_clock::now();
+        t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count();
+        sumqt += t; sumqt2 += t*t; maxqt = std::max(maxqt, t);
+
+    }
+
+    // Report the time (and the average of the dot products so the compiler does not come up with the idea
+    // of optimizing away the function calls after figuring that the result is not used).
+    sum /= nloop; sumq /= nloop;
+    exactSum /= nloop;
+    printf("Exact result: <dot> = %g\n",exactSum);
+    printf("<dot> = %g, %g\n",sum,sumq);
+    sumt /= nloop; sumt2 /= nloop; sumt2 -= sumt*sumt;
+    if (sumt2 > 0) sumt2 = sqrt(sumt2);
+    printf("time = %g +/- %g us. maxt = %g us\n",sumt,sumt2,maxt);
+    sumqt /= nloop; sumqt2 /= nloop; sumqt2 -= sumqt*sumqt;
+    if (sumqt2 > 0) sumqt2 = sqrt(sumqt2);
+    printf("timeq = %g +/- %g us. maxt = %g us\n",sumqt,sumqt2,maxqt);
+    return 0;
+}
--- a/prompts/alpaca.txt
+++ b/prompts/alpaca.txt
@@ -0,0 +1 @@
+Below is an instruction that describes a task. Write a response that appropriately completes the request.
--- a/prompts/chat-with-bob.txt
+++ b/prompts/chat-with-bob.txt
@@ -0,0 +1,7 @@
+Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
+
+User: Hello, Bob.
+Bob: Hello. How may I help you today?
+User: Please tell me the largest city in Europe.
+Bob: Sure. The largest city in Europe is Moscow, the capital of Russia.
+User:
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,2 @@
 numpy==1.24
-sentencepiece==0.1.97
+sentencepiece==0.1.98
--- a/tests/test-tokenizer-0.cpp
+++ b/tests/test-tokenizer-0.cpp
@@ -5,13 +5,17 @@
 #include <map>
 #include <vector>

-static const std::map<std::string, std::vector<llama_token>> k_tests = {
-    { "Hello World",        { 1,  10994,   2787, }, },
-    { " Hello World",       { 1,  15043,   2787, }, },
-    { " Hello World!",      { 1,  15043,   2787,  29991, }, },
-    { " this is 🦙.cpp",    { 1,    445,    338,  29871,    243,    162,    169,    156,  29889,   8223, }, },
-    { "w048 7tuijk dsdfhu", { 1,  29893,  29900,  29946,  29947,  29871,  29955,   9161,  13535,  18031,   2176,   6905, }, },
-    { "нещо на Български",  { 1,    821,   4851,    665,   1386,  29713,   1305, }, },
+static const std::map<std::string, std::vector<llama_token>> & k_tests()
+{
+    static std::map<std::string, std::vector<llama_token>> _k_tests = {
+        { "Hello World",        { 1,  10994,   2787, }, },
+        { " Hello World",       { 1,  15043,   2787, }, },
+        { " Hello World!",      { 1,  15043,   2787,  29991, }, },
+        { " this is 🦙.cpp",    { 1,    445,    338,  29871,    243,    162,    169,    156,  29889,   8223, }, },
+        { "w048 7tuijk dsdfhu", { 1,  29893,  29900,  29946,  29947,  29871,  29955,   9161,  13535,  18031,   2176,   6905, }, },
+        { "нещо на Български",  { 1,    821,   4851,    665,   1386,  29713,   1305, }, },
+    };
+    return _k_tests;
 };

 int main(int argc, char **argv) {
@@ -47,7 +51,7 @@ int main(int argc, char **argv) {
        return 2;
    }

-    for (const auto & test_kv : k_tests) {
+    for (const auto & test_kv : k_tests()) {
        std::vector<llama_token> res(test_kv.first.size());
        const int n = llama_tokenize(ctx, test_kv.first.c_str(), res.data(), res.size(), true);
        res.resize(n);
Author	SHA1	Message	Date
Kawrakow	5ecff35151	Adding a simple program to measure speed of dot products (#1041 ) On my Mac, the direct Q4_1 product is marginally slower (~69 vs ~55 us for Q4_0). The SIMD-ified ggml version is now almost 2X slower (~121 us). On a Ryzen 7950X CPU, the direct product for Q4_1 quantization is faster than the AVX2 implementation (~60 vs ~62 us). --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>	2023-04-18 19:00:14 +00:00
Georgi Gerganov	7faa7460f0	readme : update hot topics about new LoRA functionality	2023-04-18 20:10:26 +03:00
Georgi Gerganov	5af8e32238	ci : do not run on drafts	2023-04-18 19:57:06 +03:00
Ivan Komarov	42747220b4	Do not close file after mmap (Windows version) (#1034 )	2023-04-18 03:15:50 +02:00
Atsushi Tatsuma	e9298af389	readme : add Ruby bindings (#1029 )	2023-04-17 22:34:35 +03:00
Cameron	4ad73137a1	add 4_0 to default outfile namestr dict (#1031 ) this came up when trying to convert the gpt4all-lora-unfiltered-quantized.bin file	2023-04-17 20:26:23 +02:00
slaren	315a95a4d3	Add LoRA support (#820 )	2023-04-17 17:28:55 +02:00
Arik Poznanski	efd05648c8	llama : well-defined static initialization of complex objects (#927 ) * Replaced static initialization of complex objects with a initialization on first use. This prevents an undefined behavior on program run, for example, crash in Release build, works in Debug build * replaced use of auto with exact type to avoid using -std=c++14 * Made the assessors functions for static maps be static const	2023-04-17 17:41:53 +03:00
Georgi Gerganov	eb17a026fd	quantize-stats : fix bug in --type argument	2023-04-17 17:31:06 +03:00
Georgi Gerganov	69b740289f	ggml : avoid using ggml_fp16_to_fp32() and ggml_fp32_to_fp16() in ggml.c	2023-04-17 16:16:23 +03:00
Ivan Komarov	f266259ad9	Speedup the AVX-512 implementation of ggml_vec_dot_q4_0() (#933 )	2023-04-17 15:10:57 +02:00
slaren	47f61aaa5f	Fix: do not close file on mmap (#1017 )	2023-04-16 21:27:38 +02:00
Georgi Gerganov	3173a62eb9	stdout : vertical align outputs for better readibility	2023-04-16 13:59:27 +03:00
Pavol Rusnak	489537e6cf	examples: add missing <ctime> include for time() (#1011 )	2023-04-16 10:13:00 +00:00
nanahi	2d3481c721	Fix msys2 build error and warnings (#1009 )	2023-04-16 11:13:42 +02:00
comex	74f5899df4	convert.py: Fix loading safetensors and ggml format on Windows (#991 ) Calling `mmap.mmap` on Windows apparently resets the file offset of the raw file object (and makes the BufferedReader return a negative file offset). For safetensors, avoid using the file offset after calling mmap. For GGML format, explicitly save and restore the offset. Fixes #966.	2023-04-15 23:53:21 +02:00
Stephan Walter	2f7c8e014e	Fix potential int8 overflow in non-SIMD vec_dot (#986 )	2023-04-15 18:28:56 +00:00
Stephan Walter	0ad964631f	Refactor ggml.c for future tensor types (#1001 )	2023-04-15 16:25:38 +00:00
Georgi Gerganov	e95b6554b4	ggml : add Q8_0 quantization for intermediate results (#951 ) * ggml : add Q8_0 quantization for intermediate results * quantize-stats : fix test + add it to Makefile default * Q8: use int8_t, AVX/AVX2 optimizations * ggml : fix quantize_row_q8_0() ARM_NEON rounding * minor : updates after rebase to latest master * quantize-stats : delete obsolete strings * ggml : fix q4_1 dot func --------- Co-authored-by: Stephan Walter <stephan@walter.name>	2023-04-15 17:53:22 +03:00
Georgi Gerganov	aa485cee33	ggml : use posix_memalign on non-Windows env	2023-04-15 14:25:45 +03:00
Ivan Komarov	c12b14b77f	benchmark : fix result validation in benchmark-q4_0-matmult (#987 )	2023-04-15 08:51:54 +03:00
katsu560	106faaf297	cmake : add finding the OpenBLAS header file (#992 )	2023-04-15 08:51:11 +03:00
Pavol Rusnak	c85e03d12e	Revert "main : alternative instruct mode (Vicuna support, etc.) (#863 )" (#982 ) This reverts commit `f4d277ae17`.	2023-04-14 22:58:43 +03:00
Pavol Rusnak	489093548c	py : bump sentencepiece to 0.1.98 to support Python 3.11 (#976 )	2023-04-14 19:46:49 +00:00
				`@@ -0,0 +1 @@`
				`Below is an instruction that describes a task. Write a response that appropriately completes the request.`