Remove Q4_3 which is no better than Q5 (#1218)

readme : update hot topics
ggml : sync ggml (ggml_alibi)
2026-02-26 14:23:22 +02:00 · 2023-04-28 23:10:43 +00:00 · 2023-04-28 21:32:52 +03:00 · 2023-04-28 20:51:05 +03:00 · 2023-04-28 19:13:33 +03:00 · 2023-04-28 18:59:37 +03:00
42 changed files with 4542 additions and 1444 deletions
--- a/.devops/tools.sh
+++ b/.devops/tools.sh
@@ -23,7 +23,7 @@ elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then
            echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
        else
            echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
-            ./quantize "$i" "${i/f16/q4_0}" 2
+            ./quantize "$i" "${i/f16/q4_0}" q4_0
        fi
    done
 else
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -12,17 +12,15 @@ on:
      - master
    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp']
  pull_request:
-    types: [opened, synchronize, edited, reopened, review_requested, ready_for_review]
+    types: [opened, synchronize, reopened]
    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp']

 env:
 BRANCH_NAME: ${{ github.head_ref || github.ref_name }}

 jobs:
-  ubuntu-latest-make:
-    if: github.event.pull_request.draft == false
-
-    runs-on: ubuntu-latest
+  ubuntu-focal-make:
+    runs-on: ubuntu-20.04

    steps:
      - name: Clone
@@ -33,16 +31,14 @@ jobs:
        id: depends
        run: |
          sudo apt-get update
-          sudo apt-get install build-essential
+          sudo apt-get install build-essential gcc-8

      - name: Build
        id: make_build
        run: |
-          make
+          CC=gcc-8 make

  ubuntu-latest-cmake:
-    if: github.event.pull_request.draft == false
-
    runs-on: ubuntu-latest

    steps:
@@ -71,8 +67,6 @@ jobs:
          ctest --verbose

  ubuntu-latest-cmake-sanitizer:
-    if: github.event.pull_request.draft == false
-
    runs-on: ubuntu-latest

    continue-on-error: true
@@ -108,8 +102,6 @@ jobs:
          ctest --verbose

  macOS-latest-make:
-    if: github.event.pull_request.draft == false
-
    runs-on: macos-latest

    steps:
@@ -128,8 +120,6 @@ jobs:
          make

  macOS-latest-cmake:
-    if: github.event.pull_request.draft == false
-
    runs-on: macOS-latest

    steps:
@@ -157,8 +147,6 @@ jobs:
          ctest --verbose

  windows-latest-cmake:
-    if: github.event.pull_request.draft == false
-
    runs-on: windows-latest

    strategy:
@@ -169,7 +157,7 @@ jobs:
         - build: 'avx'
           defines: '-DLLAMA_AVX2=OFF'
         - build: 'avx512'
-           defines: '-DLLAMA_AVX512=ON'
+           defines: '-DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'

    steps:
      - name: Clone
@@ -228,7 +216,7 @@ jobs:
    runs-on: ubuntu-latest

    needs:
-      - ubuntu-latest-make
+      - ubuntu-focal-make
      - ubuntu-latest-cmake
      - macOS-latest-make
      - macOS-latest-cmake
--- a/.gitignore
+++ b/.gitignore
@@ -15,6 +15,7 @@ build-em/
 build-debug/
 build-release/
 build-static/
+build-cublas/
 build-no-accel/
 build-sanitize-addr/
 build-sanitize-thread/
@@ -40,3 +41,5 @@ zig-out/
 zig-cache/

 ppl-*.txt
+
+examples/jeopardy/results.txt
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -67,6 +67,7 @@ endif()
 option(LLAMA_ACCELERATE             "llama: enable Accelerate framework"                    ON)
 option(LLAMA_OPENBLAS               "llama: use OpenBLAS"                                   OFF)
 option(LLAMA_CUBLAS                 "llama: use cuBLAS"                                     OFF)
+option(LLAMA_CLBLAST                "llama: use CLBlast"                                    OFF)

 option(LLAMA_BUILD_TESTS            "llama: build tests"    ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES         "llama: build examples" ${LLAMA_STANDALONE})
@@ -168,6 +169,21 @@ if (LLAMA_CUBLAS)
    endif()
 endif()

+if (LLAMA_CLBLAST)
+    find_package(CLBlast)
+    if (CLBlast_FOUND)
+        message(STATUS "CLBlast found")
+
+        set(GGML_OPENCL_SOURCES ggml-opencl.c ggml-opencl.h)
+
+        add_compile_definitions(GGML_USE_CLBLAST)
+
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} clblast)
+    else()
+        message(WARNING "CLBlast not found")
+    endif()
+endif()
+
 if (LLAMA_ALL_WARNINGS)
    if (NOT MSVC)
        set(c_flags
@@ -201,6 +217,10 @@ endif()

 if (MSVC)
    add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
+
+    if (BUILD_SHARED_LIBS)
+        set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
+    endif()
 endif()

 if (LLAMA_LTO)
@@ -303,11 +323,13 @@ endif()
 add_library(ggml OBJECT
            ggml.c
            ggml.h
-            ${GGML_CUDA_SOURCES})
+            ${GGML_CUDA_SOURCES}
+            ${GGML_OPENCL_SOURCES})

 target_include_directories(ggml PUBLIC .)
 target_compile_features(ggml PUBLIC c_std_11) # don't bump
-target_link_libraries(ggml PRIVATE Threads::Threads ${LLAMA_EXTRA_LIBS})
+target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
+
 if (BUILD_SHARED_LIBS)
    set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
 endif()
@@ -320,6 +342,7 @@ add_library(llama
 target_include_directories(llama PUBLIC .)
 target_compile_features(llama PUBLIC cxx_std_11) # don't bump
 target_link_libraries(llama PRIVATE ggml ${LLAMA_EXTRA_LIBS})
+
 if (BUILD_SHARED_LIBS)
    set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
    target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD)
--- a/31
+++ b/31
@@ -74,13 +74,17 @@ endif
 #       feel free to update the Makefile for your architecture and send a pull request or issue
 ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
 	# Use all CPU extensions that are available:
-	CFLAGS += -march=native -mtune=native
+	CFLAGS   += -march=native -mtune=native
 	CXXFLAGS += -march=native -mtune=native
+
+	# Usage AVX-only
+	#CFLAGS   += -mfma -mf16c -mavx
+	#CXXFLAGS += -mfma -mf16c -mavx
 endif
 ifneq ($(filter ppc64%,$(UNAME_M)),)
 	POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
 	ifneq (,$(findstring POWER9,$(POWER9_M)))
-		CFLAGS += -mcpu=power9
+		CFLAGS   += -mcpu=power9
 		CXXFLAGS += -mcpu=power9
 	endif
 	# Require c++23's std::byteswap for big-endian support.
@@ -101,18 +105,31 @@ ifdef LLAMA_OPENBLAS
 	LDFLAGS += -lopenblas
 endif
 ifdef LLAMA_CUBLAS
-	CFLAGS  += -DGGML_USE_CUBLAS -I/usr/local/cuda/include
-	LDFLAGS += -lcublas_static -lculibos -lcudart_static -lcublasLt_static -lpthread -ldl -lrt -L/usr/local/cuda/lib64
-	OBJS	+= ggml-cuda.o
+	CFLAGS    += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
+	LDFLAGS   += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
+	OBJS      += ggml-cuda.o
+	NVCC      = nvcc
+	NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
-	nvcc -arch=native -c -o $@ $<
+	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
+endif
+ifdef LLAMA_CLBLAST
+	CFLAGS  += -DGGML_USE_CLBLAST
+	LDFLAGS += -lclblast -lOpenCL
+	OBJS    += ggml-opencl.o
+ggml-opencl.o: ggml-opencl.c ggml-opencl.h
+	$(CC) $(CFLAGS) -c $< -o $@
 endif
 ifdef LLAMA_GPROF
 	CFLAGS   += -pg
 	CXXFLAGS += -pg
 endif
+ifdef LLAMA_PERF
+	CFLAGS   += -DGGML_PERF
+	CXXFLAGS += -DGGML_PERF
+endif
 ifneq ($(filter aarch64%,$(UNAME_M)),)
-	CFLAGS += -mcpu=native
+	CFLAGS   += -mcpu=native
 	CXXFLAGS += -mcpu=native
 endif
 ifneq ($(filter armv6%,$(UNAME_M)),)
--- a/README.md
+++ b/README.md
@@ -7,31 +7,25 @@

 Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

-**Warnings**
-
- `Q4_2` and `Q4_3` are still in development. Do not expect any kind of backward compatibility until they are finalized
-
 **Hot topics:**

- [Added LoRA support](https://github.com/ggerganov/llama.cpp/pull/820)
- [Add GPU support to ggml](https://github.com/ggerganov/llama.cpp/discussions/915)
- [Roadmap Apr 2023](https://github.com/ggerganov/llama.cpp/discussions/784)
+- [Roadmap May 2023](https://github.com/ggerganov/llama.cpp/discussions/1220)
+- [New quantization methods](https://github.com/ggerganov/llama.cpp#quantization)

 ## Description

-The main goal of llama.cpp is to run the llama model using 4-bit quantization on a MacBook.
+The main goal of `llama.cpp` is to run the LLaMA model using 4-bit integer quantization on a MacBook.

 - Plain C/C++ implementation without dependencies
 - Apple silicon first-class citizen - optimized via ARM NEON and Accelerate framework
 - AVX2 support for x86 architectures
 - Mixed F16 / F32 precision
- 4-bit quantization support
+- 4-bit integer quantization support
 - Runs on the CPU

-This was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022) - I have no idea if it works correctly.
-Please do not make conclusions about the models based on the results from this implementation.
-For all I know, it can be completely wrong. This project is for educational purposes.
-New features will probably be added mostly through community contributions.
+The original implementation of `llama.cpp` was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022).
+Since then, the project has improved significantly thanks to many contributions. This project is for educational purposes and serves
+as the main playground for developing new features for the [ggml](https://github.com/ggerganov/ggml) library.

 **Supported platforms:**

@@ -167,15 +161,27 @@ cd llama.cpp

 ### Build

-Note: For Windows, CMake or Zig can be used.
+In order to build llama.cpp you have three different options.

-1. Use `make`
+- Using `make`:
+  - On Linux or MacOS:

-    ```bash
-    make
-    ```
+      ```bash
+      make
+      ```

-1. Use CMake
+  - On Windows:
+
+    1. Download the latest Fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
+    2. Extract `w64devkit` on your PC.
+    3. Run `w64devkit.exe`.
+    4. Use the `cd` command to reach the `llama.cpp` folder.
+    5. From here you can run:
+        ```bash
+        make
+        ```
+
+- Using `CMake`:

    ```bash
    mkdir build
@@ -184,12 +190,71 @@ Note: For Windows, CMake or Zig can be used.
    cmake --build . --config Release
    ```

-1. Use Zig
+- Using `Zig`:

    ```bash
    zig build -Drelease-fast
    ```

+### BLAS Build
+
+Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). BLAS doesn't affect the normal generation performance. There are currently three different implementations of it:
+
+- Accelerate Framework:
+
+  This is only available on Macs and is enabled by default. You can just build using the normal instructions.
+
+- OpenBLAS:
+
+  This provides BLAS acceleration using only the CPU. Make sure to have OpenBLAS installed on your machine.
+
+  - Using `make`:
+    - On Linux:
+      ```bash
+      make LLAMA_OPENBLAS=1
+      ```
+      Note: To build on Arch Linux with OpenBLAS support enabled, you must edit the Makefile and append `-lcblas` to the end of line 105.
+
+    - On Windows:
+
+      1. Download the latest Fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
+      2. Download the latest version of [OpenBLAS for Windows](https://github.com/xianyi/OpenBLAS/releases).
+      3. Extract `w64devkit` on your PC.
+      4. From the OpenBLAS zip that you just downloaded copy `libopenblas.a`, located inside the `lib` folder, inside `w64devkit\x86_64-w64-mingw32\lib`.
+      5. From the same OpenBLAS zip copy the content of the `include` folder inside `w64devkit\x86_64-w64-mingw32\include`.
+      6. Run `w64devkit.exe`.
+      7. Use the `cd` command to reach the `llama.cpp` folder.
+      8. From here you can run:
+
+          ```bash
+          make LLAMA_OPENBLAS=1
+          ```
+
+  - Using `CMake` on Linux:
+
+      ```bash
+      mkdir build
+      cd build
+      cmake .. -DLLAMA_OPENBLAS=ON
+      cmake --build . --config Release
+      ```
+
+- cuBLAS
+
+  This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
+  - Using `make`:
+    ```bash
+    make LLAMA_CUBLAS=1
+    ```
+  - Using `CMake`:
+
+    ```bash
+    mkdir build
+    cd build
+    cmake .. -DLLAMA_CUBLAS=ON
+    cmake --build . --config Release
+    ```
+
 ### Prepare Data & Run

 ```bash
@@ -203,8 +268,8 @@ python3 -m pip install -r requirements.txt
 # convert the 7B model to ggml FP16 format
 python3 convert.py models/7B/

-# quantize the model to 4-bits (using method 2 = q4_0)
-./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin 2
+# quantize the model to 4-bits (using q4_0 method)
+./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin q4_0

 # run the inference
 ./main -m ./models/7B/ggml-model-q4_0.bin -n 128
@@ -216,12 +281,29 @@ When running the larger models, make sure you have enough disk space to store al

 As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same.

-| model | original size | quantized size (4-bit) |
-|-------|---------------|------------------------|
-| 7B    | 13 GB         | 3.9 GB                 |
-| 13B   | 24 GB         | 7.8 GB                 |
-| 30B   | 60 GB         | 19.5 GB                |
-| 65B   | 120 GB        | 38.5 GB                |
+| Model | Original size | Quantized size (4-bit) |
+|------:|--------------:|-----------------------:|
+|    7B |         13 GB |                 3.9 GB |
+|   13B |         24 GB |                 7.8 GB |
+|   30B |         60 GB |                19.5 GB |
+|   65B |        120 GB |                38.5 GB |
+
+### Quantization
+
+Several quantization methods are supported. They differ in the resulting model disk size and inference speed.
+
+| Model | Measure      | F16    | Q4_0   | Q4_1   | Q4_2   | Q5_0   | Q5_1   | Q8_0   |
+|------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:|-------:|
+|    7B | perplexity   | 5.9565 | 6.2103 | 6.1286 | 6.1698 | 6.0139 | 5.9934 | 5.9571 |
+|    7B | file size    |  13.0G |   4.0G |   4.8G |   4.0G |   4.4G |   4.8G |   7.1G |
+|    7B | ms/tok @ 4th |    128 |     56 |     61 |     84 |     91 |     95 |     75 |
+|    7B | ms/tok @ 8th |    128 |     47 |     55 |     48 |     53 |     59 |     75 |
+|    7B | bits/weight  |   16.0 |    5.0 |    6.0 |    5.0 |    5.5 |    6.0 |    9.0 |
+|   13B | perplexity   | 5.2455 | 5.3748 | 5.3471 | 5.3433 | 5.2768 | 5.2582 | 5.2458 |
+|   13B | file size    |  25.0G |   7.6G |   9.1G |   7.6G |   8.4G |   9.1G |    14G |
+|   13B | ms/tok @ 4th |    239 |    104 |    113 |    160 |    176 |    185 |    141 |
+|   13B | ms/tok @ 8th |    240 |     85 |     99 |     97 |    108 |    117 |    147 |
+|   13B | bits/weight  |   16.0 |    5.0 |    6.0 |    5.0 |    5.5 |    6.0 |    9.0 |

 ### Interactive mode

@@ -241,7 +323,7 @@ Here is an example of a few-shot interaction, invoked with the command
 ./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
 ```

-Note the use of `--color` to distinguish between user input and generated text.
+Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `main` example program.

 ![image](https://user-images.githubusercontent.com/1991296/224575029-2af3c7dc-5a65-4f64-a6bb-517a532aea38.png)

@@ -275,18 +357,19 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.

 ### Using [GPT4All](https://github.com/nomic-ai/gpt4all)

- Obtain the `gpt4all-lora-quantized.bin` model
- It is distributed in the old `ggml` format, which is now obsoleted
- You have to convert it to the new format using [./convert-gpt4all-to-ggml.py](./convert-gpt4all-to-ggml.py). You may also need to
-convert the model from the old format to the new format with [./migrate-ggml-2023-03-30-pr613.py](./migrate-ggml-2023-03-30-pr613.py):
+- Obtain the `tokenizer.model` file from the LLaMA model and put it in `models`
+- Obtain the `added_tokens.json` file from the Alpaca model and put it in `models`
+- Obtain the `gpt4all-lora-quantized.bin` file from the GPT4All model and put it in `models/gpt4all-7B`
+- It is distributed in the old `ggml` format, which is now obsolete
+- You have to convert it to the new format using `convert.py`:

-  ```bash
-  python3 convert-gpt4all-to-ggml.py models/gpt4all-7B/gpt4all-lora-quantized.bin ./models/tokenizer.model
-  python3 migrate-ggml-2023-03-30-pr613.py models/gpt4all-7B/gpt4all-lora-quantized.bin models/gpt4all-7B/gpt4all-lora-quantized-new.bin
-  ```
+```bash
+python3 convert.py models/gpt4all-7B/gpt4all-lora-quantized.bin
+```

- You can now use the newly generated `gpt4all-lora-quantized-new.bin` model in exactly the same way as all other models
- The original model is saved in the same folder with a suffix `.orig`
+- You can now use the newly generated `models/gpt4all-7B/ggml-model-q4_0.bin` model in exactly the same way as all other models
+
+- The newer GPT4All-J model is not yet supported!

 ### Obtaining and verifying the Facebook LLaMA original model and Stanford Alpaca model data

--- a/16
+++ b/16
@@ -1,12 +1,24 @@
 700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d  models/7B/consolidated.00.pth
+666a4bb533b303bdaf89e1b6a3b6f93535d868de31d903afdc20983dc526c847  models/7B/ggml-model-f16.bin
+99aeb35f26b577fa2732716cca4d8b5ada39a78ea9b2dca2651fc632b5d101b6  models/7B/ggml-model-q4_0.bin
+cc061458339a3eb8bcecbf0a825e9924fb7d1a8150f63cd5d091caa99215aafe  models/7B/ggml-model-q4_1.bin
+25b050337a87344da687a7f2adddc03bd99b7f6c140450e836649f3585fb6496  models/7B/ggml-model-q4_2.bin
 7e89e242ddc0dd6f060b43ca219ce8b3e8f08959a72cb3c0855df8bb04d46265  models/7B/params.json
 745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08  models/13B/consolidated.00.pth
 d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085  models/13B/consolidated.01.pth
+2b206e9b21fb1076f11cafc624e2af97c9e48ea09312a0962153acc20d45f808  models/13B/ggml-model-f16.bin
+eecb575d325d935157761172e2bf05984dad216eb2b06777b73463cf9b818bab  models/13B/ggml-model-q4_0.bin
+d9581b5b88e5622532fe897c9f9b0e67a317d22dd27a6f90fa4ab8c6d23ccdbb  models/13B/ggml-model-q4_1.bin
+75a218a47df03f5f96354656329864613abcb67779412b9bc2282b28c1c3cbaa  models/13B/ggml-model-q4_2.bin
 4ab77bec4d4405ccb66a97b282574c89a94417e3c32e5f68f37e2876fc21322f  models/13B/params.json
 e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067  models/30B/consolidated.00.pth
 4e077b7136c7ae2302e954860cf64930458d3076fcde9443f4d0e939e95903ff  models/30B/consolidated.01.pth
 24a87f01028cbd3a12de551dcedb712346c0b5cbdeff1454e0ddf2df9b675378  models/30B/consolidated.02.pth
 1adfcef71420886119544949767f6a56cb6339b4d5fcde755d80fe68b49de93b  models/30B/consolidated.03.pth
+7e1b524061a9f4b27c22a12d6d2a5bf13b8ebbea73e99f218809351ed9cf7d37  models/30B/ggml-model-f16.bin
+517b9e525742c42b5478a6280a4b41ec66f46298c57aba7f0453d491682fe42d  models/30B/ggml-model-q4_0.bin
+7b75ac615fa369ee593493a7e6ef87542bf0350255db928b22c5a24f6d598bcd  models/30B/ggml-model-q4_1.bin
+aadbc9cf806313a55be570f62884eed289d30c313fac3b7838717e01bd553204  models/30B/ggml-model-q4_2.bin
 2c07118ea98d69dbe7810d88520e30288fa994751b337f8fca02b171955f44cb  models/30B/params.json
 135c563f6b3938114458183afb01adc9a63bef3d8ff7cccc3977e5d3664ecafe  models/65B/consolidated.00.pth
 9a600b37b19d38c7e43809485f70d17d1dc12206c07efa83bc72bb498a568bde  models/65B/consolidated.01.pth
@@ -16,5 +28,9 @@ e7babf7c5606f165a3756f527cb0fedc4f83e67ef1290391e52fb1cce5f26770  models/65B/con
 a287c0dfe49081626567c7fe87f74cce5831f58e459b427b5e05567641f47b78  models/65B/consolidated.05.pth
 72b4eba67a1a3b18cb67a85b70f8f1640caae9b40033ea943fb166bd80a7b36b  models/65B/consolidated.06.pth
 d27f5b0677d7ff129ceacd73fd461c4d06910ad7787cf217b249948c3f3bc638  models/65B/consolidated.07.pth
+60758f2384d74e423dffddfd020ffed9d3bb186ebc54506f9c4a787d0f5367b0  models/65B/ggml-model-f16.bin
+01672072136f8be6ca9d7cebe5f86ed316e8b85851b9fe3de951809233cea4f2  models/65B/ggml-model-q4_0.bin
+4743a28aac3e5f32a6e838a815f51d3779de44fbbe251d745251e66c23c5950f  models/65B/ggml-model-q4_1.bin
+1b6f6588d0e2ecfe6c4d849088e48e5e3083466b962daa32e3261363e21fc5e9  models/65B/ggml-model-q4_2.bin
 999ed1659b469ccc2a941714c0a9656fa571d17c9f7c8c7589817ca90edef51b  models/65B/params.json
 9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347  models/tokenizer.model
--- a/convert-lora-to-ggml.py
+++ b/convert-lora-to-ggml.py
@@ -49,7 +49,12 @@ def translate_tensor_name(t: str) -> str:
 def write_file_header(fout: TextIO, params: Dict[str, Any]) -> None:
    fout.write(b"ggla"[::-1])  # magic (ggml lora)
    fout.write(struct.pack("i", 1))  # file version
-    fout.write(struct.pack("ii", params["r"], params["lora_alpha"]))
+    fout.write(struct.pack("i", params["r"]))
+    # https://opendelta.readthedocs.io/en/latest/modules/deltas.html says that `lora_alpha` is an int
+    # but some models ship a float value instead
+    # let's convert to int, but fail if lossless conversion is not possible
+    assert int(params["lora_alpha"]) == params["lora_alpha"], "cannot convert float to int losslessly"
+    fout.write(struct.pack("i", int(params["lora_alpha"])))


 def write_tensor_header(
@@ -89,7 +94,7 @@ if params["peft_type"] != "LORA":
    print(f"Error: unsupported adapter type {params['peft_type']}, expected LORA")
    sys.exit(1)

-if params["fan_in_fan_out"] == True:
+if params["fan_in_fan_out"] is True:
    print("Error: param fan_in_fan_out is not supported")
    sys.exit(1)

--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -34,4 +34,5 @@ else()
    add_subdirectory(quantize-stats)
    add_subdirectory(perplexity)
    add_subdirectory(embedding)
+    add_subdirectory(save-load-state)
 endif()
--- a/examples/alpaca.sh
+++ b/examples/alpaca.sh
@@ -7,4 +7,13 @@
 cd `dirname $0`
 cd ..

-./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt --ctx_size 2048 -n -1 -ins -b 256 --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7
+./main -m ./models/ggml-alpaca-7b-q4.bin \
+       --color \
+       -f ./prompts/alpaca.txt \
+       --ctx_size 2048 \
+       -n -1 \
+       -ins -b 256 \
+       --top_k 10000 \
+       --temp 0.2 \
+       --repeat_penalty 1.1 \
+       -t 7
--- a/examples/chat-13B.sh
+++ b/examples/chat-13B.sh
@@ -31,8 +31,6 @@ The transcript only includes text, it does not include markup like HTML and Mark

 $USER_NAME: Hello, $AI_NAME!
 $AI_NAME: Hello $USER_NAME! How may I help you today?
-$USER_NAME: What time is it?
-$AI_NAME: It is $(date +%H:%M).
 $USER_NAME: What year is it?
 $AI_NAME: We are in $(date +%Y).
 $USER_NAME: Please tell me the largest city in Europe.
@@ -50,4 +48,6 @@ $AI_NAME: The arguments are stored in process.argv.
    argv[3] is the second argument passed to the script and so on.
 $USER_NAME: Name a color.
 $AI_NAME: Blue
+$USER_NAME: What time is it?
+$AI_NAME: It is $(date +%H:%M).
 $USER_NAME:" "$@"
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -61,6 +61,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.prompt = argv[i];
+        } else if (arg == "--session") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.path_session = argv[i];
        } else if (arg == "-f" || arg == "--file") {
            if (++i >= argc) {
                invalid_param = true;
@@ -156,10 +162,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
            params.interactive = true;
        } else if (arg == "--embedding") {
            params.embedding = true;
-        } else if (arg == "--interactive-start") {
-            params.interactive = true;
        } else if (arg == "--interactive-first") {
-            params.interactive_start = true;
+            params.interactive_first = true;
        } else if (arg == "-ins" || arg == "--instruct") {
            params.instruct = true;
        } else if (arg == "--color") {
@@ -230,6 +234,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
    fprintf(stderr, "  -p PROMPT, --prompt PROMPT\n");
    fprintf(stderr, "                        prompt to start generation with (default: empty)\n");
+    fprintf(stderr, "  --session FNAME       file to cache model state in (may be large!) (default: none)\n");
    fprintf(stderr, "  --random-prompt       start with a randomized prompt.\n");
    fprintf(stderr, "  --in-prefix STRING    string to prefix user inputs with (default: empty)\n");
    fprintf(stderr, "  -f FNAME, --file FNAME\n");
--- a/examples/common.h
+++ b/examples/common.h
@@ -20,7 +20,7 @@ struct gpt_params {
    int32_t repeat_last_n = 64;   // last n tokens to penalize
    int32_t n_parts       = -1;   // amount of model parts (-1 = determine from model dimensions)
    int32_t n_ctx         = 512;  // context size
-    int32_t n_batch       = 8;    // batch size for prompt processing
+    int32_t n_batch       = 512;  // batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_keep        = 0;    // number of tokens to keep from initial prompt

    // sampling parameters
@@ -31,6 +31,7 @@ struct gpt_params {

    std::string model  = "models/lamma-7B/ggml-model.bin"; // model path
    std::string prompt = "";
+    std::string path_session = "";       // path to file for saving/loading model eval state
    std::string input_prefix = "";       // string to prefix user inputs with
    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted

@@ -43,7 +44,7 @@ struct gpt_params {
    bool interactive       = false; // interactive mode

    bool embedding         = false; // get only sentence embedding
-    bool interactive_start = false; // wait for user input immediately
+    bool interactive_first = false; // wait for user input immediately

    bool instruct          = false; // instruction mode (used for Alpaca models)
    bool ignore_eos        = false; // do not stop generating after eos
--- a/examples/jeopardy/README.md
+++ b/examples/jeopardy/README.md
@@ -0,0 +1,21 @@
+# llama.cpp/example/jeopardy
+
+This is pretty much just a straight port of aigoopy/llm-jeopardy/ with an added graph viewer.
+
+The jeopardy test can be used to compare the factual knowledge of different models against each other. This is in contrast to some other tests, which test logical deduction, creativity, writing skills, etc.
+
+
+Step 1: Open jeopardy.sh and modify the following:
+```
+MODEL=(path to your model)
+MODEL_NAME=(name of your model)
+prefix=(basically, if you use vicuna it's Human: , if you use something else it might be User: , etc)
+opts=(add -instruct here if needed for your model, or anything else you want to test out)
+```
+Step 2: Run `jeopardy.sh` from the llama.cpp folder
+
+Step 3: Repeat steps 1 and 2 until you have all the results you need.
+
+Step 4: Run `graph.py`, and follow the instructions. At the end, it will generate your final graph.
+
+Note: The Human bar is based on the full, original 100 sample questions. If you modify the question count or the questions themselves, the comparison will not be valid.
--- a/examples/jeopardy/graph.py
+++ b/examples/jeopardy/graph.py
@@ -0,0 +1,59 @@
+import matplotlib.pyplot as plt
+import sys, os
+import csv
+
+labels = []
+numbers = []
+numEntries = 1  # starts at 1 to reserve a bar for the "Human" entry appended in __main__
+
+rows = []
+
+def bar_chart(numbers, labels, pos):
+    """Show a bar chart of `numbers`, one bar per x position in `pos`, ticked with `labels`."""
+    plt.bar(pos, numbers, color='blue')
+    plt.xticks(ticks=pos, labels=labels)
+    plt.title("Jeopardy Results by Model")
+    plt.xlabel("Model")
+    plt.ylabel("Questions Correct")
+    plt.show()
+
+def calculatecorrect():
+    """Interactively grade every results/*.txt file against qasheet.csv.
+
+    Fills the module-level `rows`, `labels` and `numbers` lists and increments
+    `numEntries` once per graded results file.
+    """
+    global numEntries
+    results_dir = "./examples/jeopardy/results/"
+    # `with` closes the handles that the previous bare open() calls leaked
+    with open("./examples/jeopardy/qasheet.csv", 'rt', newline='', encoding='utf-8') as qasheet:
+        rows.extend(csv.reader(qasheet, delimiter=','))
+    # sorted() keeps the bar order stable between runs; os.listdir() order is arbitrary
+    for filename in sorted(os.listdir(results_dir)):
+        if not filename.endswith(".txt"):
+            continue
+        labels.append(filename[:-4])
+        numEntries += 1
+        i = 1  # rows[0] is the CSV header, so answers start at index 1
+        totalcorrect = 0
+        with open(results_dir + filename, "rt") as file:
+            for line in file:
+                if line.strip() != "------":
+                    print(line)
+                else:
+                    print("Correct answer: " + rows[i][2] + "\n")
+                    i += 1
+                    print("Did the AI get the question right? (y/n)")
+                    if input() == "y":
+                        totalcorrect += 1
+        numbers.append(totalcorrect)
+
+
+if __name__ == '__main__':
+    calculatecorrect()
+    pos = list(range(numEntries))
+    labels.append("Human")
+    numbers.append(48.11)
+    bar_chart(numbers, labels, pos)
+    print(labels)
+    print(numbers)
--- a/examples/jeopardy/jeopardy.sh
+++ b/examples/jeopardy/jeopardy.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+# Runs every question in questions.txt through ./main and appends each answer,
+# followed by a "------" separator line, to results/<MODEL_NAME>.txt.
+set -e
+
+MODEL=./models/ggml-vicuna-13b-1.1-q4_0.bin
+MODEL_NAME=Vicuna
+
+# exec options
+prefix="Human: " # Ex. Vicuna uses "Human: "
+opts="--temp 0 -n 80" # additional flags (split on whitespace when passed to ./main)
+nl='
+'
+introduction="You will be playing a game of Jeopardy. Simply answer the question in the correct format (Ex. What is Paris, or Who is George Washington)."
+
+# file options
+question_file=./examples/jeopardy/questions.txt
+results_dir=./examples/jeopardy/results
+output_file=$results_dir/$MODEL_NAME.txt
+
+# touch fails (and set -e aborts the run) if the results directory is missing
+mkdir -p "$results_dir"
+touch "$output_file"
+
+counter=1
+
+echo 'Running'
+while IFS= read -r question
+do
+  echo $counter
+  echo "Current Question: $question"
+  # Invoke ./main directly instead of building a string for eval: with eval, a
+  # question containing ", $ or a backtick would be re-parsed by the shell.
+  # shellcheck disable=SC2086 # $opts is intentionally word-split
+  ./main -p "$prefix$introduction$nl$prefix$question" $opts -m "$MODEL" >> "$output_file"
+  echo -e "\n------" >> "$output_file"
+  counter=$((counter+1))
+done < "$question_file"
--- a/examples/jeopardy/qasheet.csv
+++ b/examples/jeopardy/qasheet.csv
@@ -0,0 +1,103 @@
+Index,Original Category,Original Correct Question,Model Prompt
+1,The Oscars,Who is John Williams?,Which man born in 1932 was the son of a percussionist in the CBS radio orchestra has been nominated for 53 Oscars?
+2,English Literature,What is Paradise Lost?,"What work in English Literature says: 'The mind is its own place, & in itself can make a heaven of hell, a hell of heaven. What matter where, if I be still the same'?"
+3,Writers’ Lesser-Known Works,Who is Niccolò Machiavelli?,"Known for more philosophical works, he wrote the play 'La Mandragola', in which Florentines are rewarded for immoral actions?"
+4,Exploration,What is Easter Island (Rapa Nui)?,"James Cook's account of a 1774 visit where records an object 'near 27 feet long, and upwards of 8 feet over the breast or shoulders'?"
+5,The Bill of Rights,What is the Eighth Amendment?,England's 'Bloody Assizes' & a 1685 life sentence for perjury were 2 main origins of which amendment to the U.S. Constitution?
+6,Nobel Peace Prize Winners,Who are Nelson Mandela & Desmond Tutu?,"Which Nobel Peace Prize winners each lived at times on Vilakazi St. in Soweto, so it claims to be the world's only street home to 2 Nobel Peace Prize winners?"
+7,Famous Names,Who is Walt Disney?,"In 1966, the year of whose death did he share plans for an experimental prototype community in Florida?"
+8,Geography,What is Colombia?,"Of the 13 nations through which the Equator passes, what is the only one whose coastline borders the Caribbean Sea?"
+9,Fashion History,What are rhinestones?,"Which decorative items in fashion history get their name from their origin in the port city of Strasbourg, on the border of France & Germany?"
+10,Movies of the ’80s,What is Driving Miss Daisy?,What 1980's movie is based on an off-Broadway play with just 3 characters and won the Best Picture Oscar & the actors in all 3 roles were nominated?
+11,Novelists,Who is John Grisham?,"A 2012 book review for which novelist noted subjects that 'sparked his ire': capital punishment, big tobacco & 'the plight of the unjustly convicted'?"
+12,20th Century Eponyms,What is the Maginot Line?,"A 1940 headline about what 20th Century Eponym included 'failure', 'liability when it came to offense' & 'stout hearts no match for tanks'?"
+13,City History,What is Stockholm?,"Over 700 years after its traditional 1252 founding date, what port city became associated with a psychological response?"
+14,Brand Names,What is Jacuzzi?,"The success of what brand has its roots with a hydrotherapy pump its cofounder created for his son, who had arthritis?"
+15,American Authors,Who is Washington Irving?,"In a periodical in 1807, what American Author called New York City 'Gotham, Gotham! Most enlightened of cities'?"
+16,Symbols,What is “less than”?,What symbol is a rotated V in math and a feeling of some marginalized or underrepresented people in society?
+17,Movie Theme Songs,Who is James Bond?,"Monty Norman, the composer of what character's theme, said the staccato riff conveyed sexiness, mystery & ruthlessness?"
+18,American Novelists,Who is Joseph Heller?,"What American Novelist served with an airman named Yohannan in World War II & despite what readers might think, he said he enjoyed his service?"
+19,Medieval Places,"What is Canterbury, England? (Canterbury Cathedral)","In what Medieval place did one of the participants in an 1170 event say, 'Let us away, knights; he will rise no more'?"
+20,Countries of Africa,What is Morocco?,"At one time a province of the Roman Empire, what African country kingdom is known to Arabic scholars as Al-Maghrib Al-Aqsa, 'the far west'?"
+21,Statehood,What is Wyoming?,Congress relented in 1890 after what prospective state said it would wait 100 years rather than come in without the women?
+22,1980s Movies,What is Raiders of the Lost Ark?,"A writer & producer of what movie said he wanted it to be like a Western or James Bond film, 'only it takes place in the 30s'?"
+23,Art Exhibitions,Who is Rembrandt?,In 1898 what's been called the first blockbuster art show was devoted to which artist & put on for Queen Wilhelmina's coronation?
+24,Countries of the World,What is Mongolia?,"Part of the largest contiguous land empire during the 1200s & 1300s, today what is the world's second-largest landlocked country?"
+25,Literature,What is “Howl”?,A 2006 book was titled 'The Poem That Changed America:' What 'Fifty Years Later'?
+26,Invasions,Who is William of Orange?,"Backed by 14,000 troops, who invaded England to restore, in his words, its 'religion, laws, and liberties'?"
+27,Landmarks,What is the Eiffel Tower?,"After its completion in the late 19th c., what landmark was called 'a truly tragic street lamp' & a 'high & skinny pyramid of iron ladders'?"
+28,Geographic Name’s the Same,What is Dover?,"The busiest passenger port in the U.K., what shares its name with a capital of one of the original 13 states?"
+29,Names in the Bookstore,Who is Peter Mark Roget?,"This man made lists, perhaps to cope with depression; a set of lists he published in 1852 made whose name synonymous with a type of book?"
+30,U.S. History,Who is Dr. Samuel Mudd?,"An 1869 presidential pardon was granted to which man, due in part to a plea by the Medical Society of Harford County, Maryland?"
+31,American Literature,What is The Things They Carried?,"Letters, pocket knives, C rations & steel helmets are among the tangible items referred to in the title of what American literature modern war classic?"
+32,Nonfiction,What is The Communist Manifesto?,"What nonfiction book has the line, 'The discovery of America…opened up fresh ground for the rising bourgeoisie'?"
+33,Laws in U.S. History,What is the Civil Rights Act?,"A radical Republican championed what 1875 act but the Supreme Court struck it down in 1883; a new version was passed 81 years later?"
+34,Names of Myth,Who is Helen of Troy?,"Whose brothers, Castor & Pollux, saved her after Theseus stole her away as a kid; a larger force would seek her later in life?"
+35,African Countries,What is Sudan?,"Once Africa's largest country in area, what African Country dropped to third in 2011 when a portion of it declared independence?"
+36,The Ancient World,What is Alexandria?,"The ancient writer Galen said books on ships arriving to what city's port were seized, originals kept & copies returned?"
+37,Famous Names,Who is Andy Warhol?,"For a special 1970s cookbook, who provided one simple recipe–a can of Campbell's tomato soup & 2 cans of milk?"
+38,People & Places,What is Guam?,"Thought to descend from people of Southeast Asia, the Chamorro make up what U.S. territory’s largest ethnic group?"
+39,Current World Leaders,What is the Philippines?,"In office from 2022, the president of what country has taken so many foreign trips a play on his name is 'Ferdinand Magellan Jr.'?"
+40,Writers & The South,Who is Tennessee Williams?,In 1939 which writer lived on Toulouse Street in the French Quarter & chose the professional name that bonded him to the South?
+41,National Parks,What is Yellowstone?,"What National Park is named for a river indigenous people called Mi tse a-da-zi, translated by French-speaking trappers as 'Pierre Jaune'?"
+42,Sports,Who are the Harlem Globetrotters?,"In 2010 who introduced the 4-point shot, 35 feet from the basket?"
+43,The U.S. Military,What is “Top Gun”?,Losses over Asia in the 1960s led to the establishment of the program known as what at a San Diego naval base in 1969?
+44,Art & Science,What is Halley’s Comet?,"A craft that visited what was named for Giotto, based on the story that 680 years earlier, the painter depicted it as the Star of Bethlehem?"
+45,Words From World War I,What is “tank”?,"In World War I, 'Cistern' & 'reservoir' were suggested names for what secret invention, but the British preferred this less clumsy monosyllable?"
+46,European History,What is Holy Roman Emperor?,"Until 1806, some German nobles included among their honors the title of 'Elector' for their role in selecting this personage?"
+47,Theater History,Who is Peter Pan?,"In 1904, wearing a harness, actress Nina Boucicault became the first to play what character onstage?"
+48,European Cities,What is Aachen?,"Alphabetically the first German city in encyclopedias, what was also the first one taken by the Allies in World War II?"
+49,Word Origins,What is mantra?,This Sanskrit word referring to a spoken word or phrase comes from a word for 'to think'?
+50,Inventions,What is barbed wire?,1917's 'Elements of Trench Warfare' said what Old West invention was 'difficult to destroy' & 'difficult to get through'?
+51,World War II,What is Schindler’s list?,"Mimi Reinhard, who never learned to type using more than 2 fingers, produced what in World War II with 1,100 names, including hers?"
+52,Mythology,What is the Golden Fleece?,"Poseidon carried off the maiden Theophane & turned her into a ewe; their offspring was the source of what mythical object?"
+53,Literature,What is Pride and Prejudice?,"Published in 2011, P.D. James' final novel, 'Death Comes to Pemberley', was a sequel to what novel from 200 years earlier?"
+54,U.S. State Names,What are Oregon & Nevada?,"5 U.S. states have 6-letter names; only which 2 west of the Mississippi River border each other?"
+55,Word Origins,What is passion?,"Originally relating to a story of suffering, what word now more commonly refers to strong emotion of any kind?"
+56,World Cinema,What is La Vie en Rose?,"The 2007 biopic called 'La Môme' in France, meaning 'The Kid', was released in the U.S. under what other French title?"
+57,History,What is Santa Maria?,"Returning home in 1493, Columbus stopped in the Azores at an island with what name, also something he'd lost off the Haiti coast?"
+58,Landmarks,What is a kremlin?,Pskov & Nizhny Novgorod are 2 of the cities that have a fortress called what?
+59,Foreign-Born Authors,Who is Vladimir Nabokov?,In the 1950s the New York Times said what author 'is writing about all lust' & his lecherous narrator 'is all of us'?
+60,Astronomy & Geography,What is Capricorn?,"At the winter solstice, the sun is in Sagittarius; it once appeared in what constellation, giving a geographic feature its name?"
+61,Television,What is Law & Order?,"Mike Post combined the sound of a slamming jail door, an anvil & 100 men stomping on a floor for what television series that debuted in 1990?"
+62,British Landmarks,What is the Tower of London?,"Like Sir Thomas More, 3 16th century English queens are buried at what British location?"
+63,Early American History,What are witches?,"In 1692 Increase Mather wrote, 'It were better that ten suspected' of these who 'escape, than that one innocent person … be condemned'?"
+64,Geography Mnemonics,What are Arkansas and Louisiana?,"The Geography Mnemonic Mimal, sometimes said to be the silhouette of a chef or elf, stands for Minnesota, Iowa, Missouri, and what other 2 states?"
+65,Business Milestones,What is the Ford Model T?,"What was first sold in 1908, at a price equivalent to about $27,000 today?"
+66,In The Bookstore,Who is Tom Clancy?,The name of what author dead since 2013 now appears on books written by a former U.S. marshal & a former Apache helicopter pilot?
+67,Historic Art,What is the Bayeux Tapestry?,The artwork once known in France as 'la tapisserie de la Reine Mathilde' is better known as what?
+68,Pop Stars,Who is Madonna?,In 2022 which pop star became the first woman to have a Billboard Top 10 album in 5 decades starting with the 1980s?
+69,Classic Tale Characters,Who is Scheherazade?,"In one 19th century translation, what female classic tale character 'perceived the dawn of day and ceased' speaking nearly 1,000 times?"
+70,USA,What is Jack Daniel’s?,"Ironically, though what company founded in the 1860s is Moore County, Tennessee's largest employer, Moore is a dry county?"
+71,Historic People,Who was William Bligh?,"After a 1789 event, who wrote, 'My first determination was to seek a supply of…water at Tofoa, & afterwards to sail for Tongataboo'?"
+72,The Movies,What is The Godfather?,Laurence Olivier & Ernest Borgnine were considered for the lead role & Sergio Leone to direct for what film that turned 50 in 2022?
+73,Continental Geography,What is Colombia?,"Until a 1903 secession, what country's contiguous territory spanned 2 continents?"
+74,Foreign-Born Authors,Who is Isabel Allende?,"Early in her career which foreign-born author translated romance novels into Spanish, often changing the dialogue to make the heroines smarter?"
+75,Historic Crimes,What is the Mona Lisa?,"Saying it was stolen by Napoleon, self-styled Italian patriot Vincenzo Peruggia took what in 1911?"
+76,U.S. Bodies of Water,What is Lake Mead?,"Continuing a downward trend, in July 2022 what US body of water was at 27% capacity, its lowest level since 1937 when it was first being filled?"
+77,Gods & Goddesses,Who is Aurora (or Eos)?,"Each morning which goddess began her ride in her chariot across the sky ahead of her brother Sol, or Helios?"
+78,America At War,What is the Battle of New Orleans?,"Until the Civil War, the Jan. 8 date of what American battle of dubious military importance but big morale value was a national holiday?"
+79,Children’s Books,What is The Velveteen Rabbit?,"Which children's book title character is told 'By the time you are real, most of your hair has been loved off your eyes drop out & you get shabby'?"
+80,TV Finales,What is Grace and Frankie?,"In a TV reunion over 40 years in the making, Dolly Parton appeared as an angel named Agnes in the final episode of what comedy in 2022?"
+81,American Poems,Who is Evangeline?,"In an 1847 American poem what character sees her town of Grand-Pré burned, but finally reunites with her beau for a kiss before his death?"
+82,Famous Names,Who is Banksy?,"In 2001 who published a book called 'Banging Your Head Against a Brick Wall'; in 2002, 'Existencilism'?"
+83,Children’s Lit,What is Charlotte’s Web?,The title object of what childrens book 'never looked more beautiful each strand held dozens of bright drops of early morning dew'?
+84,Classic Songs,What is “Here Comes Santa Claus”?,The shouts of excited children at a 1946 holiday parade are said to have inspired what perennial classic song favorite?
+85,Brand Names,What are Milk Duds?,"Unable to make what candies perfectly round, the confectioner embraced this flawed name for the product?"
+86,Countries of the World,What is Italy?,"What country is home to 58 UNESCO World Heritage Sites, more than any other country; the sites include a volcano & a lagoon?"
+87,Action Movies,What is Die Hard?,"What action movie's last line is 'If this is their idea of Christmas, I gotta be here for New Years'?"
+88,Presidential Facts,Who is Woodrow Wilson?,Only 3 presidents have married while in office— John Tyler was the first & which one was the last?
+89,19th Century Americans,Who is Frederick Douglass?,"Demonstrating the dignity & humanity of Black Americans, who sat for 160 known photographs, the most of any American in the 19th century?"
+90,Latin Phrases,What is “quid pro quo”?,"Originally, which Latin 3-word phrase referred to when a doctor or apothecary substituted one medicine for another?"
+91,1970s Movies,What is Monty Python and the Holy Grail?,The 1975 premiere of what movie comedy advertised free coconuts for the first thousand in the audience?
+92,Name’s The Same,What is Manhattan?,"A cocktail, an island & a WWII venture originally called 'Development of Substitute Materials' all bear what name?"
+93,U.S. Presidents,Who is Calvin Coolidge?,"Which US President was sworn in twice as President within 2 years, first by his father & then later by a former U.S. President?"
+94,Plays,What is The Tempest?,A 1609 story in which an exiled king of Bulgaria creates a sea palace with his magic may have inspired the plot of what play?
+95,Landmarks,What is the Berlin Wall?,"In 2009, during a 20th anniversary celebration, what landmark was called 'an edifice of fear. On Nov. 9, it became a place of joy'?"
+96,World Capitals,"What is Vienna, Austria?","Among what world capital's nicknames are the 'City of Classical Music' &, possibly in honor of a famous resident from 1860 to 1938, the 'City of Dreams'?"
+97,Language & Its Meanings,What is a night owl?,"Now meaning someone with nocturnal habits, what catches a sleeping dove in Shakespeare's 'Lucrece'?"
+98,Flags of Our Hemisphere,What is Brazil?,"The stars on what country's flag represent states, 26 of them; unlike the USA's, its 'federal district' gets its own 27th star?"
+99,Names in U.S. History,Who is Oliver Brown?,What father was the only man among the 13 plaintiffs in a US class-action case filed in 1951?
+100,Children’s Authors,"Who is Sarah? (from Sarah, Plain and Tall)","Reversing the story of what heroine she created, childrens author Patricia Maclachlan was born on the prairie but spent much of her life in New England?"
+,,,
+TOTALS,,,
--- a/examples/jeopardy/questions.txt
+++ b/examples/jeopardy/questions.txt
@@ -0,0 +1,100 @@
+Which man born in 1932 was the son of a percussionist in the CBS radio orchestra has been nominated for 53 Oscars?
+What work in English Literature says: 'The mind is its own place, & in itself can make a heaven of hell, a hell of heaven. What matter where, if I be still the same'?
+Known for more philosophical works, he wrote the play 'La Mandragola', in which Florentines are rewarded for immoral actions?
+James Cook's account of a 1774 visit where records an object 'near 27 feet long, and upwards of 8 feet over the breast or shoulders'?
+England's 'Bloody Assizes' & a 1685 life sentence for perjury were 2 main origins of which amendment to the U.S. Constitution?
+Which Nobel Peace Prize winners each lived at times on Vilakazi St. in Soweto, so it claims to be the world's only street home to 2 Nobel Peace Prize winners?
+In 1966, the year of whose death did he share plans for an experimental prototype community in Florida?
+Of the 13 nations through which the Equator passes, what is the only one whose coastline borders the Caribbean Sea?
+Which decorative items in fashion history get their name from their origin in the port city of Strasbourg, on the border of France & Germany?
+What 1980's movie is based on an off-Broadway play with just 3 characters and won the Best Picture Oscar & the actors in all 3 roles were nominated?
+A 2012 book review for which novelist noted subjects that 'sparked his ire': capital punishment, big tobacco & 'the plight of the unjustly convicted'?
+A 1940 headline about what 20th Century Eponym included 'failure', 'liability when it came to offense' & 'stout hearts no match for tanks'?
+Over 700 years after its traditional 1252 founding date, what port city became associated with a psychological response?
+The success of what brand has its roots with a hydrotherapy pump its cofounder created for his son, who had arthritis?
+In a periodical in 1807, what American Author called New York City 'Gotham, Gotham! Most enlightened of cities'?
+What symbol is a rotated V in math and a feeling of some marginalized or underrepresented people in society?
+Monty Norman, the composer of what character's theme, said the staccato riff conveyed sexiness, mystery & ruthlessness?
+What American Novelist served with an airman named Yohannan in World War II & despite what readers might think, he said he enjoyed his service?
+In what Medieval place did one of the participants in an 1170 event say, 'Let us away, knights; he will rise no more'?
+At one time a province of the Roman Empire, what African country kingdom is known to Arabic scholars as Al-Maghrib Al-Aqsa, 'the far west'?
+Congress relented in 1890 after what prospective state said it would wait 100 years rather than come in without the women?
+A writer & producer of what movie said he wanted it to be like a Western or James Bond film, 'only it takes place in the 30s'?
+In 1898 what's been called the first blockbuster art show was devoted to which artist & put on for Queen Wilhelmina's coronation?
+Part of the largest contiguous land empire during the 1200s & 1300s, today what is the world's second-largest landlocked country?
+A 2006 book was titled 'The Poem That Changed America:' What 'Fifty Years Later'?
+Backed by 14,000 troops, who invaded England to restore, in his words, its 'religion, laws, and liberties'?
+After its completion in the late 19th c., what landmark was called 'a truly tragic street lamp' & a 'high & skinny pyramid of iron ladders'?
+The busiest passenger port in the U.K., what shares its name with a capital of one of the original 13 states?
+This man made lists, perhaps to cope with depression; a set of lists he published in 1852 made whose name synonymous with a type of book?
+An 1869 presidential pardon was granted to which man, due in part to a plea by the Medical Society of Harford County, Maryland?
+Letters, pocket knives, C rations & steel helmets are among the tangible items referred to in the title of what American literature modern war classic?
+What nonfiction book has the line, 'The discovery of America…opened up fresh ground for the rising bourgeoisie'?
+A radical Republican championed what 1875 act but the Supreme Court struck it down in 1883; a new version was passed 81 years later?
+Whose brothers, Castor & Pollux, saved her after Theseus stole her away as a kid; a larger force would seek her later in life?
+Once Africa's largest country in area, what African Country dropped to third in 2011 when a portion of it declared independence?
+The ancient writer Galen said books on ships arriving to what city's port were seized, originals kept & copies returned?
+For a special 1970s cookbook, who provided one simple recipe–a can of Campbell's tomato soup & 2 cans of milk?
+Thought to descend from people of Southeast Asia, the Chamorro make up what U.S. territory’s largest ethnic group?
+In office from 2022, the president of what country has taken so many foreign trips a play on his name is 'Ferdinand Magellan Jr.'?
+In 1939 which writer lived on Toulouse Street in the French Quarter & chose the professional name that bonded him to the South?
+What National Park is named for a river indigenous people called Mi tse a-da-zi, translated by French-speaking trappers as 'Pierre Jaune'?
+In 2010 who introduced the 4-point shot, 35 feet from the basket?
+Losses over Asia in the 1960s led to the establishment of the program known as what at a San Diego naval base in 1969?
+A craft that visited what was named for Giotto, based on the story that 680 years earlier, the painter depicted it as the Star of Bethlehem?
+In World War I, 'Cistern' & 'reservoir' were suggested names for what secret invention, but the British preferred this less clumsy monosyllable?
+Until 1806, some German nobles included among their honors the title of 'Elector' for their role in selecting this personage?
+In 1904, wearing a harness, actress Nina Boucicault became the first to play what character onstage?
+Alphabetically the first German city in encyclopedias, what was also the first one taken by the Allies in World War II?
+This Sanskrit word referring to a spoken word or phrase comes from a word for 'to think'?
+1917's 'Elements of Trench Warfare' said what Old West invention was 'difficult to destroy' & 'difficult to get through'?
+Mimi Reinhard, who never learned to type using more than 2 fingers, produced what in World War II with 1,100 names, including hers?
+Poseidon carried off the maiden Theophane & turned her into a ewe; their offspring was the source of what mythical object?
+Published in 2011, P.D. James' final novel, 'Death Comes to Pemberley', was a sequel to what novel from 200 years earlier?
+5 U.S. states have 6-letter names; only which 2 west of the Mississippi River border each other?
+Originally relating to a story of suffering, what word now more commonly refers to strong emotion of any kind?
+The 2007 biopic called 'La Môme' in France, meaning 'The Kid', was released in the U.S. under what other French title?
+Returning home in 1493, Columbus stopped in the Azores at an island with what name, also something he'd lost off the Haiti coast?
+Pskov & Nizhny Novgorod are 2 of the cities that have a fortress called what?
+In the 1950s the New York Times said what author 'is writing about all lust' & his lecherous narrator 'is all of us'?
+At the winter solstice, the sun is in Sagittarius; it once appeared in what constellation, giving a geographic feature its name?
+Mike Post combined the sound of a slamming jail door, an anvil & 100 men stomping on a floor for what television series that debuted in 1990?
+Like Sir Thomas More, 3 16th century English queens are buried at what British location?
+In 1692 Increase Mather wrote, 'It were better that ten suspected' of these who 'escape, than that one innocent person be condemned'?
+The Geography Mnemonic Mimal, sometimes said to be the silhouette of a chef or elf, stands for Minnesota, Iowa, Missouri, and what other 2 states?
+What was first sold in 1908, at a price equivalent to about $27,000 today?
+The name of what author dead since 2013 now appears on books written by a former U.S. marshal & a former Apache helicopter pilot?
+The artwork once known in France as 'la tapisserie de la Reine Mathilde' is better known as what?
+In 2022 which pop star became the first woman to have a Billboard Top 10 album in 5 decades starting with the 1980s?
+In one 19th century translation, what female classic tale character 'perceived the dawn of day and ceased' speaking nearly 1,000 times?
+Ironically, though what company founded in the 1860s is Moore County, Tennessee's largest employer, Moore is a dry county?
+After a 1789 event, who wrote, 'My first determination was to seek a supply of…water at Tofoa, & afterwards to sail for Tongataboo'?
+Laurence Olivier & Ernest Borgnine were considered for the lead role & Sergio Leone to direct for what film that turned 50 in 2022?
+Until a 1903 secession, what country's contiguous territory spanned 2 continents?
+Early in her career which foreign-born author translated romance novels into Spanish, often changing the dialogue to make the heroines smarter?
+Saying it was stolen by Napoleon, self-styled Italian patriot Vincenzo Peruggia took what in 1911?
+Continuing a downward trend, in July 2022 what US body of water was at 27% capacity, its lowest level since 1937 when it was first being filled?
+Each morning which goddess began her ride in her chariot across the sky ahead of her brother Sol, or Helios?
+Until the Civil War, the Jan. 8 date of what American battle of dubious military importance but big morale value was a national holiday?
+Which children's book title character is told 'By the time you are real, most of your hair has been loved off your eyes drop out & you get shabby'?
+In a TV reunion over 40 years in the making, Dolly Parton appeared as an angel named Agnes in the final episode of what comedy in 2022?
+In an 1847 American poem what character sees her town of Grand-Pré burned, but finally reunites with her beau for a kiss before his death?
+In 2001 who published a book called 'Banging Your Head Against a Brick Wall'; in 2002, 'Existencilism'?
+The title object of what childrens book 'never looked more beautiful each strand held dozens of bright drops of early morning dew'?
+The shouts of excited children at a 1946 holiday parade are said to have inspired what perennial classic song favorite?
+Unable to make what candies perfectly round, the confectioner embraced this flawed name for the product?
+What country is home to 58 UNESCO World Heritage Sites, more than any other country; the sites include a volcano & a lagoon?
+What action movie's last line is 'If this is their idea of Christmas, I gotta be here for New Years'?
+Only 3 presidents have married while in office— John Tyler was the first & which one was the last?
+Demonstrating the dignity & humanity of Black Americans, who sat for 160 known photographs, the most of any American in the 19th century?
+Originally, which Latin 3-word phrase referred to when a doctor or apothecary substituted one medicine for another?
+The 1975 premiere of what movie comedy advertised free coconuts for the first thousand in the audience?
+A cocktail, an island & a WWII venture originally called 'Development of Substitute Materials' all bear what name?
+Which US President was sworn in twice as President within 2 years, first by his father & then later by a former U.S. President?
+A 1609 story in which an exiled king of Bulgaria creates a sea palace with his magic may have inspired the plot of what play?
+In 2009, during a 20th anniversary celebration, what landmark was called 'an edifice of fear. On Nov. 9, it became a place of joy'?
+Among what world capital's nicknames are the 'City of Classical Music' &, possibly in honor of a famous resident from 1860 to 1938, the 'City of Dreams'?
+Now meaning someone with nocturnal habits, what catches a sleeping dove in Shakespeare's 'Lucrece'?
+The stars on what country's flag represent states, 26 of them; unlike the USA's, its 'federal district' gets its own 27th star?
+What father was the only man among the 13 plaintiffs in a US class-action case filed in 1951?
+Reversing the story of what heroine she created, childrens author Patricia Maclachlan was born on the prairie but spent much of her life in New England?
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -1,3 +1,191 @@
-# main
+# llama.cpp/example/main

-TODO
+This example program allows you to use various LLaMA language models in an easy and efficient way. It is specifically designed to work with the [llama.cpp](https://github.com/ggerganov/llama.cpp) project, which provides a plain C/C++ implementation with optional 4-bit quantization support for faster, lower memory inference, and is optimized for desktop CPUs. This program can be used to perform various inference tasks with LLaMA models, including generating text based on user-provided prompts and chat-like interactions with reverse prompts.
+
+## Table of Contents
+
+1. [Quick Start](#quick-start)
+2. [Common Options](#common-options)
+3. [Input Prompts](#input-prompts)
+4. [Interaction](#interaction)
+5. [Context Management](#context-management)
+6. [Generation Flags](#generation-flags)
+7. [Performance Tuning and Memory Options](#performance-tuning-and-memory-options)
+8. [Additional Options](#additional-options)
+
+## Quick Start
+
+To get started right away, run the following command, making sure to use the correct path for the model you have:
+
+```bash
+./main -m models/7B/ggml-model.bin --prompt "Once upon a time"
+```
+
+The following command generates "infinite" text from a starting prompt (you can use `Ctrl-C` to stop it):
+
+```bash
+./main -m models/7B/ggml-model.bin --ignore-eos --n_predict -1 --keep -1 --prompt "Once upon a time"
+```
+
+For an interactive experience, try this command:
+
+```bash
+./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " --prompt $'User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:'
+```
+
+Note that the newline characters in the prompt string above only work on Linux. On Windows, you will have to use the ``--file`` option (see below) to load a multi-line prompt from file instead.
+
+## Common Options
+
+In this section, we cover the most commonly used options for running the `main` program with the LLaMA models:
+
+-   `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
+-   `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
+-   `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models.
+-   `-t N, --threads N`: Set the number of threads to use during computation. It is recommended to set this to the number of physical cores your CPU has.
+-   `-n N, --n_predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
+-   `-c N, --ctx_size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
+
+## Input Prompts
+
+The `main` program provides several ways to interact with the LLaMA models using input prompts:
+
+-   `--prompt PROMPT`: Provide a prompt directly as a command-line option.
+-   `--file FNAME`: Provide a file containing a prompt or multiple prompts.
+-   `--interactive-first`: Run the program in interactive mode and wait for input right away. (More on this below.)
+-   `--random-prompt`: Start with a randomized prompt.
+
+## Interaction
+
+The `main` program offers a seamless way to interact with LLaMA models, allowing users to engage in real-time conversations or provide instructions for specific tasks. The interactive mode can be triggered using various options, including `--interactive`, `--interactive-first`, and `--instruct`.
+
+In interactive mode, users can participate in text generation by injecting their input during the process. Users can press `Ctrl+C` at any time to interject and type their input, followed by pressing `Return` to submit it to the LLaMA model. To submit additional lines without finalizing input, users can end the current line with a backslash (`\`) and continue typing.
+
+### Interaction Options
+
+-   `-i, --interactive`: Run the program in interactive mode, allowing users to engage in real-time conversations or provide specific instructions to the model.
+-   `--interactive-first`: Run the program in interactive mode and immediately wait for user input before starting the text generation.
+-   `-ins, --instruct`: Run the program in instruction mode, which is specifically designed to work with Alpaca models that excel in completing tasks based on user instructions.
+-   `--color`: Enable colorized output to visually distinguish between prompts, user input, and generated text.
+
+By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs.
+
+### Reverse Prompts
+
+Reverse prompts are a powerful way to create a chat-like experience with a LLaMA model by pausing the text generation when specific text strings are encountered:
+
+-   `-r PROMPT, --reverse-prompt PROMPT`: Specify one or multiple reverse prompts to pause text generation and switch to interactive mode. For example, `-r "User:"` can be used to jump back into the conversation whenever it's the user's turn to speak. This helps create a more interactive and conversational experience. However, the reverse prompt doesn't work when it ends with a space.
+
+To overcome this limitation, you can use the `--in-prefix` flag to add a space or any other characters after the reverse prompt.
+
+### In-Prefix
+
+The `--in-prefix` flag is used to add a prefix to your input. Primarily, this is used to insert a space after the reverse prompt. Here's an example of how to use the `--in-prefix` flag in conjunction with the `--reverse-prompt` flag:
+
+```sh
+./main -r "User:" --in-prefix " "
+```
+
+### Instruction Mode
+
+Instruction mode is particularly useful when working with Alpaca models, which are designed to follow user instructions for specific tasks:
+
+-   `-ins, --instruct`: Enable instruction mode to leverage the capabilities of Alpaca models in completing tasks based on user-provided instructions.
+
+Technical detail: the user's input is internally prefixed with the reverse prompt (or ``### Instruction:`` as the default), and followed by ``### Response:`` (except if you just press Return without any input, to keep generating a longer response).
+
+By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs.
+
+## Context Management
+
+During text generation, LLaMA models have a limited context size, which means they can only consider a certain number of tokens from the input and generated text. When the context fills up, the model resets internally, potentially losing some information from the beginning of the conversation or instructions. Context management options help maintain continuity and coherence in these situations.
+
+### Context Size
+
+The `--ctx_size` option allows you to set the size of the prompt context used by the LLaMA models during text generation. A larger context size helps the model to better comprehend and generate responses for longer input or conversations.
+
+-   `-c N, --ctx_size N`: Set the size of the prompt context (default: 512). The LLaMA models were built with a context of 2048, which will yield the best results on longer input/inference. However, increasing the context size beyond 2048 may lead to unpredictable results.
+
+### Keep Prompt
+
+The `--keep` option allows users to retain the original prompt when the model runs out of context, ensuring a connection to the initial instruction or conversation topic is maintained.
+
+-   `--keep N`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context. By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt.
+
+By utilizing context management options like `--ctx_size` and `--keep`, you can maintain a more coherent and consistent interaction with the LLaMA models, ensuring that the generated text remains relevant to the original prompt or conversation.
+
+## Generation Flags
+
+The following options are related to controlling the text generation process, influencing the diversity, creativity, and quality of the generated text. Understanding these options will help you fine-tune the output according to your needs:
+
+### Number of Tokens to Predict
+
+-   `-n N, --n_predict N`: Set the number of tokens to predict when generating text (default: 128, -1 = infinity).
+
+The `--n_predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text. A value of -1 will cause text to be generated without limit.
+
+It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode, text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `n_predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the ``--ignore-eos`` parameter.
+
+### RNG Seed
+
+-   `-s SEED, --seed SEED`: Set the random number generator (RNG) seed (default: -1).
+
+The RNG seed is used to initialize the random number generator that influences the text generation process. By setting a specific seed value, you can obtain consistent and reproducible results across multiple runs with the same input and settings. This can be helpful for testing, debugging, or comparing the effects of different options on the generated text to see when they diverge. If the seed is set to a value less than or equal to 0, a random seed will be used, which will result in different outputs on each run.
+
+### Temperature
+
+-   `--temp N`: Adjust the randomness of the generated text (default: 0.8).
+
+Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.
+
+Example usage: `--temp 0.8`
+
+### Repeat Penalty
+
+-   `--repeat_penalty N`: Control the repetition of token sequences in the generated text (default: 1.1).
+
+Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. The default value is 1.1.
+
+Example usage: `--repeat_penalty 1.1`
+
+### Top-K Sampling
+
+-   `--top_k N`: Limit the next token selection to the K most probable tokens (default: 40).
+
+Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top_k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text. The default value is 40.
+
+Example usage: `--top_k 40`
+
+### Top-P Sampling
+
+-   `--top_p N`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9).
+
+Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. The default value is 0.9.
+
+Example usage: `--top_p 0.9`
+
+By adjusting these options, you can control the diversity, quality, and creativity of the generated text to better suit your needs. You can experiment with different combinations of values to find the best settings for your specific use case.
+
+## Performance Tuning and Memory Options
+
+These options help improve the performance and memory usage of the LLaMA models:
+
+-   `-t N, --threads N`: Set the number of threads to use during computation. Using the correct number of threads can greatly improve performance. It is recommended to set this value to the number of physical CPU cores.
+-   `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped. This can improve performance.
+-   `--no-mmap`: Do not memory-map the model. This results in a slower load time but may reduce pageouts if you're not using `mlock`.
+-   `--memory_f32`: Use 32 bit floats instead of 16 bit floats for memory key+value, allowing higher quality inference at the cost of memory.
+-   `-b N, --batch_size N`: Set the batch size for prompt processing (default: 512). This large batch size benefits users who have BLAS installed and enabled it during the build. If you don't have BLAS enabled ("BLAS=0"), you can use a smaller number, such as 8, to see the prompt progress as it's evaluated in some situations.
+
+For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-data--run).
+
+By understanding and using these performance tuning settings, you can optimize the LLaMA model's behavior to achieve the best performance for your specific needs.
+
+## Additional Options
+
+These options provide extra functionality and customization when running the LLaMA models:
+
+-   `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated.
+-   `--verbose-prompt`: Print the prompt before generating text.
+-   `--mtest`: Test the model's functionality by running a series of tests to ensure it's working properly.
+-   `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
+-   `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -25,6 +25,7 @@
 #endif

 static console_state con_st;
+static llama_context ** g_ctx;

 static bool is_interacting = false;

@@ -36,6 +37,7 @@ void sigint_handler(int signo) {
        if (!is_interacting) {
            is_interacting=true;
        } else {
+            llama_print_timings(*g_ctx);
            _exit(130);
        }
    }
@@ -94,6 +96,7 @@ int main(int argc, char ** argv) {
 //bool is_prime(int n) {)";

    llama_context * ctx;
+    g_ctx = &ctx;

    // load the model
    {
@@ -154,6 +157,32 @@ int main(int argc, char ** argv) {
    // Add a space in front of the first character to match OG llama tokenizer behavior
    params.prompt.insert(0, 1, ' ');

+    std::string path_session = params.path_session;
+    std::vector<llama_token> session_tokens;
+
+    if (!path_session.empty()) {
+        fprintf(stderr, "%s: attempting to load saved session from %s..\n", __func__, path_session.c_str());
+
+        // probe with fopen first so that a missing file is reported as "will create" instead of as a load failure
+        FILE * fp = std::fopen(path_session.c_str(), "rb");
+        if (fp != NULL) {
+            std::fclose(fp);
+            // pass size(), not capacity(): only the first size() elements exist, and anything written past them would be zeroed by the resize below
+            session_tokens.resize(params.n_ctx);
+            size_t n_token_count_out = 0;
+            const size_t n_session_bytes = llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size(), &n_token_count_out);
+            session_tokens.resize(n_token_count_out);
+
+            if (n_session_bytes > 0) {
+                fprintf(stderr, "%s: loaded %zu bytes of session data!\n", __func__, n_session_bytes);
+            } else {
+                fprintf(stderr, "%s: could not load session file, will recreate\n", __func__);
+            }
+        } else {
+            fprintf(stderr, "%s: session file does not exist, will create\n", __func__);
+        }
+    }
+
    // tokenize the prompt
    auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);

@@ -164,6 +193,26 @@ int main(int argc, char ** argv) {
        return 1;
    }

+    // debug message about similarity of saved session, if applicable
+    size_t n_matching_session_tokens = 0;
+    if (session_tokens.size()) {
+        for (llama_token id : session_tokens) {
+            if (n_matching_session_tokens >= embd_inp.size() || id != embd_inp[n_matching_session_tokens]) {
+                break;
+            }
+            n_matching_session_tokens++;
+        }
+        if (n_matching_session_tokens >= embd_inp.size()) {
+            fprintf(stderr, "%s: session file has exact match for prompt!\n", __func__);
+        } else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
+            fprintf(stderr, "%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
+                __func__, n_matching_session_tokens, embd_inp.size());
+        } else {
+            fprintf(stderr, "%s: session file matches %zu / %zu tokens of prompt\n",
+                __func__, n_matching_session_tokens, embd_inp.size());
+        }
+    }
+
    // number of tokens to keep when resetting context
    if (params.n_keep < 0 || params.n_keep > (int)embd_inp.size() || params.instruct) {
        params.n_keep = (int)embd_inp.size();
@@ -175,12 +224,12 @@ int main(int argc, char ** argv) {

    // in instruct mode, we inject a prefix and a suffix to each input by the user
    if (params.instruct) {
-        params.interactive_start = true;
+        params.interactive_first = true;
        params.antiprompt.push_back("### Instruction:\n\n");
    }

    // enable interactive mode if reverse prompt or interactive start is specified
-    if (params.antiprompt.size() != 0 || params.interactive_start) {
+    if (params.antiprompt.size() != 0 || params.interactive_first) {
        params.interactive = true;
    }

@@ -243,15 +292,22 @@ int main(int argc, char ** argv) {
 #endif
               " - Press Return to return control to LLaMa.\n"
               " - If you want to submit another line, end your input in '\\'.\n\n");
-        is_interacting = params.interactive_start;
+        is_interacting = params.interactive_first;
    }

    bool is_antiprompt = false;
    bool input_noecho  = false;

+    // HACK - because session saving incurs a non-negligible delay, for now skip re-saving session
+    // if we loaded a session with at least 75% similarity. It's currently just used to speed up the
+    // initial prompt so it doesn't need to be an exact match.
+    bool need_to_save_session = !path_session.empty() && n_matching_session_tokens < (embd_inp.size() * 3 / 4);
+
+
    int n_past     = 0;
    int n_remain   = params.n_predict;
    int n_consumed = 0;
+    int n_session_consumed = 0;

    // the first thing we will do is to output the prompt, so set color accordingly
    set_console_color(con_st, CONSOLE_COLOR_PROMPT);
@@ -264,7 +320,7 @@ int main(int argc, char ** argv) {
            // infinite text generation via context swapping
            // if we run out of context:
            // - take the n_keep first tokens from the original prompt (via n_past)
-            // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in a batch
+            // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
            if (n_past + (int) embd.size() > n_ctx) {
                const int n_left = n_past - params.n_keep;

@@ -273,6 +329,9 @@ int main(int argc, char ** argv) {
                // insert n_left/2 tokens at the start of embd from last_n_tokens
                embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());

+                // REVIEW - stop saving session if we run out of context
+                path_session = "";
+
                //printf("\n---\n");
                //printf("resetting: '");
                //for (int i = 0; i < (int) embd.size(); i++) {
@@ -282,13 +341,48 @@ int main(int argc, char ** argv) {
                //printf("\n---\n");
            }

-            if (llama_eval(ctx, embd.data(), embd.size(), n_past, params.n_threads)) {
-                fprintf(stderr, "%s : failed to eval\n", __func__);
-                return 1;
+            // try to reuse a matching prefix from the loaded session instead of re-eval (via n_past)
+            // REVIEW
+            if (n_session_consumed < (int) session_tokens.size()) {
+                size_t i = 0; // number of leading embd tokens already covered by the loaded session
+                for ( ; i < embd.size(); i++) {
+                    if (embd[i] != session_tokens[n_session_consumed]) {
+                        session_tokens.resize(n_session_consumed);
+                        break;
+                    }
+
+                    n_past++;
+                    n_session_consumed++;
+
+                    if (n_session_consumed >= (int) session_tokens.size()) {
+                        i++; // embd[i] was matched and counted in n_past above, so include it in the erased prefix
+                        break;
+                    }
+                }
+                // drop the reused prefix so it is not evaluated a second time (no-op when i == 0)
+                embd.erase(embd.begin(), embd.begin() + i);
+            }
+
+            // evaluate tokens in batches
+            // embd is typically prepared beforehand to fit within a batch, but not always
+            for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
+                int n_eval = (int) embd.size() - i;
+                if (n_eval > params.n_batch) {
+                    n_eval = params.n_batch;
+                }
+                if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads)) {
+                    fprintf(stderr, "%s : failed to eval\n", __func__);
+                    return 1;
+                }
+                n_past += n_eval;
+            }
+
+            if (embd.size() > 0 && !path_session.empty()) {
+                session_tokens.insert(session_tokens.end(), embd.begin(), embd.end());
+                n_session_consumed = session_tokens.size();
            }
        }

-        n_past += embd.size();
        embd.clear();

        if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
@@ -298,6 +392,12 @@ int main(int argc, char ** argv) {
            const float   temp           = params.temp;
            const float   repeat_penalty = params.repeat_penalty;

+            // optionally save the session on first sample (for faster prompt loading next time)
+            if (!path_session.empty() && need_to_save_session) {
+                need_to_save_session = false;
+                llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
+            }
+
            llama_token id = 0;

            {
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -53,7 +53,13 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
        auto end_t = std::chrono::high_resolution_clock::now();
        if (i == 0) {
            const float seconds = std::chrono::duration<float>(end_t - start_t).count();
-            printf("%.2f seconds per pass - ETA %.2f hours\n", seconds, (seconds * seq_count) / (60.0*60.0));
+            printf("%.2f seconds per pass - ETA ", seconds);
+            int total_seconds = (int)(seconds * seq_count);
+            if (total_seconds >= 60*60) {
+                printf("%d hours ", total_seconds / (60*60));
+                total_seconds = total_seconds % (60*60);
+            }
+            printf("%d minutes\n", total_seconds / 60);
        }
        // We get the logits for all the tokens in the context window (params.n_ctx)
        // from llama_eval above.  Now, based on https://huggingface.co/docs/transformers/perplexity,
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -2,8 +2,18 @@
 #include "llama.h"

 #include <cstdio>
+#include <map>
 #include <string>

+static const std::map<std::string, enum llama_ftype> LLAMA_FTYPE_MAP = { // quantization names accepted as the "type" argument, mapped to their llama_ftype value; also printed in the usage text
+  {"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0},
+  {"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1},
+  {"q4_2", LLAMA_FTYPE_MOSTLY_Q4_2},
+  {"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0},
+  {"q5_1", LLAMA_FTYPE_MOSTLY_Q5_1},
+  {"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0},
+};
+
 // usage:
 //  ./quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type
 //
@@ -12,10 +22,9 @@ int main(int argc, char ** argv) {

    if (argc < 4) {
        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type [nthread]\n", argv[0]);
-        fprintf(stderr, "  type = %d - q4_0\n", LLAMA_FTYPE_MOSTLY_Q4_0);
-        fprintf(stderr, "  type = %d - q4_1\n", LLAMA_FTYPE_MOSTLY_Q4_1);
-        fprintf(stderr, "  type = %d - q4_2\n", LLAMA_FTYPE_MOSTLY_Q4_2);
-        fprintf(stderr, "  type = %d - q4_3\n", LLAMA_FTYPE_MOSTLY_Q4_3);
+        for (auto it = LLAMA_FTYPE_MAP.begin(); it != LLAMA_FTYPE_MAP.end(); it++) {
+            fprintf(stderr, "  type = \"%s\" or %d\n", it->first.c_str(), it->second);
+        }
        return 1;
    }

@@ -29,7 +38,18 @@ int main(int argc, char ** argv) {
    const std::string fname_inp = argv[1];
    const std::string fname_out = argv[2];

-    const enum llama_ftype ftype = (enum llama_ftype)atoi(argv[3]);
+    enum llama_ftype ftype;
+    if (argv[3][0] == 'q') {
+        auto it = LLAMA_FTYPE_MAP.find(argv[3]);
+        if (it == LLAMA_FTYPE_MAP.end()) {
+            fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, argv[3]);
+            return 1;
+        }
+        ftype = it->second;
+    } else {
+        ftype = (enum llama_ftype)atoi(argv[3]);
+    }
+
    int nthread = argc > 4 ? atoi(argv[4]) : 0;

    const int64_t t_main_start_us = ggml_time_us();
--- a/examples/save-load-state/CMakeLists.txt
+++ b/examples/save-load-state/CMakeLists.txt
@@ -0,0 +1,4 @@
+set(TARGET save-load-state)  # executable name, reused by the commands below
+add_executable(${TARGET} save-load-state.cpp)  # single-source example program
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})  # NOTE(review): CMAKE_THREAD_LIBS_INIT presumably comes from find_package(Threads) in a parent CMakeLists - confirm
+target_compile_features(${TARGET} PRIVATE cxx_std_11)  # require at least C++11 for this target
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -0,0 +1,176 @@
+#include <vector>
+#include <cstdio>
+#include <cstdint>
+#include <chrono>
+
+#include "common.h"
+#include "llama.h"
+
+using namespace std;
+
+// Round-trip check for llama state serialization: evaluate the prompt, dump the context state
+// to dump_state.bin, sample n_predict tokens, then restore the dump into a freshly created
+// context and sample again so that the two printed continuations can be compared.
+// Returns 0 on success and 1 on any setup, I/O or evaluation failure.
+int main(int argc, char ** argv) {
+    gpt_params params;
+    params.model = "models/llama-7B/ggml-model.bin";
+    params.seed = 42;
+    params.n_threads = 4;
+    params.repeat_last_n = 64;
+    params.prompt = "The quick brown fox";
+
+    if (gpt_params_parse(argc, argv, params) == false) {
+        return 1;
+    }
+
+    auto lparams = llama_context_default_params();
+
+    lparams.n_ctx      = params.n_ctx;
+    lparams.n_parts    = params.n_parts;
+    lparams.seed       = params.seed;
+    lparams.f16_kv     = params.memory_f16;
+    lparams.use_mmap   = params.use_mmap;
+    lparams.use_mlock  = params.use_mlock;
+
+    auto n_past = 0;
+    auto last_n_tokens_data = vector<llama_token>(params.repeat_last_n, 0);
+
+    // init
+    auto ctx = llama_init_from_file(params.model.c_str(), lparams);
+    if (ctx == NULL) {
+        fprintf(stderr, "%s : failed to load model '%s'\n", __func__, params.model.c_str());
+        return 1;
+    }
+
+    auto tokens = vector<llama_token>(params.n_ctx);
+    auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), tokens.size(), true);
+
+    if (n_prompt_tokens < 1) {
+        fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
+        llama_free(ctx);
+        return 1;
+    }
+
+    // evaluate prompt
+    if (llama_eval(ctx, tokens.data(), n_prompt_tokens, n_past, params.n_threads)) {
+        fprintf(stderr, "%s : failed to evaluate prompt\n", __func__);
+        llama_free(ctx);
+        return 1;
+    }
+
+    last_n_tokens_data.insert(last_n_tokens_data.end(), tokens.data(), tokens.data() + n_prompt_tokens);
+    n_past += n_prompt_tokens;
+
+    // Save state (rng, logits, embedding and kv_cache) to file
+    const size_t state_size = llama_get_state_size(ctx);
+    auto state_mem = vector<uint8_t>(state_size); // vector-owned, so early returns cannot leak it
+    llama_copy_state_data(ctx, state_mem.data()); // could also copy directly to memory mapped file
+    {
+        FILE * fp_write = fopen("dump_state.bin", "wb");
+        if (fp_write == NULL) {
+            fprintf(stderr, "%s : failed to open dump_state.bin for writing\n", __func__);
+            llama_free(ctx);
+            return 1;
+        }
+        const size_t n_written = fwrite(state_mem.data(), 1, state_size, fp_write);
+        fclose(fp_write);
+        if (n_written != state_size) {
+            fprintf(stderr, "%s : failed to write dump_state.bin\n", __func__);
+            llama_free(ctx);
+            return 1;
+        }
+    }
+
+    // save state (last tokens)
+    auto last_n_tokens_data_saved = vector<llama_token>(last_n_tokens_data);
+    auto n_past_saved = n_past;
+
+    // first run
+    printf("\n%s", params.prompt.c_str());
+    for (auto i = 0; i < params.n_predict; i++) {
+        auto next_token = llama_sample_top_p_top_k(
+            ctx,
+            // repetition window: the last repeat_last_n tokens, most recent one included
+            last_n_tokens_data.data() + last_n_tokens_data.size() - params.repeat_last_n,
+            params.repeat_last_n,
+            40,
+            1.0,
+            1.0,
+            1.1);
+        auto next_token_str = llama_token_to_str(ctx, next_token);
+        last_n_tokens_data.push_back(next_token);
+        printf("%s", next_token_str);
+        if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
+            fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
+            llama_free(ctx);
+            return 1;
+        }
+        n_past += 1;
+    }
+    printf("\n\n");
+
+    // free old model
+    llama_free(ctx);
+
+    // load new model
+    auto ctx2 = llama_init_from_file(params.model.c_str(), lparams);
+    if (ctx2 == NULL) {
+        fprintf(stderr, "%s : failed to load model '%s'\n", __func__, params.model.c_str());
+        return 1;
+    }
+
+    // Load state (rng, logits, embedding and kv_cache) from file
+    if (llama_get_state_size(ctx2) != state_size) {
+        // the buffer holds state_size bytes; restoring into a context of another size is unsafe
+        fprintf(stderr, "\n%s : failed to validate state size\n", __func__);
+        llama_free(ctx2);
+        return 1;
+    }
+    {
+        FILE * fp_read = fopen("dump_state.bin", "rb");
+        if (fp_read == NULL) {
+            fprintf(stderr, "%s : failed to open dump_state.bin for reading\n", __func__);
+            llama_free(ctx2);
+            return 1;
+        }
+        const size_t n_read = fread(state_mem.data(), 1, state_size, fp_read);
+        fclose(fp_read);
+        if (n_read != state_size) {
+            fprintf(stderr, "%s : failed to read dump_state.bin\n", __func__);
+            llama_free(ctx2);
+            return 1;
+        }
+    }
+    llama_set_state_data(ctx2, state_mem.data());  // could also read directly from memory mapped file
+
+    // restore state (last tokens)
+    last_n_tokens_data = last_n_tokens_data_saved;
+    n_past = n_past_saved;
+
+    // second run
+    for (auto i = 0; i < params.n_predict; i++) {
+        auto next_token = llama_sample_top_p_top_k(
+            ctx2,
+            // same window as in the first run, so both runs sample under identical conditions
+            last_n_tokens_data.data() + last_n_tokens_data.size() - params.repeat_last_n,
+            params.repeat_last_n,
+            40,
+            1.0,
+            1.0,
+            1.1);
+        auto next_token_str = llama_token_to_str(ctx2, next_token);
+        last_n_tokens_data.push_back(next_token);
+        printf("%s", next_token_str);
+        if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
+            fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
+            llama_free(ctx2);
+            return 1;
+        }
+        n_past += 1;
+    }
+    printf("\n\n");
+
+    llama_free(ctx2);
+    return 0;
+}
--- a/flake.nix
+++ b/flake.nix
@@ -30,9 +30,9 @@
            mv bin/* $out/bin/
            mv $out/bin/main $out/bin/llama

-            echo "#!${llama-python}/bin/python" > $out/bin/convert-pth-to-ggml
-            cat ${./convert-pth-to-ggml.py} >> $out/bin/convert-pth-to-ggml
-            chmod +x $out/bin/convert-pth-to-ggml
+            echo "#!${llama-python}/bin/python" > $out/bin/convert.py
+            cat ${./convert.py} >> $out/bin/convert.py
+            chmod +x $out/bin/convert.py
          '';
          meta.mainProgram = "llama";
        };
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -1,5 +1,7 @@
 #include <stdint.h>
+#include <stdio.h>
 #include <cuda_fp16.h>
+#include <atomic>
 #include "ggml-cuda.h"

 typedef uint16_t ggml_fp16_t;
@@ -22,11 +24,34 @@ static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 b

 #define QK4_2 16
 typedef struct {
-    __half d;               // delta
+    __half  d;              // delta
    uint8_t qs[QK4_2 / 2];  // nibbles / quants
 } block_q4_2;
 static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding");

+// Q5_0 block: QK5_0 quants that share one fp16 scale. Each quant is 5 bits wide:
+// its low 4 bits live in qs (two quants per byte) and its 5th bit lives in qh.
+#define QK5_0 32
+typedef struct {
+    __half d;               // delta
+    uint8_t qh[4];          // 5-th bit of quants (kept as 4 bytes; dequantize_block_q5_0 reassembles them with memcpy)
+    uint8_t qs[QK5_0 / 2];  // nibbles / quants
+} block_q5_0;
+// the struct must be exactly 2 + 4 + 16 bytes, i.e. contain no padding
+static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
+
+// Q5_1 block: QK5_1 quants that share an fp16 scale (d) and an fp16 offset (m).
+// Each quant is 5 bits wide: low 4 bits in qs (two quants per byte), 5th bit in qh.
+#define QK5_1 32
+typedef struct {
+    __half d;               // delta
+    __half m;               // min
+    uint32_t qh;            // 5-th bit of quants (bit l belongs to element l of the block)
+    uint8_t qs[QK5_1 / 2];  // nibbles / quants
+} block_q5_1;
+// the struct must be exactly 2*2 + 4 + 16 bytes, i.e. contain no padding
+static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
+
+// Q8_0 block: QK8_0 signed 8-bit quants that share one float scale.
+#define QK8_0 32
+typedef struct {
+    float   d;              // delta
+    int8_t  qs[QK8_0];      // quants
+} block_q8_0;
+// the struct must be exactly 4 + 32 bytes, i.e. contain no padding
+static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding");

 static __global__ void dequantize_block_q4_0(const void * vx, float * y) {
    const block_q4_0 * x = (const block_q4_0 *) vx;
@@ -98,19 +123,182 @@ static __global__ void dequantize_block_q4_2(const void * vx, float * y) {
    }
 }

-extern "C" {
-    __host__ void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
-        const int nb = k / QK4_0;
-        dequantize_block_q4_0<<<nb, 1, 0, stream>>>(vx, y);
-    }
+// Dequantizes one Q5_0 block: blockIdx.x selects the block inside vx and the QK5_0
+// decoded floats are stored to y[i*QK5_0 .. i*QK5_0 + QK5_0 - 1] as (q - 16)*d.
+static __global__ void dequantize_block_q5_0(const void * vx, float * y) {
+    const block_q5_0 * x = (const block_q5_0 *) vx;

-    __host__ void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
-        const int nb = k / QK4_1;
-        dequantize_block_q4_1<<<nb, 1, 0, stream>>>(vx, y);
-    }
+    const int i = blockIdx.x;

-    __host__ void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
-        const int nb = k / QK4_2;
-        dequantize_block_q4_2<<<nb, 1, 0, stream>>>(vx, y);
+    const float d = x[i].d;
+
+    const uint8_t * pp = x[i].qs;
+
+    // qh is stored as 4 raw bytes in the block, so rebuild the 32-bit mask with memcpy
+    uint32_t qh;
+    memcpy(&qh, x[i].qh, sizeof(qh));
+
+    // every byte of qs carries two quants, hence two outputs per iteration
+    for (int l = 0; l < QK5_0; l += 2) {
+        const uint8_t vi = pp[l/2];
+
+        // isolate bit l (resp. l + 1) of qh and move it to bit position 4
+        const int8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
+        const int8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;
+
+        // low nibble -> even element, high nibble -> odd element
+        const int8_t vi0 = ((vi & 0xf) | vh0);
+        const int8_t vi1 = ((vi >>  4) | vh1);
+
+        // quants are stored with a bias of 16
+        const float v0 = (vi0 - 16)*d;
+        const float v1 = (vi1 - 16)*d;
+
+        y[i*QK5_0 + l + 0] = v0;
+        y[i*QK5_0 + l + 1] = v1;
+    }
+}
+
+// Dequantizes one Q5_1 block: blockIdx.x selects the block inside vx and the QK5_1
+// decoded floats are stored to y as q*d + m, where q is the 5-bit quant assembled
+// from a nibble of qs and one bit of qh.
+static __global__ void dequantize_block_q5_1(const void * vx, float * y) {
+    const int i = blockIdx.x;
+
+    const block_q5_1 * blk = (const block_q5_1 *) vx + i;
+
+    const float    scale  = blk->d;
+    const float    offset = blk->m;
+    const uint32_t hbits  = blk->qh;
+
+    float * dst = y + i*QK5_1;
+
+    // one packed byte (two quants) per iteration
+    for (int j = 0; j < QK5_1/2; ++j) {
+        const uint8_t packed = blk->qs[j];
+
+        // bit 2*j of hbits is the 5th bit of the even element, bit 2*j + 1 of the odd one
+        const int8_t hi0 = ((hbits >> (2*j + 0)) & 1) << 4;
+        const int8_t hi1 = ((hbits >> (2*j + 1)) & 1) << 4;
+
+        const int8_t q0 = (packed & 0xf) | hi0;
+        const int8_t q1 = (packed >>  4) | hi1;
+
+        dst[2*j + 0] = q0*scale + offset;
+        dst[2*j + 1] = q1*scale + offset;
+    }
+}
+
+// Dequantizes one Q8_0 block: blockIdx.x selects the block inside vx and each of its
+// QK8_0 signed 8-bit quants is multiplied by the block scale and stored to y.
+static __global__ void dequantize_block_q8_0(const void * vx, float * y) {
+    const int i = blockIdx.x;
+
+    const block_q8_0 * blk = (const block_q8_0 *) vx + i;
+
+    const float scale = blk->d;
+
+    float * dst = y + i*QK8_0;
+
+    for (int j = 0; j < QK8_0; ++j) {
+        const int8_t q = blk->qs[j];
+
+        dst[j] = q*scale;
+    }
+}
+
+// Enqueues the Q4_0 dequantization of a k-element row on `stream`:
+// one single-thread CUDA block per QK4_0-element quant block (k / QK4_0, rounded down).
+void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
+    dequantize_block_q4_0<<<k / QK4_0, 1, 0, stream>>>(vx, y);
+}
+
+// Enqueues the Q4_1 dequantization of a k-element row on `stream`:
+// one single-thread CUDA block per QK4_1-element quant block (k / QK4_1, rounded down).
+void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
+    dequantize_block_q4_1<<<k / QK4_1, 1, 0, stream>>>(vx, y);
+}
+
+// Enqueues the Q4_2 dequantization of a k-element row on `stream`:
+// one single-thread CUDA block per QK4_2-element quant block (k / QK4_2, rounded down).
+void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
+    dequantize_block_q4_2<<<k / QK4_2, 1, 0, stream>>>(vx, y);
+}
+
+// Enqueues the Q5_0 dequantization of a k-element row on `stream`:
+// one single-thread CUDA block per QK5_0-element quant block (k / QK5_0, rounded down).
+void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
+    dequantize_block_q5_0<<<k / QK5_0, 1, 0, stream>>>(vx, y);
+}
+
+// Enqueues the Q5_1 dequantization of a k-element row on `stream`:
+// one single-thread CUDA block per QK5_1-element quant block (k / QK5_1, rounded down).
+void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
+    dequantize_block_q5_1<<<k / QK5_1, 1, 0, stream>>>(vx, y);
+}
+
+// Enqueues the Q8_0 dequantization of a k-element row on `stream`:
+// one single-thread CUDA block per QK8_0-element quant block (k / QK8_0, rounded down).
+void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
+    dequantize_block_q8_0<<<k / QK8_0, 1, 0, stream>>>(vx, y);
+}
+
+// buffer pool for cuda
+#define MAX_CUDA_BUFFERS 16
+
+// Scope guard around a std::atomic_flag used as a spin lock: the constructor busy-waits
+// until it manages to set the flag, the destructor clears it again.
+struct scoped_spin_lock {
+    std::atomic_flag& lock;
+
+    // a guard owns the lock for exactly one scope, so it must not be copied
+    scoped_spin_lock(const scoped_spin_lock&) = delete;
+    scoped_spin_lock& operator=(const scoped_spin_lock&) = delete;
+
+    scoped_spin_lock(std::atomic_flag& lock) : lock(lock) {
+        // test_and_set returns the previous value: false means we are the one who took the lock
+        for (;;) {
+            if (!lock.test_and_set(std::memory_order_acquire)) {
+                break;
+            }
+        }
+    }
+
+    ~scoped_spin_lock() {
+        lock.clear(std::memory_order_release);
+    }
+};
+
+// One slot of the buffer pool: a device pointer handed back through ggml_cuda_pool_free
+// together with the size recorded for it. ptr == nullptr marks an empty slot.
+struct cuda_buffer {
+    void * ptr = nullptr;
+    size_t size = 0;
+};
+
+// fixed-size pool of cached device buffers; every access goes through g_cuda_pool_lock
+static cuda_buffer g_cuda_buffer_pool[MAX_CUDA_BUFFERS];
+static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
+
+// Returns a device buffer of at least `size` bytes.
+// The first cached buffer that is large enough is taken out of the pool; if none fits,
+// a new buffer is allocated with cudaMalloc. *actual_size receives the real size of the
+// returned buffer, which is the value to pass back to ggml_cuda_pool_free.
+void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
+    scoped_spin_lock lock(g_cuda_pool_lock);
+
+    for (cuda_buffer & slot : g_cuda_buffer_pool) {
+        // skip empty slots and buffers that are too small
+        if (slot.ptr == nullptr || slot.size < size) {
+            continue;
+        }
+
+        void * cached = slot.ptr;
+        *actual_size = slot.size;
+
+        // mark the slot as empty again
+        slot.ptr  = nullptr;
+        slot.size = 0;
+
+        return cached;
+    }
+
+    // nothing reusable: allocate exactly what was requested
+    void * fresh;
+    CUDA_CHECK(cudaMalloc((void **) &fresh, size));
+    *actual_size = size;
+    return fresh;
+}
+
+// Hands a device buffer back to the pool.
+// The buffer is parked in the first empty slot together with `size`; when every slot is
+// occupied a warning is printed and the buffer is released with cudaFree instead.
+void ggml_cuda_pool_free(void * ptr, size_t size) {
+    scoped_spin_lock lock(g_cuda_pool_lock);
+
+    for (cuda_buffer & slot : g_cuda_buffer_pool) {
+        if (slot.ptr != nullptr) {
+            continue; // slot already holds a cached buffer
+        }
+
+        slot.ptr  = ptr;
+        slot.size = size;
+        return;
+    }
+
+    fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
+    CUDA_CHECK(cudaFree(ptr));
+}
+
+cublasHandle_t g_cublasH = NULL;
+cudaStream_t g_cudaStream = NULL;
+
+// Creates the global cuBLAS handle and a non-blocking CUDA stream, and binds the stream
+// to the handle. Does nothing when g_cublasH is already set, so repeated calls are harmless.
+// Any CUDA / cuBLAS failure terminates the process through CUDA_CHECK / CUBLAS_CHECK.
+// NOTE(review): the NULL check is not protected by a lock - confirm this is only called from one thread.
+void ggml_init_cublas(void) {
+    if (g_cublasH == NULL) {
+        // create cublas handle, bind a stream
+        CUBLAS_CHECK(cublasCreate(&g_cublasH));
+
+        CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStream, cudaStreamNonBlocking));
+
+        CUBLAS_CHECK(cublasSetStream(g_cublasH, g_cudaStream));
+
+        // configure logging to stdout
+        // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, NULL));
    }
 }
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@@ -1,10 +1,42 @@
+#include <cublas_v2.h>
+#include <cuda_runtime.h>
+
 #ifdef  __cplusplus
 extern "C" {
 #endif

+// Evaluates a CUDA runtime call once; on any result other than cudaSuccess prints the
+// error code, source location and cudaGetErrorString text to stderr and exits with status 1.
+// Uses fprintf and exit, so <stdio.h> and <stdlib.h> must be visible where the macro is expanded.
+#define CUDA_CHECK(err)                                                                 \
+    do {                                                                                \
+        cudaError_t err_ = (err);                                                       \
+        if (err_ != cudaSuccess) {                                                      \
+            fprintf(stderr, "CUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__,   \
+                cudaGetErrorString(err_));                                              \
+            exit(1);                                                                    \
+        }                                                                               \
+    } while (0)
+
+// Evaluates a cuBLAS call once; on any status other than CUBLAS_STATUS_SUCCESS prints the
+// numeric status and source location to stderr and exits with status 1.
+// Uses fprintf and exit, so <stdio.h> and <stdlib.h> must be visible where the macro is expanded.
+#define CUBLAS_CHECK(err)                                                               \
+    do {                                                                                \
+        cublasStatus_t err_ = (err);                                                    \
+        if (err_ != CUBLAS_STATUS_SUCCESS) {                                            \
+            fprintf(stderr, "cuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__);    \
+            exit(1);                                                                    \
+        }                                                                               \
+    } while (0)
+
+extern cublasHandle_t g_cublasH;
+extern cudaStream_t   g_cudaStream;
+
+void   ggml_init_cublas(void);
+void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size);
+void   ggml_cuda_pool_free(void * ptr, size_t size);
+
 void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
 void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
 void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);

 #ifdef  __cplusplus
 }
--- a/ggml-opencl-dequant.cl
+++ b/ggml-opencl-dequant.cl
@@ -0,0 +1,63 @@
+#define MULTILINE_QUOTE(...) #__VA_ARGS__
+const char * clblast_dequant = MULTILINE_QUOTE(
+
+// Q4_0 block: one float scale followed by 16 bytes holding 32 packed 4-bit quants.
+struct block_q4_0
+{
+    float d;
+    uchar qs[16];
+};
+
+// Each work-item decodes one byte of a block into two floats: value = (nibble - 8) * d.
+// The block is chosen from the global id, the byte inside the block from the local id.
+__kernel void dequantize_row_q4_0(__global struct block_q4_0* blocks, __global float* result) {
+    const uint block_idx = get_global_id(0) / 32;
+    const uint byte_idx  = get_local_id(0);
+
+    const float scale  = blocks[block_idx].d;
+    const uchar packed = blocks[block_idx].qs[byte_idx];
+
+    const int lo = (packed & 0xf) - 8;
+    const int hi = (packed >> 4) - 8;
+
+    const uint out = block_idx*32 + byte_idx*2;
+    result[out + 0] = lo*scale;
+    result[out + 1] = hi*scale;
+}
+
+// Q4_1 block: float scale, float offset, then 16 bytes holding 32 packed 4-bit quants.
+struct block_q4_1
+{
+    float d;
+    float m;
+    uchar qs[16];
+};
+
+// Each work-item decodes one byte of a block into two floats: value = nibble * d + m.
+// The block is chosen from the global id, the byte inside the block from the local id.
+__kernel void dequantize_row_q4_1(__global struct block_q4_1* blocks, __global float* result) {
+    const uint block_idx = get_global_id(0) / 32;
+    const uint byte_idx  = get_local_id(0);
+
+    const float scale  = blocks[block_idx].d;
+    const float offset = blocks[block_idx].m;
+    const uchar packed = blocks[block_idx].qs[byte_idx];
+
+    const uint out = block_idx*32 + byte_idx*2;
+    result[out + 0] = (packed & 0xf) * scale + offset;
+    result[out + 1] = (packed >> 4) * scale + offset;
+}
+
+// Q4_2 block: a half-precision scale stored as raw 16 bits, then 8 bytes holding 16 packed 4-bit quants.
+struct block_q4_2
+{
+    ushort d;
+    uchar qs[8];
+};
+
+// Each work-item decodes one byte of a block into two floats: value = (nibble - 8) * d.
+// The block is chosen from the global id, the byte inside the block from the local id.
+__kernel void dequantize_row_q4_2(__global struct block_q4_2* blocks, __global float* result) {
+    const uint block_idx = get_global_id(0) / 16;
+    const uint byte_idx  = get_local_id(0);
+
+    // the scale is kept as ushort bits, vload_half converts it to float
+    const float scale  = vload_half(0, (__global half*) &blocks[block_idx].d);
+    const uchar packed = blocks[block_idx].qs[byte_idx];
+
+    const int lo = (packed & 0xf) - 8;
+    const int hi = (packed >> 4) - 8;
+
+    const uint out = block_idx*16 + byte_idx*2;
+    result[out + 0] = lo*scale;
+    result[out + 1] = hi*scale;
+}
+
+);
--- a/ggml-opencl.c
+++ b/ggml-opencl.c
@@ -0,0 +1,208 @@
+#include "ggml-opencl.h"
+
+#define CL_TARGET_OPENCL_VERSION 110
+#include <clblast_c.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ggml.h"
+
+#include "ggml-opencl-dequant.cl"
+
+// Evaluates an OpenCL status once; on anything other than CL_SUCCESS prints `name`
+// (a string identifying the failed call), the numeric error and the source location
+// to stderr and exits with status 1.
+#define CL_CHECK(err, name)                                                                     \
+    do {                                                                                        \
+        cl_int err_ = (err);                                                                    \
+        if (err_ != CL_SUCCESS) {                                                               \
+            fprintf(stderr, "OpenCL %s error %d at %s:%d\n", name, err_, __FILE__, __LINE__);   \
+            exit(1);                                                                            \
+        }                                                                                       \
+    } while (0)
+
+static cl_platform_id platform;
+static cl_device_id device;
+static cl_context context;
+static cl_command_queue queue;
+static cl_program program;
+static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q4_2;
+static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c;
+static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0;
+
+// Creates an OpenCL program from the NUL-terminated source `program_buffer` in context `ctx`
+// and builds it. On failure the process exits with status 1 after reporting the error on
+// stderr; for a build failure the build log of device `dev` is printed as well.
+// Returns the built program.
+static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) {
+    cl_int err;
+    const size_t program_size = strlen(program_buffer);
+
+    cl_program p = clCreateProgramWithSource(ctx, 1, (const char**)&program_buffer, &program_size, &err);
+    if (err != CL_SUCCESS) {
+        fprintf(stderr, "OpenCL error %d creating program\n", err);
+        exit(1);
+    }
+
+    err = clBuildProgram(p, 0, NULL, NULL, NULL, NULL);
+    if (err != CL_SUCCESS) {
+        fprintf(stderr, "OpenCL error %d building program\n", err);
+
+        // first query the size of the build log, then fetch it
+        size_t log_size = 0;
+        clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
+
+        char * program_log = (char*) malloc(log_size + 1);
+        if (program_log != NULL) {
+            // terminate up front so the buffer is printable even if the query below fails
+            program_log[0] = '\0';
+            program_log[log_size] = '\0';
+            clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, log_size + 1, program_log, NULL);
+            fprintf(stderr, "%s\n", program_log);
+            free(program_log);
+        }
+        exit(1);
+    }
+
+    return p;
+}
+
+// Initializes the file-level OpenCL state (platform, device, context, queue, program and
+// the three dequantize kernels).
+//
+// The platform and device are selected by index through the environment variables
+// GGML_CLBLAST_PLATFORM and GGML_CLBLAST_DEVICE (both default to 0). An index that does
+// not exist, or any failing OpenCL call, is reported on stderr and terminates the process.
+void ggml_cl_init(void) {
+    cl_int err = 0;
+    char * GGML_CLBLAST_PLATFORM = getenv("GGML_CLBLAST_PLATFORM");
+    char * GGML_CLBLAST_DEVICE = getenv("GGML_CLBLAST_DEVICE");
+    int plat_num = (GGML_CLBLAST_PLATFORM == NULL ? 0 : atoi(GGML_CLBLAST_PLATFORM));
+    int dev_num = (GGML_CLBLAST_DEVICE == NULL ? 0 : atoi(GGML_CLBLAST_DEVICE));
+    printf("\nInitializing CLBlast (First Run)...");
+    printf("\nAttempting to use: Platform=%d, Device=%d (If invalid, program will crash)\n",plat_num,dev_num);
+
+    // enumerate the platforms and validate the requested index before using it
+    cl_uint num_platforms = 0;
+    err = clGetPlatformIDs(0, NULL, &num_platforms);
+    CL_CHECK(err, "clGetPlatformIDs");
+    if (plat_num < 0 || (cl_uint) plat_num >= num_platforms) {
+        fprintf(stderr, "Error: OpenCL platform %d is not available (%u found)\n", plat_num, (unsigned) num_platforms);
+        exit(1);
+    }
+    cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id));
+    if (platforms == NULL) {
+        fprintf(stderr, "Error: out of memory while listing OpenCL platforms\n");
+        exit(1);
+    }
+    err = clGetPlatformIDs(num_platforms, platforms, NULL);
+    CL_CHECK(err, "clGetPlatformIDs");
+    platform = platforms[plat_num];
+    free(platforms);
+
+    // the name is only used for the log line below; keep it printable if the query fails
+    char platform_buffer[1024] = "";
+    clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_buffer), platform_buffer, NULL);
+
+    // same procedure for the devices of the chosen platform
+    cl_uint num_devices = 0;
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
+    CL_CHECK(err, "clGetDeviceIDs");
+    if (dev_num < 0 || (cl_uint) dev_num >= num_devices) {
+        fprintf(stderr, "Error: OpenCL device %d is not available on platform %d (%u found)\n", dev_num, plat_num, (unsigned) num_devices);
+        exit(1);
+    }
+    cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
+    if (devices == NULL) {
+        fprintf(stderr, "Error: out of memory while listing OpenCL devices\n");
+        exit(1);
+    }
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
+    CL_CHECK(err, "clGetDeviceIDs");
+    device = devices[dev_num];
+    free(devices);
+
+    char device_buffer[1024] = "";
+    clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_buffer), device_buffer, NULL);
+    printf("Using Platform: %s Device: %s\n", platform_buffer, device_buffer);
+
+    context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
+    CL_CHECK(err, "clCreateContext");
+    queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
+    CL_CHECK(err, "clCreateCommandQueue");
+
+    program = build_program_from_source(context, device, clblast_dequant);
+
+    // Prepare dequantize kernels
+    kernel_q4_0 = clCreateKernel(program, "dequantize_row_q4_0", &err);
+    CL_CHECK(err, "clCreateKernel");
+    kernel_q4_1 = clCreateKernel(program, "dequantize_row_q4_1", &err);
+    CL_CHECK(err, "clCreateKernel");
+    kernel_q4_2 = clCreateKernel(program, "dequantize_row_q4_2", &err);
+    CL_CHECK(err, "clCreateKernel");
+}
+
+// Makes sure *buf can hold req_size bytes.
+// A buffer that is already large enough (per *cur_size) is kept untouched; otherwise the
+// old buffer, if any, is released and a new one of exactly req_size bytes is created with
+// `flags`. *cur_size is updated to the new size. Exits through CL_CHECK on failure.
+static void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_mem* buf) {
+    const bool needs_grow = req_size > *cur_size;
+
+    if (needs_grow) {
+        // a size of 0 means nothing has been allocated yet, so there is nothing to release
+        if (*cur_size != 0) {
+            clReleaseMemObject(*buf);
+        }
+
+        cl_int status;
+        *buf = clCreateBuffer(context, flags, req_size, NULL, &status);
+        *cur_size = req_size;
+        CL_CHECK(status, "clCreateBuffer");
+    }
+}
+
+// Computes C = alpha * op(A) * op(B) + beta * C on the OpenCL device through CLBlast and
+// reads the result back into host_c before returning.
+//
+//   order, trans_a, trans_b  layout / transpose selectors, cast directly to the CLBlast enums
+//   m, n, k                  GEMM dimensions; host_a is uploaded as m*k floats
+//   host_b                   n*k values, either floats (btype == GGML_TYPE_F32) or quantized
+//                            blocks (Q4_0 / Q4_1 / Q4_2) that are dequantized on the device
+//   host_c                   receives m*n floats
+//   btype                    ggml type of host_b; any other type aborts
+//
+// Every failing OpenCL / CLBlast call terminates the process.
+// NOTE(review): host_c is never uploaded to the device, so a non-zero beta would combine
+// with whatever cl_buffer_c currently holds - confirm callers always pass beta == 0.
+void ggml_cl_sgemm_wrapper(
+        const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b,
+        const int m, const int n, const int k,
+        const float alpha, const void *host_a, const int lda,
+        const float *host_b, const int ldb, const float beta,
+        float *host_c, const int ldc, const int btype) {
+    cl_int err = 0;
+
+    // only meaningful on the dequantize path; initialized so the F32 path never carries garbage
+    cl_kernel kernel = NULL;
+    size_t local = 0, size_qb = 0;
+    // widen before multiplying so large matrices do not overflow int
+    size_t global = (size_t) n * k;
+    bool dequant;
+
+    switch (btype) {
+    case GGML_TYPE_F32:
+        dequant = false;
+        break;
+    case GGML_TYPE_Q4_0:
+        dequant = true;
+        kernel = kernel_q4_0;
+        local = 16;
+        size_qb = global * (sizeof(float) + local) / 32;
+        break;
+    case GGML_TYPE_Q4_1:
+        dequant = true;
+        kernel = kernel_q4_1;
+        local = 16;
+        size_qb = global * (sizeof(float) * 2 + local) / 32;
+        break;
+    case GGML_TYPE_Q4_2:
+        dequant = true;
+        kernel = kernel_q4_2;
+        local = 8;
+        size_qb = global * (sizeof(short) + local) / 16;
+        break;
+    default:
+        fprintf(stderr, "Error: Unsupported OpenCL btype %d\n", btype);
+        abort();
+    }
+
+    const size_t size_a = (size_t) m * k * sizeof(float);
+    const size_t size_b = (size_t) n * k * sizeof(float);
+    const size_t size_c = (size_t) m * n * sizeof(float);
+
+    // Prepare buffers
+    ggml_cl_malloc(size_a, &cl_size_a, CL_MEM_READ_ONLY, &cl_buffer_a);
+    if (dequant) {
+        ggml_cl_malloc(size_qb, &cl_size_qb, CL_MEM_READ_ONLY, &cl_buffer_qb);
+    }
+    ggml_cl_malloc(size_b, &cl_size_b, CL_MEM_READ_WRITE, &cl_buffer_b);
+    ggml_cl_malloc(size_c, &cl_size_c, CL_MEM_WRITE_ONLY, &cl_buffer_c);
+
+    cl_event ev_a, ev_qb, ev_b;
+
+    // upload B: either the raw floats, or the quantized blocks that the kernel expands into cl_buffer_b
+    if (dequant) {
+        err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_buffer_qb);
+        err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_buffer_b);
+        CL_CHECK(err, "clSetKernelArg");
+        err = clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb);
+        CL_CHECK(err, "clEnqueueWriteBuffer");
+    } else {
+        err = clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b);
+        CL_CHECK(err, "clEnqueueWriteBuffer");
+    }
+
+    err = clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a);
+    CL_CHECK(err, "clEnqueueWriteBuffer");
+    if (dequant) {
+        // the kernel waits for the quantized upload and signals ev_b once cl_buffer_b is filled
+        err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 1, &ev_qb, &ev_b);
+        CL_CHECK(err, "clEnqueueNDRangeKernel");
+        clReleaseEvent(ev_qb);
+    }
+    // both inputs must be on the device before the GEMM is enqueued
+    clWaitForEvents(1, &ev_a);
+    clWaitForEvents(1, &ev_b);
+    clReleaseEvent(ev_a);
+    clReleaseEvent(ev_b);
+
+    cl_event ev_sgemm;
+    const CLBlastStatusCode status = CLBlastSgemm((CLBlastLayout)order,
+                                                  (CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b,
+                                                  m, n, k,
+                                                  alpha,
+                                                  cl_buffer_a, 0, lda,
+                                                  cl_buffer_b, 0, ldb,
+                                                  beta,
+                                                  cl_buffer_c, 0, ldc,
+                                                  &queue, &ev_sgemm);
+    // on failure ev_sgemm is not a valid event, so it must not reach the wait list below
+    if (status != CLBlastSuccess) {
+        fprintf(stderr, "Error: CLBlast SGEMM %d\n", (int) status);
+        abort();
+    }
+
+    cl_event ev_c;
+    err = clEnqueueReadBuffer(queue, cl_buffer_c, CL_TRUE, 0, size_c, host_c, 1, &ev_sgemm, &ev_c);
+    CL_CHECK(err, "clEnqueueReadBuffer");
+
+    // Wait for completion
+    clWaitForEvents(1, &ev_c);
+    clReleaseEvent(ev_sgemm);
+    clReleaseEvent(ev_c);
+}
--- a/ggml-opencl.h
+++ b/ggml-opencl.h
@@ -0,0 +1,24 @@
+#pragma once
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+void ggml_cl_init(void);
+
+// Matrix storage order passed to ggml_cl_sgemm_wrapper.
+// The numeric values matter: the wrapper casts them directly to CLBlastLayout.
+enum ggml_blas_order {
+    GGML_BLAS_ORDER_ROW_MAJOR = 101,
+    GGML_BLAS_ORDER_COLUMN_MAJOR = 102,
+};
+
+// Per-operand transpose selector passed to ggml_cl_sgemm_wrapper.
+// The numeric values matter: the wrapper casts them directly to CLBlastTranspose.
+// NOTE(review): presumably N = no transpose, T = transpose, C = conjugate transpose - confirm against CLBlast.
+enum ggml_blas_op {
+    GGML_BLAS_OP_N = 111,
+    GGML_BLAS_OP_T = 112,
+    GGML_BLAS_OP_C = 113,
+};
+
+void ggml_cl_sgemm_wrapper(const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b, const int m, const int n, const int k, const float alpha, const void *host_a, const int lda, const float *host_b, const int ldb, const float beta, float *host_c, const int ldc, const int btype);
+
+#ifdef  __cplusplus
+}
+#endif
--- a/ggml.c
+++ b/ggml.c
--- a/ggml.h
+++ b/ggml.h
--- a/llama.cpp
+++ b/llama.cpp
@@ -27,6 +27,7 @@
 #include <thread>
 #include <atomic>
 #include <mutex>
+#include <sstream>

 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16
@@ -53,7 +54,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
        { MODEL_7B,    512ull * MB },
        { MODEL_13B,   512ull * MB },
        { MODEL_30B,   512ull * MB },
-        { MODEL_65B,   512ull * MB },
+        { MODEL_65B,  1024ull * MB },
    };
    return _MEM_REQ_SCRATCH0;
 }
@@ -64,10 +65,10 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
        { MODEL_7B,    512ull * MB },
        { MODEL_13B,   512ull * MB },
        { MODEL_30B,   512ull * MB },
-        { MODEL_65B,   512ull * MB },
+        { MODEL_65B,  1024ull * MB },
    };
    return _MEM_REQ_SCRATCH1;
-};
+}

 // 2*n_embd*n_ctx*n_layer*sizeof(float16)
 static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
@@ -79,7 +80,7 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
        { MODEL_65B,  5120ull * MB },
    };
    return _MEM_REQ_KV_SELF;
-};
+}

 // this is mostly needed for temporary mul_mat buffers to dequantize the data
 // not actually needed if BLAS is disabled
@@ -92,7 +93,7 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
        { MODEL_65B, 1536ull * MB },
    };
    return _MEM_REQ_EVAL;
-};
+}

 // default hparams (LLaMA 7B)
 struct llama_hparams {
@@ -482,7 +483,9 @@ struct llama_file_loader {
                case GGML_TYPE_Q4_0:
                case GGML_TYPE_Q4_1:
                case GGML_TYPE_Q4_2:
-                case GGML_TYPE_Q4_3:
+                case GGML_TYPE_Q5_0:
+                case GGML_TYPE_Q5_1:
+                case GGML_TYPE_Q8_0:
                    break;
                default: {
                    throw format("unrecognized tensor type %u\n", shard.type);
@@ -556,7 +559,9 @@ struct llama_file_saver {
            case GGML_TYPE_Q4_0:
            case GGML_TYPE_Q4_1:
            case GGML_TYPE_Q4_2:
-            case GGML_TYPE_Q4_3:
+            case GGML_TYPE_Q5_0:
+            case GGML_TYPE_Q5_1:
+            case GGML_TYPE_Q8_0:
                break;
            default: LLAMA_ASSERT(false);
        }
@@ -846,7 +851,9 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
        case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
                                      return "mostly Q4_1, some F16";
        case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
-        case LLAMA_FTYPE_MOSTLY_Q4_3: return "mostly Q4_3";
+        case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
+        case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
+        case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
        default:                      return "unknown, may not work";
    }
 }
@@ -1075,7 +1082,7 @@ static bool llama_eval_internal(
    // for big prompts, if BLAS is enabled, it is better to use only one thread
    // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
    ggml_cgraph gf = {};
-    gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_cublas() ? 1 : n_threads;
+    gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;

    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
    memcpy(embd->data, tokens, N*ggml_element_size(embd));
@@ -1249,9 +1256,11 @@ static bool llama_eval_internal(
    ggml_build_forward_expand(&gf, inpL);
    ggml_graph_compute       (ctx0, &gf);

+#ifdef GGML_PERF
    // print timing information per ggml operation (for debugging purposes)
    // requires GGML_PERF to be defined
-    //ggml_graph_print(&gf);
+    ggml_graph_print(&gf);
+#endif

    // plot the computation graph in dot format (for debugging purposes)
    //if (n_past%100 == 0) {
@@ -1581,7 +1590,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
        case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
        case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
-        case LLAMA_FTYPE_MOSTLY_Q4_3: quantized_type = GGML_TYPE_Q4_3; break;
+        case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
+        case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
+        case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
        default: throw format("invalid output file type %d\n", ftype);
    };

@@ -1618,6 +1629,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        // quantize only 2D tensors
        quantize &= (tensor.ne.size() == 2);

+        // uncomment this to keep the output layer in FP16
+        //if (tensor.name == "output.weight") {
+        //    quantize = false;
+        //}
+
        enum ggml_type new_type;
        void * new_data;
        size_t new_size;
@@ -1782,7 +1798,7 @@ struct llama_context * llama_init_from_file(
        if (params.logits_all) {
            ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
        } else {
-            ctx->logits.reserve(hparams.n_ctx);
+            ctx->logits.reserve(hparams.n_vocab);
        }

        if (params.embedding){
@@ -2064,31 +2080,198 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
    }
 }

-// Returns the KV cache that will contain the context for the
-// ongoing prediction with the model.
-const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
-    return ctx->model.kv_self.buf.addr;
-}
-
-// Returns the size of the KV cache
-size_t llama_get_kv_cache_size(struct llama_context * ctx) {
-    return ctx->model.kv_self.buf.size;
-}
-
 int llama_get_kv_cache_token_count(struct llama_context * ctx) {
    return ctx->model.kv_self.n;
 }

-// Sets the KV cache containing the current context for the model
-void llama_set_kv_cache(
-        struct llama_context * ctx,
-               const uint8_t * kv_cache,
-                      size_t   n_size,
-                         int   n_token_count) {
-    // Make sure we have the same kv cache setup
-    LLAMA_ASSERT(ctx->model.kv_self.buf.size == n_size);
-    memcpy(ctx->model.kv_self.buf.addr, kv_cache, n_size);
-    ctx->model.kv_self.n = n_token_count;
+// size in bytes of the fixed slot reserved for the serialized rng state
+// (parenthesized so the macro stays a single operand inside larger expressions)
+#define LLAMA_MAX_RNG_STATE (64*1024)
+
+// Re-seeds the rng of the context. A seed <= 0 is replaced by the current time.
+void llama_set_rng_seed(struct llama_context * ctx, int seed) {
+    const int effective_seed = seed > 0 ? seed : (int) time(NULL);
+    ctx->rng.seed(effective_seed);
+}
+
+// Returns the size in bytes of the buffer needed by llama_copy_state_data
+size_t llama_get_state_size(struct llama_context * ctx) {
+    size_t total = 0;
+
+    // rng: length prefix + fixed-size slot.
+    // the serialized size is unknown until the rng is actually written, so more than enough
+    // room is reserved (for reference, std::mt19937(1337) serializes to 6701 bytes)
+    total += sizeof(size_t);
+    total += LLAMA_MAX_RNG_STATE;
+
+    // logits: capacity + size prefixes, then room for `capacity` floats
+    total += sizeof(size_t);
+    total += sizeof(size_t);
+    total += ctx->logits.capacity() * sizeof(float);
+
+    // embedding: size prefix, then the floats
+    total += sizeof(size_t);
+    total += ctx->embedding.size() * sizeof(float);
+
+    // kv cache: buffer size + token count prefixes, then the raw buffer
+    total += sizeof(size_t);
+    total += sizeof(int);
+    total += ctx->model.kv_self.buf.size;
+
+    return total;
+}
+
+// Copies the state to the specified destination address
+//
+// Layout written to dest (read back by llama_set_state_data):
+//   rng       : size_t length, then LLAMA_MAX_RNG_STATE bytes (serialized text, zero padded)
+//   logits    : size_t capacity, size_t size, then `capacity` floats (only the first `size` are data)
+//   embedding : size_t size, then `size` floats
+//   kv cache  : size_t buffer size, int token count, then the raw buffer
+//
+// dest must provide at least llama_get_state_size(ctx) bytes.
+// Returns the number of bytes written, which is asserted to equal llama_get_state_size(ctx).
+size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
+    uint8_t * out = dest;
+
+    // copy rng
+    {
+        std::stringstream rng_ss;
+        rng_ss << ctx->rng;
+
+        // serialize once: str() returns a fresh copy on every call
+        const std::string rng_str  = rng_ss.str();
+        const size_t      rng_size = rng_str.size();
+
+        // the slot has a fixed size, a longer serialization would overflow it
+        LLAMA_ASSERT(rng_size <= (size_t) LLAMA_MAX_RNG_STATE);
+
+        memcpy(out, &rng_size, sizeof(rng_size)); out += sizeof(rng_size);
+
+        // fill the slot in place (zero padded) instead of staging it in a large stack buffer
+        memset(out, 0, LLAMA_MAX_RNG_STATE);
+        memcpy(out, rng_str.data(), rng_size);
+        out += LLAMA_MAX_RNG_STATE;
+    }
+
+    // copy logits
+    {
+        const size_t logits_cap  = ctx->logits.capacity();
+        const size_t logits_size = ctx->logits.size();
+
+        memcpy(out, &logits_cap,  sizeof(logits_cap));  out += sizeof(logits_cap);
+        memcpy(out, &logits_size, sizeof(logits_size)); out += sizeof(logits_size);
+
+        if (logits_size) {
+            memcpy(out, ctx->logits.data(), logits_size * sizeof(float));
+        }
+
+        // the region always spans `capacity` floats; zero the unused tail so the
+        // serialized bytes do not depend on the previous content of dest
+        if (logits_cap > logits_size) {
+            memset(out + logits_size * sizeof(float), 0, (logits_cap - logits_size) * sizeof(float));
+        }
+
+        out += logits_cap * sizeof(float);
+    }
+
+    // copy embeddings
+    {
+        const size_t embedding_size = ctx->embedding.size();
+
+        memcpy(out, &embedding_size, sizeof(embedding_size)); out += sizeof(embedding_size);
+
+        if (embedding_size) {
+            memcpy(out, ctx->embedding.data(), embedding_size * sizeof(float));
+            out += embedding_size * sizeof(float);
+        }
+    }
+
+    // copy kv cache
+    {
+        const size_t kv_size = ctx->model.kv_self.buf.size;
+        const int    kv_ntok = llama_get_kv_cache_token_count(ctx);
+
+        memcpy(out, &kv_size, sizeof(kv_size)); out += sizeof(kv_size);
+        memcpy(out, &kv_ntok, sizeof(kv_ntok)); out += sizeof(kv_ntok);
+
+        if (kv_size) {
+            memcpy(out, ctx->model.kv_self.buf.addr, kv_size); out += kv_size;
+        }
+    }
+
+    const size_t written  = out - dest;
+    const size_t expected = llama_get_state_size(ctx);
+
+    LLAMA_ASSERT(written == expected);
+
+    return written;
+}
+
+// Sets the state reading from the specified source address
+//
+// src must point to a blob produced by llama_copy_state_data for a context with the same
+// logits capacity, embedding size and kv cache size; these are asserted while reading.
+// Sizes stored inside the blob are bounds-checked before they are used.
+// Returns the number of bytes consumed, which is asserted to equal llama_get_state_size(ctx).
+size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
+    const uint8_t * in = src;
+
+    // set rng
+    {
+        size_t rng_size;
+
+        memcpy(&rng_size, in, sizeof(rng_size)); in += sizeof(rng_size);
+
+        // the length comes from the blob: never read past the fixed-size slot
+        LLAMA_ASSERT(rng_size <= (size_t) LLAMA_MAX_RNG_STATE);
+
+        // parse straight from the source, the slot itself always spans LLAMA_MAX_RNG_STATE bytes
+        std::stringstream rng_ss;
+        rng_ss.str(std::string((const char *) in, rng_size));
+        in += LLAMA_MAX_RNG_STATE;
+
+        rng_ss >> ctx->rng;
+
+        LLAMA_ASSERT(rng_ss.fail() == false);
+    }
+
+    // set logits
+    {
+        size_t logits_cap;
+        size_t logits_size;
+
+        memcpy(&logits_cap,  in, sizeof(logits_cap));  in += sizeof(logits_cap);
+        memcpy(&logits_size, in, sizeof(logits_size)); in += sizeof(logits_size);
+
+        LLAMA_ASSERT(ctx->logits.capacity() == logits_cap);
+        // only `logits_cap` floats were serialized, a larger size would read past that region
+        LLAMA_ASSERT(logits_size <= logits_cap);
+
+        if (logits_size) {
+            ctx->logits.resize(logits_size);
+            memcpy(ctx->logits.data(), in, logits_size * sizeof(float));
+        }
+
+        in += logits_cap * sizeof(float);
+    }
+
+    // set embeddings
+    {
+        size_t embedding_size;
+
+        memcpy(&embedding_size, in, sizeof(embedding_size)); in += sizeof(embedding_size);
+
+        LLAMA_ASSERT(ctx->embedding.capacity() == embedding_size);
+
+        if (embedding_size) {
+            memcpy(ctx->embedding.data(), in, embedding_size * sizeof(float));
+            in += embedding_size * sizeof(float);
+        }
+    }
+
+    // set kv cache
+    {
+        size_t kv_size;
+        int kv_ntok;
+
+        memcpy(&kv_size, in, sizeof(kv_size)); in += sizeof(kv_size);
+        memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok);
+
+        if (kv_size) {
+            LLAMA_ASSERT(ctx->model.kv_self.buf.size == kv_size);
+
+            void * k_data = ctx->model.kv_self.k->data; // remember data pointers
+            void * v_data = ctx->model.kv_self.v->data; // because their value is stored in buf and overwritten by memcpy
+
+            memcpy(ctx->model.kv_self.buf.addr, in, kv_size); in += kv_size;
+
+            ctx->model.kv_self.k->data = k_data; // restore correct data pointers
+            ctx->model.kv_self.v->data = v_data;
+        }
+
+        ctx->model.kv_self.n = kv_ntok;
+    }
+
+    const size_t nread    = in - src;
+    const size_t expected = llama_get_state_size(ctx);
+
+    LLAMA_ASSERT(nread == expected);
+
+    return nread;
 }

 int llama_eval(
@@ -2243,3 +2426,57 @@ const char * llama_print_system_info(void) {
 std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
    return ctx->model.tensors_by_name;
 }
+
+// Restore a context from a session file written by llama_save_session_file().
+// Layout read here: u32 magic ('ggsn'), u32 version (0), raw llama_hparams,
+// u32 token count, the tokens themselves, then the serialized context state.
+// On success the tokens are copied to tokens_out, their count is stored in
+// *n_token_count_out and the value returned by llama_set_state_data() is returned.
+// Returns 0 when the file is rejected: unknown magic/version, hparams that differ
+// from the loaded model, more tokens than n_token_capacity, or a state blob whose
+// size differs from llama_get_state_size(ctx).
+size_t llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+    // TODO leverage mmap
+    llama_file file(path_session, "rb");
+    const uint32_t magic   = file.read_u32();
+    const uint32_t version = file.read_u32();
+
+    if (!(magic == 'ggsn' && version == 0)) {
+        fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
+        return 0;
+    }
+
+    llama_hparams session_hparams;
+    file.read_raw(&session_hparams, sizeof(llama_hparams));
+
+    // REVIEW
+    if (session_hparams != ctx->model.hparams) {
+        fprintf(stderr, "%s : model hparams didn't match from session file!\n", __func__);
+        return 0;
+    }
+
+    // the token count comes from the file, so reject an oversized value
+    // instead of aborting the whole process
+    const uint32_t n_token_count = file.read_u32();
+    if (n_token_count > n_token_capacity) {
+        fprintf(stderr, "%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
+        return 0;
+    }
+    file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
+    *n_token_count_out = n_token_count;
+
+    // llama_set_state_data() consumes exactly llama_get_state_size(ctx) bytes,
+    // so a blob of any other size must not be handed to it
+    const size_t n_state_size      = file.size - file.tell();
+    const size_t n_orig_state_size = llama_get_state_size(ctx);
+    if (n_state_size != n_orig_state_size) {
+        fprintf(stderr, "%s : failed to validate state size\n", __func__);
+        return 0;
+    }
+
+    std::unique_ptr<uint8_t[]> state_data(new uint8_t[n_state_size]);
+    file.read_raw(state_data.get(), n_state_size);
+    return llama_set_state_data(ctx, state_data.get());
+}
+
+// Write the current context state plus the given tokens to path_session.
+// Layout: u32 magic 'ggsn', u32 version 0, raw llama_hparams, u32 token count,
+// the tokens, then the state produced by llama_copy_state_data().
+// Returns the size in bytes of the state section only.
+size_t llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
+    // TODO save temp & swap
+    llama_file file(path_session, "wb");
+
+    // serialize the context into a scratch buffer first
+    const size_t state_bytes = llama_get_state_size(ctx);
+    std::unique_ptr<uint8_t[]> state_buf(new uint8_t[state_bytes]);
+    llama_copy_state_data(ctx, state_buf.get());
+
+    // header
+    const uint32_t magic   = 'ggsn';
+    const uint32_t version = 0;
+    file.write_u32(magic);
+    file.write_u32(version);
+    file.write_raw(&ctx->model.hparams, sizeof(llama_hparams));
+
+    // tokens; REVIEW: the count is narrowed to 32 bits here
+    const uint32_t n_tokens_u32 = (uint32_t) n_token_count;
+    file.write_u32(n_tokens_u32);
+    file.write_raw(tokens, sizeof(llama_token) * n_token_count);
+
+    // state
+    file.write_raw(state_buf.get(), state_bytes);
+    return state_bytes; // REVIEW
+}
--- a/llama.h
+++ b/llama.h
@@ -73,7 +73,10 @@ extern "C" {
        LLAMA_FTYPE_MOSTLY_Q4_1 = 3,  // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
        LLAMA_FTYPE_MOSTLY_Q4_2 = 5,  // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_3 = 6,  // except 1d tensors
+        // LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed
+        LLAMA_FTYPE_MOSTLY_Q8_0 = 7,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_0 = 8,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_1 = 9,  // except 1d tensors
    };

    LLAMA_API struct llama_context_params llama_context_default_params();
@@ -112,22 +115,27 @@ extern "C" {
                      const char * path_base_model,
                             int   n_threads);

-    // Returns the KV cache that will contain the context for the
-    // ongoing prediction with the model.
-    LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);
-
-    // Returns the size of the KV cache
-    LLAMA_API size_t llama_get_kv_cache_size(struct llama_context * ctx);
-
    // Returns the number of tokens in the KV cache
    LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);

-    // Sets the KV cache containing the current context for the model
-    LLAMA_API void llama_set_kv_cache(
-            struct llama_context * ctx,
-                   const uint8_t * kv_cache,
-                          size_t   n_size,
-                             int   n_token_count);
+    // Sets the current rng seed.
+    LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
+
+    // Returns the size in bytes of the state (rng, logits, embedding and kv_cache)
+    LLAMA_API size_t llama_get_state_size(struct llama_context * ctx);
+
+    // Copies the state to the specified destination address.
+    // Destination needs to have allocated enough memory.
+    // Returns the number of bytes copied
+    LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest);
+
+    // Set the state reading from the specified address
+    // Returns the number of bytes read
+    LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);
+
+    // Save/load session file
+    // llama_load_session_file: reads the tokens stored in path_session into tokens_out
+    // (which can hold n_token_capacity tokens), stores their count in *n_token_count_out
+    // and restores the context state. Returns 0 if the file's magic/version or model
+    // hparams do not match, otherwise the number of state bytes read.
+    LLAMA_API size_t llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
+    // llama_save_session_file: writes the given tokens and the current context state to
+    // path_session. Returns the size in bytes of the saved state.
+    LLAMA_API size_t llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);

    // Run the llama inference to obtain the logits and probabilities for the next token.
    // tokens + n_tokens is the provided batch of new tokens to process
--- a/llama_util.h
+++ b/llama_util.h
@@ -21,6 +21,9 @@
        #if defined(_POSIX_MAPPED_FILES)
            #include <sys/mman.h>
        #endif
+        #if defined(_POSIX_MEMLOCK_RANGE)
+            #include <sys/resource.h>
+        #endif
    #endif
 #endif

@@ -303,8 +306,18 @@ struct llama_mlock {
        if (!mlock(addr, size)) {
            return true;
        } else {
-            fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n" MLOCK_SUGGESTION,
-                    size, this->size, std::strerror(errno));
+            char* errmsg = std::strerror(errno);
+            bool suggest = (errno == ENOMEM);
+
+            // Check if the resource limit is fine after all
+            struct rlimit lock_limit;
+            if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit))
+                suggest = false;
+            if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size))
+                suggest = false;
+
+            fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
+                    size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
            return false;
        }
    }
--- a/pocs/vdot/CMakeLists.txt
+++ b/pocs/vdot/CMakeLists.txt
@@ -2,3 +2,8 @@ set(TARGET vdot)
 add_executable(${TARGET} vdot.cpp)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
+
+# q8dot: second executable in this directory, built from q8dot.cpp with the same
+# link libraries and C++11 requirement as vdot above. TARGET is rebound for it.
+set(TARGET q8dot)
+add_executable(${TARGET} q8dot.cpp)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/pocs/vdot/q8dot.cpp
+++ b/pocs/vdot/q8dot.cpp
@@ -0,0 +1,172 @@
+#include <cstdio>
+#include <type_traits>
+#include <vector>
+#include <random>
+#include <chrono>
+#include <cstdlib>
+#include <cmath>
+#include <cassert>
+#include <cstring>
+#include <array>
+#include <type_traits>
+
+#include <ggml.h>
+
+constexpr int kVecSize = 1 << 16;
+
+// Copy-pasted from ggml.c
+// Block of QK4_0 values: one float scale followed by the 4-bit quants, two per byte.
+#define QK4_0 32
+typedef struct {
+    float   d;          // delta
+    uint8_t qs[QK4_0 / 2];  // nibbles / quants
+} block_q4_0;
+// the benchmark hands these structs to ggml as raw memory, so no padding is allowed
+static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding");
+
+// Block of QK4_1 values: float scale and float minimum followed by the 4-bit quants, two per byte.
+#define QK4_1 32
+typedef struct {
+    float   d;          // delta
+    float   m;          // min
+    uint8_t qs[QK4_1 / 2];  // nibbles / quants
+} block_q4_1;
+static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
+
+// Copy-pasted from ggml.c
+// Block of QK8_0 values: float scale, the precomputed d * sum(qs), and one signed byte per quant.
+#define QK8_0 32
+typedef struct {
+    float   d;          // delta
+    float   s;          // d * sum(qs[i])
+    int8_t  qs[QK8_0];  // quants
+} block_q8_0;
+static_assert(sizeof(block_q8_0) == 2*sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
+
+// the code below uses the QK4_* / QK8_0 constants interchangeably as loop bounds
+static_assert(QK4_1 == QK8_0, "QK4_1 and QK8_0 must be the same");
+static_assert(QK4_0 == QK8_0, "QK4_0 and QK8_0 must be the same");
+
+// Sets the scale of every block to 1 and fills its quant bytes with two random
+// 4-bit values each (top four bits of two consecutive generator outputs).
+template <typename T>
+void fillQ4blocks(std::vector<T>& blocks, std::mt19937& rndm) {
+    for (T& blk : blocks) {
+        blk.d = 1;
+        for (uint8_t& packed : blk.qs) {
+            const uint8_t lo = rndm() >> 28;
+            const uint8_t hi = rndm() >> 28;
+            packed = lo | (hi << 4);
+        }
+    }
+}
+
+// Sets the scale of every block to 1, fills the quants with random signed bytes
+// and stores d * sum(quants) in the block's s field.
+void fillQ80blocks(std::vector<block_q8_0>& blocks, std::mt19937& rndm) {
+    for (block_q8_0& blk : blocks) {
+        blk.d = 1;
+        int total = 0;
+        for (int8_t& q : blk.qs) {
+            q = (rndm() >> 24) - 128;
+            total += q;
+        }
+        blk.s = blk.d * total;
+    }
+}
+
+// Scalar reference dot product of one q4_0 block with one q8_0 block.
+// Byte k of x.qs carries two 4-bit quants: the low nibble pairs with y.qs[2k],
+// the high nibble with y.qs[2k+1]. The "- 8 * x.d * y.s" term uses the
+// precomputed y.s = d * sum(qs) instead of summing y.qs again.
+float simpleDot(const block_q4_0& x, const block_q8_0& y) {
+    int acc = 0;
+    for (int k = 0; k < QK4_0/2; ++k) {
+        const int lo = x.qs[k] & 0xf;
+        const int hi = x.qs[k] >> 4;
+        acc += lo*y.qs[2*k] + hi*y.qs[2*k + 1];
+    }
+    return y.d * x.d * acc - 8 * x.d * y.s;
+}
+
+// Scalar reference dot product of one q4_1 block with one q8_0 block.
+// Byte k of x.qs carries two 4-bit quants: the low nibble pairs with y.qs[2k],
+// the high nibble with y.qs[2k+1]. The "+ y.s * x.m" term uses the
+// precomputed y.s = d * sum(qs) instead of summing y.qs again.
+float simpleDot(const block_q4_1& x, const block_q8_0& y) {
+    int acc = 0;
+    for (int k = 0; k < QK4_1/2; ++k) {
+        const int lo = x.qs[k] & 0xf;
+        const int hi = x.qs[k] >> 4;
+        acc += lo*y.qs[2*k] + hi*y.qs[2*k + 1];
+    }
+    return y.d * x.d * acc + y.s * x.m;
+}
+
+// Accumulates per-iteration results and timings and prints a short summary.
+struct Stat {
+    double sum = 0, sumt = 0, sumt2 = 0, maxt = 0;
+    int nloop = 0;
+    // Record one iteration: s is the computed value, t the elapsed time.
+    void addResult(double s, double t) {
+        sum += s;
+        sumt += t; sumt2 += t*t; maxt = std::max(maxt, t);
+        ++nloop;
+    }
+    // Print the mean value, the mean time with its spread, and the maximum time.
+    void reportResult(const char* title) const {
+        if (nloop < 1) {
+            printf("%s(%s): no result\n",__func__,title);
+            return;
+        }
+        printf("============ %s\n",title);
+        printf("<dot> = %g\n",sum/nloop);
+        const double t   = sumt/nloop;
+        const double var = sumt2/nloop - t*t;
+        // rounding can push the variance slightly below zero (e.g. identical timings);
+        // report a spread of 0 in that case instead of printing a negative number
+        const double dt  = var > 0 ? std::sqrt(var) : 0.0;
+        printf("<time> = %g +/- %g us. Max. time = %g us.\n",t,dt,maxt);
+    }
+};
+
+
+// Benchmark driver: times the scalar simpleDot() reference against the vec_dot_q
+// function returned by ggml for the selected type, on kVecSize random blocks.
+// Arguments: argv[1] = number of iterations (default 10),
+//            argv[2] = type (default 1); 0 selects Q4_0, any other value Q4_1.
+int main(int argc, char** argv) {
+
+    int nloop = argc > 1 ? atoi(argv[1]) : 10;
+    int type  = argc > 2 ? atoi(argv[2]) : 1;
+
+    // fixed seed, so every run uses the same data
+    std::mt19937 rndm(1234);
+
+    // only the vector for the selected type is populated
+    std::vector<block_q4_1> x41;
+    std::vector<block_q4_0> x40;
+    std::vector<block_q8_0> y(kVecSize);
+    if (type == 0) x40.resize(kVecSize);
+    else {
+        x41.resize(kVecSize);
+        for (auto& b : x41) b.m = 1;
+    }
+
+    auto ggml_type = type == 0 ? GGML_TYPE_Q4_0 : GGML_TYPE_Q4_1;
+
+    auto funcs = ggml_internal_get_quantize_fn(ggml_type);
+
+    Stat simple, ggml;
+
+    for (int iloop=0; iloop<nloop; ++iloop) {
+
+        // fresh random data for every iteration
+        if (type == 0) fillQ4blocks(x40, rndm);
+        else fillQ4blocks(x41, rndm);
+        fillQ80blocks(y, rndm);
+
+        // scalar reference; t is in microseconds (nanoseconds * 1e-3)
+        auto t1 = std::chrono::high_resolution_clock::now();
+        double s = 0;
+        if (type == 0) for (int i=0; i<kVecSize; ++i) s += simpleDot(x40[i], y[i]);
+        else for (int i=0; i<kVecSize; ++i) s += simpleDot(x41[i], y[i]);
+        auto t2 = std::chrono::high_resolution_clock::now();
+        auto t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count();
+        // the first four iterations (iloop 0..3) are not recorded
+        if (iloop > 3) simple.addResult(s, t);
+
+        // ggml implementation over the same data; the first argument is the total number of values
+        t1 = std::chrono::high_resolution_clock::now();
+        float fs;
+        if (type == 0) funcs.vec_dot_q(kVecSize * QK4_1, &fs, x40.data(), y.data());
+        else funcs.vec_dot_q(kVecSize * QK4_1, &fs, x41.data(), y.data());
+        t2 = std::chrono::high_resolution_clock::now();
+        t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count();
+        if (iloop > 3) ggml.addResult(fs, t);
+
+    }
+
+    // Report the time (and the average of the dot products so the compiler does not come up with the idea
+    // of optimizing away the function calls after figuring that the result is not used).
+    simple.reportResult("Simple");
+    ggml.reportResult("ggml");
+    return 0;
+}
--- a/scripts/sync-ggml.sh
+++ b/scripts/sync-ggml.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+# Copies the ggml sources from a sibling checkout at ../ggml over the local copies.
+# All paths are relative to the current directory; presumably this is meant to be
+# run from the repository root (TODO confirm).
+cp -rpv ../ggml/src/ggml.c          ./ggml.c
+cp -rpv ../ggml/src/ggml-cuda.cu    ./ggml-cuda.cu
+cp -rpv ../ggml/src/ggml-cuda.h     ./ggml-cuda.h
+cp -rpv ../ggml/include/ggml/ggml.h ./ggml.h
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -6,5 +6,6 @@ function(llama_add_test source)
 endfunction()

 # llama_add_test(test-double-float.c) # SLOW
-llama_add_test(test-quantize.c)
+llama_add_test(test-quantize-fns.cpp)
+llama_add_test(test-quantize-perf.cpp)
 llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)
--- a/tests/test-quantize-fns.cpp
+++ b/tests/test-quantize-fns.cpp
@@ -0,0 +1,154 @@
+// Unit tests for quantization specific functions - quantize, dequantize and dot product
+
+#include "ggml.h"
+
+#undef NDEBUG
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <string>
+#include <vector>
+
+
+// Pass/fail limits; a check fails unless its measured error is strictly below the limit.
+// Limit for the error between the optimized and the reference quantization
+const float MAX_QUANTIZATION_REFERENCE_ERROR = 0.0001;
+// Limit for the quantize -> dequantize round-trip error
+const float MAX_QUANTIZATION_TOTAL_ERROR = 0.002;
+// Limit for the quantized dot product error (absolute difference divided by the test size)
+const float MAX_DOT_PRODUCT_ERROR = 0.02;
+
+// Indexed by the boolean "failed" flag when printing results
+const char* RESULT_STR[] = {"ok", "FAILED"};
+
+
+// Fill dst[0..n) with the deterministic sequence 0.1 + 2*cos(i + offset)
+void generate_data(float offset, size_t n, float * dst) {
+    size_t idx = 0;
+    while (idx < n) {
+        dst[idx] = 0.1 + 2*cosf(idx + offset);
+        ++idx;
+    }
+}
+
+// Error metric between two float arrays: sqrt(sum of squared differences) / n.
+// Note that the root is divided by n (the sum is not), so the value is smaller
+// than the textbook RMSE by a factor of sqrt(n).
+float array_rmse(const float * a1, const float * a2, size_t n) {
+    double sq_sum = 0;
+    const float * lhs = a1;
+    const float * rhs = a2;
+    for (size_t left = n; left > 0; --left, ++lhs, ++rhs) {
+        const double delta = *lhs - *rhs;
+        sq_sum += delta * delta;
+    }
+    return sqrtf(sq_sum) / n;
+}
+
+// Round-trip error: quantize the test data, dequantize it again and compare
+// the result with the original values
+float total_quantization_error(quantize_fns_t & qfns, size_t test_size, const float * test_data) {
+    std::vector<uint8_t> quantized(2*test_size);
+    std::vector<float> roundtrip(test_size);
+
+    uint8_t * q_buf   = quantized.data();
+    float   * out_buf = roundtrip.data();
+
+    qfns.quantize_row_q(test_data, q_buf, test_size);
+    qfns.dequantize_row_q(q_buf, out_buf, test_size);
+
+    return array_rmse(test_data, out_buf, test_size);
+}
+
+// Difference between the regular and the reference quantization: quantize the
+// same data with both, dequantize each result and compare the two outputs
+float reference_quantization_error(quantize_fns_t & qfns, size_t test_size, const float * test_data) {
+    std::vector<uint8_t> quantized(2*test_size);
+    std::vector<float> out_fast(test_size);
+    std::vector<float> out_reference(test_size);
+
+    uint8_t * q_buf = quantized.data();
+
+    // regular implementation
+    qfns.quantize_row_q(test_data, q_buf, test_size);
+    qfns.dequantize_row_q(q_buf, out_fast.data(), test_size);
+
+    // reference implementation, reusing the same scratch buffer
+    qfns.quantize_row_q_reference(test_data, q_buf, test_size);
+    qfns.dequantize_row_q(q_buf, out_reference.data(), test_size);
+
+    return array_rmse(out_fast.data(), out_reference.data(), test_size);
+}
+
+// Plain dot product of two float arrays; each product is computed in float and
+// accumulated in double
+float dot_product(const float * a1, const float * a2, size_t test_size) {
+    double acc = 0;
+    size_t idx = 0;
+    while (idx < test_size) {
+        acc += a1[idx] * a2[idx];
+        ++idx;
+    }
+    return acc;
+}
+
+// Dot product error: absolute difference between the quantized dot product and
+// the float dot product, divided by the number of values.
+// The first operand is quantized with quantize_row_q, the second with quantize_row_q_dot.
+float dot_product_error(quantize_fns_t & qfns, size_t test_size, const float * test_data1, const float *test_data2) {
+    std::vector<uint8_t> q_lhs(2*test_size);
+    std::vector<uint8_t> q_rhs(2*test_size);
+
+    qfns.quantize_row_q    (test_data1, q_lhs.data(), test_size);
+    qfns.quantize_row_q_dot(test_data2, q_rhs.data(), test_size);
+
+    // start from INFINITY so an untouched result cannot pass the check
+    float quantized_dot = INFINITY;
+    qfns.vec_dot_q(test_size, &quantized_dot, q_lhs.data(), q_rhs.data());
+
+    const float float_dot = dot_product(test_data1, test_data2, test_size);
+    const float abs_diff  = fabsf(quantized_dot - float_dot);
+
+    return abs_diff / test_size;
+}
+
+// Runs the quantization checks for every ggml type that provides both quantize_row_q
+// and dequantize_row_q. The only accepted argument is "-v" (print every result, not
+// just failures). The exit code is 1 if any check failed or an argument is unknown, else 0.
+int main(int argc, char * argv[]) {
+    bool verbose = false;
+    const size_t test_size = 32 * 128;
+
+    std::string arg;
+    for (int i = 1; i < argc; i++) {
+        arg = argv[i];
+
+        if (arg == "-v") {
+            verbose = true;
+        } else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            return 1;
+        }
+    }
+
+    std::vector<float> test_data(test_size);
+    std::vector<float> test_data2(test_size);
+
+    // two deterministic inputs that differ only by the phase offset
+    generate_data(0.0, test_data.size(), test_data.data());
+    generate_data(1.0, test_data2.size(), test_data2.data());
+
+    // Initialize GGML, ensures float conversion tables are initialized
+    struct ggml_init_params ggml_params = {
+        /* .mem_size   = */ 1*1024,
+        /* .mem_buffer = */ NULL,
+        /* .no_alloc   = */ true,
+    };
+    struct ggml_context * ctx = ggml_init(ggml_params);
+
+    int num_failed = 0;
+    bool failed = false;
+
+    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
+        ggml_type type = (ggml_type) i;
+        quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
+
+        // types without both a quantize and a dequantize function are skipped
+        if (qfns.quantize_row_q && qfns.dequantize_row_q) {
+            // 1) quantize -> dequantize round trip against the input
+            const float total_error = total_quantization_error(qfns, test_size, test_data.data());
+            failed = !(total_error < MAX_QUANTIZATION_TOTAL_ERROR);
+            num_failed += failed;
+            if (failed || verbose) {
+                printf("%5s absolute quantization error:    %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], total_error);
+            }
+
+            // 2) regular quantization against the reference quantization
+            const float reference_error = reference_quantization_error(qfns, test_size, test_data.data());
+            failed = !(reference_error < MAX_QUANTIZATION_REFERENCE_ERROR);
+            num_failed += failed;
+            if (failed || verbose) {
+                printf("%5s reference implementation error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], reference_error);
+            }
+
+            // 3) quantized dot product against the float dot product
+            const float vec_dot_error = dot_product_error(qfns, test_size, test_data.data(), test_data2.data());
+            failed = !(vec_dot_error < MAX_DOT_PRODUCT_ERROR);
+            num_failed += failed;
+            if (failed || verbose) {
+                printf("%5s dot product error:              %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], vec_dot_error);
+            }
+        }
+    }
+
+    if (num_failed || verbose) {
+        printf("%d tests failed\n", num_failed);
+    }
+
+    ggml_free(ctx);
+
+    return num_failed > 0;
+}
--- a/tests/test-quantize-perf.cpp
+++ b/tests/test-quantize-perf.cpp
@@ -0,0 +1,310 @@
+// Benchmark quantization specific functions on synthetic data
+
+#include "ggml.h"
+
+#undef NDEBUG
+#include <algorithm>
+#include <assert.h>
+#include <functional>
+#include <inttypes.h>
+#include <math.h>
+#include <memory>
+#include <stdio.h>
+#include <string>
+#include <vector>
+
+// Alignment of the test buffers and upper bound for --alignment-offset
+#define MAX_ALIGNMENT 64
+// Number of values the per-QK cycle counts are normalized to
+#define QK 32
+// Untimed and timed repetitions per benchmark
+#define WARMUP 5
+#define ITERATIONS 10
+
+// Preset test sizes (number of float values) selected by the -3 / -4 options.
+// Parenthesized so the macros expand correctly inside larger expressions.
+#define L1_SIZE      (32*128)
+#define L2_SIZE     (32*2048)
+#define L3_SIZE    (32*20480)
+#define MEM_SIZE (32*2048000)
+
+// Options collected from the command line
+struct quantize_perf_params {
+    // type names to benchmark; when empty, no type is filtered out
+    std::vector<std::string> include_types;
+    // numbers of float values to benchmark with
+    std::vector<size_t> test_sizes;
+    // offset added to the aligned start of every test buffer
+    size_t alignment_offset {0};
+    // operations selected for benchmarking
+    bool op_quantize_row_q_reference {false};
+    bool op_quantize_row_q {false};
+    bool op_dequantize_row_q {false};
+    bool op_quantize_row_q_dot {false};
+    bool op_vec_dot_q {false};
+};
+
+
+// cpu_cycles(): cycle counter used for the cycles-per-value statistics.
+// On x86 it reads the time stamp counter; on every other architecture it is a
+// macro that evaluates to 0, so the reported cycle counts are 0 there.
+#if defined(__x86_64__) || defined(__i386__)
+
+#include <x86intrin.h>
+inline int64_t cpu_cycles() {
+// Rough way to detect new-ish CPUs
+#ifdef __POPCNT__
+    // __rdtscp also writes a value through its pointer argument; it is not used here
+    unsigned int dummy;
+    return __rdtscp(&dummy);
+#else
+    return __rdtsc();
+#endif
+}
+
+#else
+
+#define cpu_cycles() 0
+
+#endif
+
+
+// Fill dst[0..n) with the deterministic sequence 0.1 + 2*cos(i + offset)
+void generate_data(float offset, size_t n, float * dst) {
+    size_t idx = 0;
+    for (float * out = dst; idx < n; ++out, ++idx) {
+        *out = 0.1 + 2*cosf(idx + offset);
+    }
+}
+
+// Convert "bytes processed in usecs microseconds" to GiB per second
+float gigabytes_per_second(size_t bytes, int64_t usecs) {
+    const float bytes_per_usec = bytes / (float) usecs;
+    const float bytes_per_sec  = bytes_per_usec * 1000000;
+    return bytes_per_sec / (1024*1024*1024);
+}
+
+// Round ptr up to a MAX_ALIGNMENT boundary (via std::align) and then add offset bytes
+void * align_with_offset(void * ptr, int offset) {
+    size_t space = MAX_ALIGNMENT * 4;
+    void * aligned = std::align(MAX_ALIGNMENT, MAX_ALIGNMENT, ptr, space);
+    return static_cast<char *>(aligned) + offset;
+}
+
+// Calls `function` WARMUP times untimed, then ITERATIONS times while measuring the
+// elapsed time (ggml_time_us) and cycles (cpu_cycles), and prints the min/avg cycles
+// per QK values plus the float32 and quantized throughput.
+//   size     - number of float values processed per call
+//   q_size   - size in bytes of the quantized data processed per call
+//   function - the operation to measure; its return value is ignored
+// The callable is typed as returning float because the callers' lambdas return float;
+// declaring it as returning size_t would convert an arbitrary float to size_t on every
+// call, which is undefined for negative, NaN or very large values.
+void benchmark_function(size_t size, size_t q_size, std::function<float(void)> function) {
+    int64_t min_time_us = INT64_MAX;
+    int64_t total_time_us = 0;
+    int64_t min_time_cycles = INT64_MAX;
+    int64_t total_time_cycles = 0;
+
+    for (int i = 0; i < WARMUP; i++) {
+        function();
+    }
+
+
+    for (int i = 0; i < ITERATIONS; i++) {
+        // the cycle counter is read innermost, the wall clock outermost
+        const int64_t start_time = ggml_time_us();
+        const int64_t start_cycles = cpu_cycles();
+
+        function();
+
+        const int64_t end_cycles = cpu_cycles();
+        const int64_t end_time = ggml_time_us();
+
+        total_time_cycles += end_cycles - start_cycles;
+        min_time_cycles = std::min(min_time_cycles, end_cycles - start_cycles);
+        total_time_us += end_time - start_time;
+        min_time_us = std::min(min_time_us, end_time - start_time);
+    }
+
+    printf("      min cycles/%d vals   : %9.2f\n",  QK, QK * min_time_cycles / (float) size);
+    printf("      avg cycles/%d vals   : %9.2f\n",  QK, QK * total_time_cycles / (float) (size * ITERATIONS));
+    printf("      float32 throughput   : %9.2f GB/s\n",  gigabytes_per_second(4 * size * ITERATIONS, total_time_us));
+    printf("      quantized throughput : %9.2f GB/s\n",  gigabytes_per_second(q_size * ITERATIONS, total_time_us));
+}
+
+// Benchmarks the quantization functions of every ggml type that provides both
+// quantize_row_q and dequantize_row_q.
+// Options:
+//   --size N              add a test size (number of floats, must be divisible by 32)
+//   -3 / -4               add the preset L1/L2/L3 sizes (-4 also adds MEM_SIZE)
+//   --op NAME             benchmark only the named operation(s); default is all
+//   --type NAME           benchmark only the named type(s); default is all
+//   --alignment-offset N  offset added to the aligned buffers (0..MAX_ALIGNMENT)
+// Returns 1 on an unknown or invalid argument, 0 otherwise.
+int main(int argc, char * argv[]) {
+    quantize_perf_params params {};
+
+    // read command line
+
+    bool invalid_param = false;
+    std::string arg;
+    for (int i = 1; i < argc; i++) {
+        arg = argv[i];
+
+        if (arg == "--size") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            size_t size = std::stoi(argv[i]);
+            if (size % 32 != 0) {
+                fprintf(stderr, "error: size %zu not divisible by 32\n", size);
+                invalid_param = true;
+                break;
+            }
+            params.test_sizes.push_back(size);
+        } else if (arg == "-3") {
+            // quick select sizes that probably fit in CPU caches
+            params.test_sizes.push_back(L1_SIZE);
+            params.test_sizes.push_back(L2_SIZE);
+            params.test_sizes.push_back(L3_SIZE);
+        } else if (arg == "-4") {
+            // quick select cache sizes + memory
+            params.test_sizes.push_back(L1_SIZE);
+            params.test_sizes.push_back(L2_SIZE);
+            params.test_sizes.push_back(L3_SIZE);
+            params.test_sizes.push_back(MEM_SIZE);
+        } else if (arg == "--op") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            std::string op {argv[i]};
+            if (op == "quantize_row_q_reference") {
+                params.op_quantize_row_q_reference = true;
+            } else if (op == "quantize_row_q") {
+                params.op_quantize_row_q = true;
+            } else if (op == "dequantize_row_q") {
+                params.op_dequantize_row_q = true;
+            } else if (op == "quantize_row_q_dot") {
+                params.op_quantize_row_q_dot = true;
+            } else if (op == "vec_dot_q") {
+                params.op_vec_dot_q = true;
+            } else {
+                invalid_param = true;
+                break;
+            }
+        } else if (arg == "--type") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.include_types.push_back(argv[i]);
+        } else if (arg == "--alignment-offset") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            int alignment = std::stoi(argv[i]);
+            if (alignment < 0 || alignment > MAX_ALIGNMENT) {
+                // the message states the range the check above actually accepts
+                fprintf(stderr, "error: alignment-offset must be between 0 and %d\n", MAX_ALIGNMENT);
+                invalid_param = true;
+                break;
+            }
+            params.alignment_offset = alignment;
+        } else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            return 1;
+        }
+    }
+    if (invalid_param) {
+        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+        return 1;
+    }
+
+    // defaults: a single L1-sized test, all operations
+    if (params.test_sizes.empty()) {
+        params.test_sizes.push_back(L1_SIZE);
+    }
+    if (!(params.op_quantize_row_q_reference || params.op_quantize_row_q || params.op_dequantize_row_q || params.op_quantize_row_q_dot || params.op_vec_dot_q)) {
+        params.op_quantize_row_q_reference = params.op_quantize_row_q = params.op_dequantize_row_q = params.op_quantize_row_q_dot = params.op_vec_dot_q = true;
+    }
+
+    std::sort(params.test_sizes.begin(), params.test_sizes.end());
+    size_t largest = params.test_sizes.back();
+
+    // every buffer holds `largest` floats plus slack for the alignment and the offset
+    std::vector<uint8_t> test_data1_v(largest*4 + MAX_ALIGNMENT*2);
+    std::vector<uint8_t> test_data2_v(largest*4 + MAX_ALIGNMENT*2);
+    std::vector<uint8_t> test_q1_v(largest*4 + MAX_ALIGNMENT*2);
+    std::vector<uint8_t> test_q2_v(largest*4 + MAX_ALIGNMENT*2);
+    std::vector<uint8_t> test_out_v(largest*4 + MAX_ALIGNMENT*2);
+
+    float * test_data1 = (float *) align_with_offset(test_data1_v.data(), params.alignment_offset);
+    float * test_data2 = (float *) align_with_offset(test_data2_v.data(), params.alignment_offset);
+    float * test_q1 = (float *) align_with_offset(test_q1_v.data(), params.alignment_offset);
+    float * test_q2 = (float *) align_with_offset(test_q2_v.data(), params.alignment_offset);
+    float * test_out = (float *) align_with_offset(test_out_v.data(), params.alignment_offset);
+
+    generate_data(0, largest, test_data1);
+    generate_data(1, largest, test_data2);
+
+
+    // Initialize GGML, ensures float conversion tables are initialized
+    struct ggml_init_params ggml_params = {
+        /* .mem_size   = */ 1*1024,
+        /* .mem_buffer = */ NULL,
+        /* .no_alloc   = */ true,
+    };
+    struct ggml_context * ctx = ggml_init(ggml_params);
+
+    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
+        ggml_type type = (ggml_type) i;
+        quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
+        if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == params.include_types.end()) {
+            continue;
+        }
+
+        if (qfns.quantize_row_q && qfns.dequantize_row_q) {
+            printf("%s\n", ggml_type_name(type));
+
+            if (params.op_quantize_row_q_reference) {
+                printf("  quantize_row_q_reference\n");
+                for (size_t size : params.test_sizes) {
+                    printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
+                    auto quantize_fn = [&](void ) {
+                        qfns.quantize_row_q_reference(test_data1, test_q1, size);
+                        return test_q1[0];
+                    };
+                    size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
+                    benchmark_function(size, quantized_size, quantize_fn);
+                }
+                printf("\n");
+            }
+
+            if (params.op_quantize_row_q) {
+                printf("  quantize_row_q\n");
+                for (size_t size : params.test_sizes) {
+                    printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
+                    auto quantize_fn = [&](void ) {
+                        qfns.quantize_row_q(test_data1, test_q1, size);
+                        return test_q1[0];
+                    };
+                    size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
+                    benchmark_function(size, quantized_size, quantize_fn);
+                }
+                printf("\n");
+            }
+
+            if (params.op_dequantize_row_q) {
+                printf("  dequantize_row_q\n");
+                // prepare quantized input once, outside the timed region
+                qfns.quantize_row_q(test_data1, test_q1, largest);
+                for (size_t size : params.test_sizes) {
+                    printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
+                    auto quantize_fn = [&](void ) {
+                        qfns.dequantize_row_q(test_q1, test_out, size);
+                        return test_out[0];
+                    };
+                    size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
+                    benchmark_function(size, quantized_size, quantize_fn);
+                }
+                printf("\n");
+            }
+
+            if (params.op_quantize_row_q_dot) {
+                printf("  quantize_row_q_dot\n");
+                for (size_t size : params.test_sizes) {
+                    printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
+                    auto quantize_fn = [&](void ) {
+                        qfns.quantize_row_q_dot(test_data1, test_q1, size);
+                        return test_q1[0];
+                    };
+                    size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
+                    benchmark_function(size, quantized_size, quantize_fn);
+                }
+                printf("\n");
+            }
+
+            if (params.op_vec_dot_q) {
+                printf("  vec_dot_q\n");
+                // prepare both operands once, outside the timed region; the second operand
+                // is quantized with quantize_row_q_dot, the same pairing the quantization
+                // unit test uses when it calls vec_dot_q
+                qfns.quantize_row_q(test_data1, test_q1, largest);
+                qfns.quantize_row_q_dot(test_data2, test_q2, largest);
+                for (size_t size : params.test_sizes) {
+                    printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
+                    auto quantize_fn = [&](void ) {
+                        float result;
+                        qfns.vec_dot_q(size, &result, test_q1, test_q2);
+                        return result;
+                    };
+                    size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
+                    benchmark_function(size, quantized_size, quantize_fn);
+                }
+                printf("\n");
+            }
+        }
+    }
+
+    ggml_free(ctx);
+
+    return 0;
+}
--- a/tests/test-quantize.c
+++ b/tests/test-quantize.c
@@ -1,42 +0,0 @@
-#include "ggml.h"
-#undef NDEBUG
-#include <assert.h>
-#include <math.h>
-
-int main(void) {
-    #define QK 32
-    float src[QK];
-    uint8_t dst[24];
-    int64_t hist[16];
-
-    for (int i = 0; i < QK; i++) {
-        src[i] = (float)(i + 1);
-    }
-
-    size_t size = ggml_quantize_q4_0(src, dst, QK, QK, hist);
-    assert(size == 20);
-    float max_result = ((float *)dst)[0];
-    float max_expected = src[31] / ((1 << 3) - 1);
-    assert(max_result == max_expected);
-    for (int i = 0; i < QK; i++) {
-        uint8_t q4_result = (i % 2) ? (dst[sizeof(float) + i/2] >> 4) : (dst[sizeof(float) + i/2] & 0xF);
-        uint8_t q4_expected = roundf(src[i] / max_expected) + 8;
-        assert(q4_result == q4_expected);
-    }
-
-    size = ggml_quantize_q4_1(src, dst, QK, QK, hist);
-    assert(size == 24);
-    float delta_result = ((float *)dst)[0];
-    float delta_expected = (src[31] - src[0]) / ((1 << 4) - 1);
-    assert(delta_result == delta_expected);
-    float min_result = ((float *)dst)[1];
-    float min_expected = src[0];
-    assert(min_result == min_expected);
-    for (int i = 0; i < QK; i++) {
-        uint8_t q4_result = (i % 2) ? (dst[sizeof(float)*2 + i/2] >> 4) : (dst[sizeof(float)*2 + i/2] & 0xF);
-        uint8_t q4_expected = roundf((src[i] - min_expected) / delta_expected);
-        assert(q4_result == q4_expected);
-    }
-
-    return 0;
-}
Author	SHA1	Message	Date
Stephan Walter	36d19a603b	Remove Q4_3 which is no better than Q5 (#1218 )	2023-04-28 23:10:43 +00:00
Georgi Gerganov	7f15c5c477	readme : update hot topics	2023-04-28 21:32:52 +03:00
Georgi Gerganov	55390bcaf2	ggml : sync ggml (ggml_alibi)	2023-04-28 20:51:05 +03:00
CRD716	5fba3c016b	examples : add Jeopardy example (#1168 ) * Basic Setup * Prevent Results.txt from coming up * Prefixes, Line separators, etc * editorcheck * introduction to give more consistent results * Basic graph thing * Grading, ready for testing! * Y'all ready to get funky? * fix column removal stuff * missed a few	2023-04-28 19:13:33 +03:00
Evan Jones	1481a9cf25	llama : add session file format and saved sessions in main (#1169 )	2023-04-28 18:59:37 +03:00
Georgi Gerganov	11d902364b	ggml : add helper debug printf in soft_max	2023-04-28 17:59:08 +03:00
0cc4m	7296c961d9	ggml : add CLBlast support (#1164 ) * Allow use of OpenCL GPU-based BLAS using ClBlast instead of OpenBLAS for context processing * Improve ClBlast implementation, avoid recreating buffers, remove redundant transfers * Finish merge of ClBlast support * Move CLBlast implementation to separate file Add buffer reuse code (adapted from slaren's cuda implementation) * Add q4_2 and q4_3 CLBlast support, improve code * Double CLBlast speed by disabling OpenBLAS thread workaround Co-authored-by: Concedo <39025047+LostRuins@users.noreply.github.com> Co-authored-by: slaren <2141330+slaren@users.noreply.github.com> * Fix device selection env variable names * Fix cast in opencl kernels * Add CLBlast to CMakeLists.txt * Replace buffer pool with static buffers a, b, qb, c Fix compile warnings * Fix typos, use GGML_TYPE defines, improve code * Improve btype dequant kernel selection code, add error if type is unsupported * Improve code quality * Move internal stuff out of header * Use internal enums instead of CLBlast enums * Remove leftover C++ includes and defines * Make event use easier to read Co-authored-by: Henri Vasserman <henv@hot.ee> * Use c compiler for opencl files * Simplify code, fix include * First check error, then release event * Make globals static, fix indentation * Rename dequant kernels file to conform with other file names * Fix import cl file name --------- Co-authored-by: Concedo <39025047+LostRuins@users.noreply.github.com> Co-authored-by: slaren <2141330+slaren@users.noreply.github.com> Co-authored-by: Henri Vasserman <henv@hot.ee> Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2023-04-28 17:57:16 +03:00
Folko-Ven	78ec543733	Correcting link to w64devkit (#1214 ) Correcting link to w64devkit (change seeto to skeeto).	2023-04-28 16:22:48 +02:00
Johannes Gäßler	92a6e13a31	Add Manjaro CUDA include and lib dirs to Makefile (#1212 )	2023-04-28 15:40:32 +02:00
Yann Follet	04aaae1d79	add avx2 for dot_q8_0_q8_0, 2x faster than scalar (#1211 )	2023-04-28 11:59:48 +00:00
Stephan Walter	0b2da20538	ggml : slightly faster AVX2 implementation for Q5 (#1197 )	2023-04-26 23:26:42 +03:00
Georgi Gerganov	f9be42add0	readme : add quantization info	2023-04-26 23:24:42 +03:00
Georgi Gerganov	574406dc7e	ggml : add Q5_0 and Q5_1 quantization (#1187 ) * ggml : add Q5_0 quantization (cuBLAS only) * ggml : fix Q5_0 qh -> uint32_t * ggml : fix q5_0 histogram stats * ggml : q5_0 scalar dot product * ggml : q5_0 ARM NEON dot * ggml : q5_0 more efficient ARM NEON using uint64_t masks * ggml : rename Q5_0 -> Q5_1 * ggml : adding Q5_0 mode * quantize : add Q5_0 and Q5_1 to map * ggml : AVX2 optimizations for Q5_0, Q5_1 (#1195) --------- Co-authored-by: Stephan Walter <stephan@walter.name>	2023-04-26 23:14:13 +03:00
Ásgeir Bjarni Ingvarsson	87a6f846d3	Allow setting the rng seed after initialization. (#1184 ) The llama_set_state_data function restores the rng state to what it was at the time llama_copy_state_data was called. But users may want to restore the state and proceed with a different seed.	2023-04-26 22:08:43 +02:00
DaniAndTheWeb	ea3ad7eb60	Updating build instructions to include BLAS support (#1183 ) * Updated build information First update to the build instructions to include BLAS. * Update README.md * Update information about BLAS * Better BLAS explanation Adding a clearer BLAS explanation and adding a link to download the CUDA toolkit. * Better BLAS explanation * BLAS for Mac Specifying that BLAS is already supported on Macs using the Accelerate Framework. * Clarify the effect of BLAS * Windows Make instructions Added the instructions to build with Make on Windows * Fixing typo * Fix trailing whitespace	2023-04-26 22:03:03 +02:00
Pavol Rusnak	859fee6dfb	quantize : use `map` to assign quantization type from `string` (#1191 ) instead of `int` (while `int` option still being supported) This allows the following usage: `./quantize ggml-model-f16.bin ggml-model-q4_0.bin q4_0` instead of: `./quantize ggml-model-f16.bin ggml-model-q4_0.bin 2`	2023-04-26 18:43:27 +02:00
Stephan Walter	4afcc37869	Update SHA256SUMS after quantization change (#1181 ) Co-authored-by: Pavol Rusnak <pavol@rusnak.io>	2023-04-25 23:41:56 +02:00
ostix360	667c501334	py : cast lora_alpha to int in convert-lora-to-ggml (#1170 ) Co-authored-by: Pavol Rusnak <pavol@rusnak.io>	2023-04-25 23:33:08 +02:00
Pavol Rusnak	bb98e77be7	nix: use convert.py instead of legacy wrapper convert-pth-to-ggml.py (#981 )	2023-04-25 23:19:57 +02:00
Georgi Gerganov	7a32fcb3b2	ggml : add Q8_0 quantization format (rename the old one to Q8_1) (ARM NEON) (#1179 ) * ggml : add Q8_0 quantization format (rename the old one to Q8_1) * tests : fix test-quantize-fns * ggml : finalize Q8_0 implementation * ggml : use q4_0_q8_0 and q4_2_q8_0 * ggml : fix Q8_0 dot product bug (ARM) * ggml : Q8_0 unroll x2 * ggml : fix bug - using wrong block type * ggml : extend quantize_fns_t with "vec_dot_type" * ggml : fix Q8_0 to use 255 values out of 256 * ggml : fix assert using wrong QK4_2 instead of QK4_3	2023-04-25 23:40:51 +03:00
unbounded	dd0eabc049	ggml : use full range for Q4_0 and Q4_2 quantization (#729 ) * Use full range for q4_0 quantization By keeping the sign of the highest magnitude, we can make sure the highest value maps to -8, which is currently unused. This is a bit of a freebie since it is fully backwards compatible with the current format. * Update quantize_row_q4_0 for AVX/AVX2 * Update quantize_row_q4_0 for WASM Untested * Update quantize_row_q4_0 for Arm NEON * Update quantize_row_q4_0 for PowerPC Untested * Use full range for q4_2 quantization	2023-04-25 20:20:46 +03:00
xaedes	54bb60e268	ggml : fix bug in ggml_compute_forward_sum_f32 (#1162 ) The sum over all rows is now computed instead of just the last row	2023-04-24 23:02:02 +02:00
Georgi Gerganov	8a0f8673ba	ggml : export symbols (#1155 )	2023-04-24 22:18:25 +03:00
xaedes	0c5692345d	examples : add save_load_state example (#1150 ) * add save_load_state example * use <cstdio> instead of <iostream> and fprintf / printf instead of cout * renamed save-load-state example files replacing underscores by dashes	2023-04-24 19:23:31 +03:00
Georgi Gerganov	957c8ae21d	llama : increase scratch buffer size for 65B (ref #1152 ) Temporary solution	2023-04-24 18:47:30 +03:00
mgroeber9110	9b0a4d4214	examples/main README improvements and some light refactoring (#1131 )	2023-04-24 15:45:32 +00:00
Stephan Walter	2ec83428de	Fix build for gcc 8 and test in CI (#1154 )	2023-04-24 15:38:26 +00:00
slaren	e4cf982e0d	Fix cuda compilation (#1128 ) * Fix: Issue with CUBLAS compilation error due to missing -fPIC flag --------- Co-authored-by: B1gM8c <89020353+B1gM8c@users.noreply.github.com>	2023-04-24 17:29:58 +02:00
Georgi Gerganov	c4fe84fb0d	llama : refactor get / set state + remove redundant kv cache API (#1143 )	2023-04-24 07:40:02 +03:00
slaren	1d78fecdab	Fix LoRA acronym (#1145 )	2023-04-23 23:03:44 +02:00
Georgi Gerganov	284685f169	scripts : add helper scripts to synch ggml repo	2023-04-23 19:57:09 +03:00
DannyDaemonic	edce63baa9	Added README.md for main with examples and explanations (#1139 )	2023-04-23 15:37:02 +00:00
Georgi Gerganov	ec9cdb6752	ggml : do not print perf ops that have not been used at all	2023-04-23 18:32:52 +03:00
Georgi Gerganov	e4422e299c	ggml : better PERF prints + support "LLAMA_PERF=1 make"	2023-04-23 18:15:39 +03:00
Stephan Walter	53c8434398	Improve AVX2 for vec_dot_q4_3_q8_0 (#1138 )	2023-04-23 11:01:03 +00:00
Pavol Rusnak	c6524f46eb	readme : update gpt4all instructions (#980 )	2023-04-23 10:21:26 +02:00
Yishuo Wang	c9e2c26f41	A better `packNibbles` and `mul_sum_i8_pairs_float` implementation using AVX512 (#1119 )	2023-04-23 07:57:05 +00:00
Georgi Gerganov	0e018fe008	ggml : fix Q4_3 cuBLAS	2023-04-22 16:32:07 +03:00
Stephan Walter	857308d1e8	ci : trigger CI for drafts, but not most PR actions (#1125 )	2023-04-22 16:12:29 +03:00
Stephan Walter	c50b628810	Fix CI: ARM NEON, quantization unit tests, editorconfig (#1122 )	2023-04-22 10:54:13 +00:00
unbounded	5f939498d5	ggml : unit test for quantization functions (#953 ) * Unit test for quantization functions Use the ggml_internal_get_quantize_fn function to loop through all quantization formats and run a sanity check on the result. Also add a microbenchmark that times these functions directly without running the rest of the GGML graph. * test-quantize-fns: CI fixes Fix issues uncovered in CI - need to use sizes divisible by 32*8 for loop unrolling - use intrinsic header that should work on Mac * test-quantize: remove Per PR comment, subsumed by test-quantize-fns * test-quantize: fix for q8_0 intermediates	2023-04-22 12:10:39 +03:00
wbpxre150	36b4f7e064	llama : print timings on ctrl+c exit (#1021 ) * print timings on ctrl+c exit * remove redundant free memory call. * add global pointer to ctx.	2023-04-22 11:56:35 +03:00
eiery	10f19c1121	llama : have n_batch default to 512 (#1091 ) * set default n_batch to 512 when using BLAS * spacing * alternate implementation of setting different n_batch for BLAS * set n_batch to 512 for all cases	2023-04-22 11:27:05 +03:00
Howard Su	7e312f165c	cmake : fix build under Windows when enable BUILD_SHARED_LIBS (#1100 ) * Fix build under Windows when enable BUILD_SHARED_LIBS * Make AVX512 test on Windows to build the shared libs	2023-04-22 11:18:20 +03:00
Georgi Gerganov	872c365a91	ggml : fix AVX build + update to new Q8_0 format	2023-04-22 11:08:12 +03:00
Georgi Gerganov	955ef9a5d5	ggml : alternative Q4_3 implementation using modified Q8_0 (#1109 ) * ggml : prefer vzip to vuzp This way we always use the same type of instruction across all quantizations * ggml : alternative Q4_3 implementation using modified Q8_0 * ggml : fix Q4_3 scalar imlpementation * ggml : slight improvement of Q4_3 - no need for loop unrolling * ggml : fix AVX paths for Q8_0 quantization	2023-04-22 10:55:35 +03:00
Stephan Walter	c5aa5e5777	ggml : AVX2 optimization for vec_dot_q4_3_q8_0 and refactoring (#1099 ) * AVX2 optimization for vec_dot_q4_3_q8_0 and refactoring * finish AVX vectorization of quantize_row_q8_0 * Rename hsum_int_8 to hsum_i32_8	2023-04-22 10:37:05 +03:00
Clint Herron	e9a9cb0c54	examples : Improve Alpaca Default Repeat Penalty: Better Match Alpaca.cpp Experience (#1107 ) * Moving parameters to separate lines for readability. * Increasing repeate_penalty to 1.1 to make alpaca more usable by default. * Adding trailing newline.	2023-04-22 09:54:33 +03:00
xaedes	b6e7f9b09e	llama : add api for getting/setting the complete state: rng, logits, embedding and kv_cache (#1105 ) * reserve correct size for logits * add functions to get and set the whole llama state: including rng, logits, embedding and kv_cache * remove unused variables * remove trailing whitespace * fix comment	2023-04-22 09:21:32 +03:00
slaren	50cb666b8a	Improve cuBLAS performance by using a memory pool (#1094 ) * Improve cuBLAS performance by using a memory pool * Move cuda specific definitions to ggml-cuda.h/cu * Add CXX flags to nvcc * Change memory pool synchronization mechanism to a spin lock General code cleanup	2023-04-21 21:59:17 +02:00
apaz	25d7abbd1f	llama : fixed rlimit error message (#888 )	2023-04-21 21:48:06 +03:00
源文雨	018f2279f5	cmake : link threads publicly to ggml (#1042 ) * fix: ld link test-tokenizer-0 error ``` cmake3 --build . --config Release [ 5%] Built target ggml [ 16%] Built target llama [ 22%] Linking CXX executable ../bin/test-tokenizer-0 ../libllama.a(ggml.c.o)：在函数‘ggml_graph_compute’中： ggml.c:(.text+0xf2db)：对‘pthread_create’未定义的引用 ggml.c:(.text+0xf9d4)：对‘pthread_join’未定义的引用 collect2: error: ld returned 1 exit status gmake[2]: *** [bin/test-tokenizer-0] 错误 1 gmake[1]: *** [tests/CMakeFiles/test-tokenizer-0.dir/all] 错误 2 gmake: *** [all] 错误 2 ``` * Update CMakeLists.txt * Update CMakeLists.txt * Update CMakeLists.txt	2023-04-21 21:27:06 +03:00
Alex Klinkhamer	9411288271	main : evaluate tokens in batches after swapping context (#1014 ) * examples : evaluate tokens in batches after swapping context * Update examples/main/main.cpp --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2023-04-21 21:18:09 +03:00
xaedes	8687c1f258	llama : remember and restore kv cache data pointers (#1104 ) because their value is stored in buf and overwritten by memcpy	2023-04-21 18:25:21 +03:00
Kawrakow	1bfc153e2f	ggml : a faster version for Q4_1 x Q8_0 dot products (#1083 ) * A faster version for Q4_1 x Q8_0 dot products The idea nehind being that Q8_0 quantized values get used many times in the matrix multiplications where they are involved. In the current implementations, when we are evaluating the dot products, we need to compute the sum of the quants in the Q8_0 vector, so the same operation is repeated many times. Here we pre-compute the sum during Q8_0 quantization, store it in the now modified block_q8_0 struct, and then reuse this result in the subsequent dot products. In a synthetic benchmark (just compute a bunch of dot products), this change speeds up the Q4_1 * Q8_0 dot product by 80%, making the performance identical to Q4_0 * Q8_0. In practical application, I see a ~15% gain in speed for token prediction on M2, and ~5% gain on Ryzen 7950X. The speed gain in the prompt evaluation is much bigger (around 50%). I have only done the change for the scalar version, ARM_NEON, and AVX2, so we still need an AVX implementation. * Cleaning up --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>	2023-04-21 18:18:26 +03:00
slaren	3d59769c3b	Show perplexity ETA in hours and minutes (#1096 )	2023-04-21 14:57:57 +02:00
Georgi Gerganov	d40fded93e	llama : fix comment for "output.weight" tensor	2023-04-21 10:24:02 +03:00
Stephan Walter	2510c1831f	Add ggml-model-*.bin checksums for 7B, 13B, 30B, 65B (#1088 ) Add ggml-model-*.bin checksums for 7B, 13B, 30B Add ggml-model-*.bin checksums for 65B --------- Co-authored-by: Pavol Rusnak <pavol@rusnak.io>	2023-04-20 23:56:44 +02:00
Georgi Gerganov	12b5900dbc	ggml : sync ggml (add GPT-NeoX RoPE implementation)	2023-04-20 23:32:59 +03:00
Georgi Gerganov	9ff334f3c9	ggml : fix bug in ggml_compute_forward_dup_f32()	2023-04-20 21:58:38 +03:00
slaren	2005469ea1	Add Q4_3 support to cuBLAS (#1086 )	2023-04-20 20:49:53 +02:00