Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2026-02-26 14:23:22 +02:00)

Compare commits: master-ec9...master-04a (23 commits)
| SHA1 |
|---|
| 04aaae1d79 |
| 0b2da20538 |
| f9be42add0 |
| 574406dc7e |
| 87a6f846d3 |
| ea3ad7eb60 |
| 859fee6dfb |
| 4afcc37869 |
| 667c501334 |
| bb98e77be7 |
| 7a32fcb3b2 |
| dd0eabc049 |
| 54bb60e268 |
| 8a0f8673ba |
| 0c5692345d |
| 957c8ae21d |
| 9b0a4d4214 |
| 2ec83428de |
| e4cf982e0d |
| c4fe84fb0d |
| 1d78fecdab |
| 284685f169 |
| edce63baa9 |
@@ -23,7 +23,7 @@ elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then
       echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
     else
       echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
-      ./quantize "$i" "${i/f16/q4_0}" 2
+      ./quantize "$i" "${i/f16/q4_0}" q4_0
     fi
   done
 else
.github/workflows/build.yml (vendored, 10 changes)
@@ -19,8 +19,8 @@ env:
   BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
 
 jobs:
-  ubuntu-latest-make:
-    runs-on: ubuntu-latest
+  ubuntu-focal-make:
+    runs-on: ubuntu-20.04
 
     steps:
       - name: Clone
@@ -31,12 +31,12 @@ jobs:
         id: depends
         run: |
           sudo apt-get update
-          sudo apt-get install build-essential
+          sudo apt-get install build-essential gcc-8
 
       - name: Build
         id: make_build
         run: |
-          make
+          CC=gcc-8 make
 
   ubuntu-latest-cmake:
     runs-on: ubuntu-latest
@@ -216,7 +216,7 @@ jobs:
     runs-on: ubuntu-latest
 
     needs:
-      - ubuntu-latest-make
+      - ubuntu-focal-make
       - ubuntu-latest-cmake
       - macOS-latest-make
       - macOS-latest-cmake
.gitignore (vendored, 1 change)
@@ -15,6 +15,7 @@ build-em/
 build-debug/
 build-release/
 build-static/
+build-cublas/
 build-no-accel/
 build-sanitize-addr/
 build-sanitize-thread/
Makefile (4 changes)
@@ -109,9 +109,9 @@ ifdef LLAMA_CUBLAS
 	LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64
 	OBJS += ggml-cuda.o
 	NVCC = nvcc
-	NVCCFLAGS = --forward-unknown-to-host-linker -arch=native
+	NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
-	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -c $< -o $@
+	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
 endif
 ifdef LLAMA_GPROF
 	CFLAGS += -pg
README.md (125 changes)
@@ -7,31 +7,27 @@
 
 Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 
-**Warnings**
-
-- `Q4_2` and `Q4_3` are still in development. Do not expect any kind of backward compatibility until they are finalized
-
 **Hot topics:**
 
+- [New quantization methods](https://github.com/ggerganov/llama.cpp#quantization)
 - [Added LoRA support](https://github.com/ggerganov/llama.cpp/pull/820)
 - [Add GPU support to ggml](https://github.com/ggerganov/llama.cpp/discussions/915)
 - [Roadmap Apr 2023](https://github.com/ggerganov/llama.cpp/discussions/784)
 
 ## Description
 
-The main goal of llama.cpp is to run the llama model using 4-bit quantization on a MacBook.
+The main goal of `llama.cpp` is to run the LLaMA model using 4-bit integer quantization on a MacBook
 
 - Plain C/C++ implementation without dependencies
 - Apple silicon first-class citizen - optimized via ARM NEON and Accelerate framework
 - AVX2 support for x86 architectures
 - Mixed F16 / F32 precision
-- 4-bit quantization support
+- 4-bit integer quantization support
 - Runs on the CPU
 
-This was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022) - I have no idea if it works correctly.
-Please do not make conclusions about the models based on the results from this implementation.
-For all I know, it can be completely wrong. This project is for educational purposes.
-New features will probably be added mostly through community contributions.
+The original implementation of `llama.cpp` was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022).
+Since then, the project has improved significantly thanks to many contributions. This project is for educational purposes and serves
+as the main playground for developing new features for the [ggml](https://github.com/ggerganov/ggml) library.
 
 **Supported platforms:**
 
@@ -167,15 +163,27 @@ cd llama.cpp
 
 ### Build
 
-Note: For Windows, CMake or Zig can be used.
+In order to build llama.cpp you have three different options.
 
-1. Use `make`
+- Using `make`:
+  - On Linux or MacOS:
 
-   ```bash
-   make
-   ```
+    ```bash
+    make
+    ```
 
-1. Use CMake
+  - On Windows:
+
+    1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
+    2. Extract `w64devkit` on your pc.
+    3. Run `w64devkit.exe`.
+    4. Use the `cd` command to reach the `llama.cpp` folder.
+    5. From here you can run:
+       ```bash
+       make
+       ```
+
+- Using `CMake`:
 
   ```bash
   mkdir build
@@ -184,12 +192,71 @@ Note: For Windows, CMake or Zig can be used.
   cmake --build . --config Release
   ```
 
-1. Use Zig
+- Using `Zig`:
 
   ```bash
   zig build -Drelease-fast
   ```
 
+### BLAS Build
+
+Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). BLAS doesn't affect the normal generation performance. There are currently three different implementations of it:
+
+- Accelerate Framework:
+
+  This is only available on Mac PCs and it's enabled by default. You can just build using the normal instructions.
+
+- OpenBLAS:
+
+  This provides BLAS acceleration using only the CPU. Make sure to have OpenBLAS installed on your machine.
+
+  - Using `make`:
+    - On Linux:
+      ```bash
+      make LLAMA_OPENBLAS=1
+      ```
+      Note: In order to build on Arch Linux with OpenBLAS support enabled you must edit the Makefile adding at the end of the line 105: `-lcblas`
+
+    - On Windows:
+
+      1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
+      2. Download the latest version of [OpenBLAS for Windows](https://github.com/xianyi/OpenBLAS/releases).
+      3. Extract `w64devkit` on your pc.
+      4. From the OpenBLAS zip that you just downloaded copy `libopenblas.a`, located inside the `lib` folder, inside `w64devkit\x86_64-w64-mingw32\lib`.
+      5. From the same OpenBLAS zip copy the content of the `include` folder inside `w64devkit\x86_64-w64-mingw32\include`.
+      6. Run `w64devkit.exe`.
+      7. Use the `cd` command to reach the `llama.cpp` folder.
+      8. From here you can run:
+
+         ```bash
+         make LLAMA_OPENBLAS=1
+         ```
+
+  - Using `CMake` on Linux:
+
+    ```bash
+    mkdir build
+    cd build
+    cmake .. -DLLAMA_OPENBLAS=ON
+    cmake --build . --config Release
+    ```
+
+- cuBLAS
+
+  This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
+  - Using `make`:
+    ```bash
+    make LLAMA_CUBLAS=1
+    ```
+  - Using `CMake`:
+
+    ```bash
+    mkdir build
+    cd build
+    cmake .. -DLLAMA_CUBLAS=ON
+    cmake --build . --config Release
+    ```
+
 ### Prepare Data & Run
 
 ```bash
@@ -203,8 +270,8 @@ python3 -m pip install -r requirements.txt
 # convert the 7B model to ggml FP16 format
 python3 convert.py models/7B/
 
-# quantize the model to 4-bits (using method 2 = q4_0)
-./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin 2
+# quantize the model to 4-bits (using q4_0 method)
+./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin q4_0
 
 # run the inference
 ./main -m ./models/7B/ggml-model-q4_0.bin -n 128
@@ -223,6 +290,24 @@ As the models are currently fully loaded into memory, you will need adequate dis
 | 30B | 60 GB | 19.5 GB |
 | 65B | 120 GB | 38.5 GB |
 
+### Quantization
+
+Several quantization methods are supported. They differ in the resulting model disk size and inference speed.
+
+Model | F16 | Q4_0 | Q4_1 | Q4_2 | Q4_3 | Q5_0 | Q5_1 | Q8_0
+-- | -- | -- | -- | -- | -- | -- | -- | --
+7B (ppl) | 5.9565 | 6.2103 | 6.1286 | 6.1698 | 6.0617 | 6.0139 | 5.9934 | 5.9571
+7B (size) | 13.0G | 4.0G | 4.8G | 4.0G | 4.8G | 4.4G | 4.8G | 7.1G
+7B (ms/tok @ 4th) | 128 | 56 | 61 | 84 | 91 | 91 | 95 | 75
+7B (ms/tok @ 8th) | 128 | 47 | 55 | 48 | 53 | 53 | 59 | 75
+7B (bpw) | 16.0 | 5.0 | 6.0 | 5.0 | 6.0 | 5.5 | 6.0 | 9.0
+-- | -- | -- | -- | -- | -- | -- | -- | --
+13B (ppl) | 5.2455 | 5.3748 | 5.3471 | 5.3433 | 5.3234 | 5.2768 | 5.2582 | 5.2458
+13B (size) | 25.0G | 7.6G | 9.1G | 7.6G | 9.1G | 8.4G | 9.1G | 14G
+13B (ms/tok @ 4th) | 239 | 104 | 113 | 160 | 175 | 176 | 185 | 141
+13B (ms/tok @ 8th) | 240 | 85 | 99 | 97 | 114 | 108 | 117 | 147
+13B (bpw) | 16.0 | 5.0 | 6.0 | 5.0 | 6.0 | 5.5 | 6.0 | 9.0
+
 ### Interactive mode
 
 If you want a more ChatGPT-like experience, you can run in interactive mode by passing `-i` as a parameter.
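The bits-per-weight (bpw) rows in the quantization table above can be cross-checked against the block layouts introduced later in this diff (the `block_q5_0`, `block_q5_1` and `block_q8_0` structs in `ggml-cuda.cu`): each block covers 32 weights, so bpw is simply the block size in bits divided by 32. The worked example below is an editorial illustration, not part of the commit:

$$ \mathrm{bpw}(\mathrm{Q5\_0}) = \frac{16_{\ \mathrm{fp16}\ d} + 4 \cdot 8_{\ qh} + 16 \cdot 8_{\ qs}}{32} = \frac{176}{32} = 5.5 \qquad \mathrm{bpw}(\mathrm{Q8\_0}) = \frac{32_{\ \mathrm{fp32}\ d} + 32 \cdot 8_{\ qs}}{32} = \frac{288}{32} = 9.0 $$

Both values match the Q5_0 and Q8_0 columns reported for the 7B and 13B models.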
@@ -241,7 +326,7 @@ Here is an example of a few-shot interaction, invoked with the command
 ./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
 ```
 
-Note the use of `--color` to distinguish between user input and generated text.
+Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `main` example program.
 
 
SHA256SUMS (16 changes)
@@ -1,16 +1,16 @@
|
||||
700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d models/7B/consolidated.00.pth
|
||||
666a4bb533b303bdaf89e1b6a3b6f93535d868de31d903afdc20983dc526c847 models/7B/ggml-model-f16.bin
|
||||
fcb7664c2e69776920b526362a243e912f73c36b1ec892eb354bab940f5edb5a models/7B/ggml-model-q4_0.bin
|
||||
99aeb35f26b577fa2732716cca4d8b5ada39a78ea9b2dca2651fc632b5d101b6 models/7B/ggml-model-q4_0.bin
|
||||
cc061458339a3eb8bcecbf0a825e9924fb7d1a8150f63cd5d091caa99215aafe models/7B/ggml-model-q4_1.bin
|
||||
1bc7484c24a87612726d756f1761890e7acf5f412e23378577ce50fbe789b5b8 models/7B/ggml-model-q4_2.bin
|
||||
25b050337a87344da687a7f2adddc03bd99b7f6c140450e836649f3585fb6496 models/7B/ggml-model-q4_2.bin
|
||||
3429bf198ec771886cf81a574df45245f3ebf04f0ce0956b73ef5d0ab01ff48b models/7B/ggml-model-q4_3.bin
|
||||
7e89e242ddc0dd6f060b43ca219ce8b3e8f08959a72cb3c0855df8bb04d46265 models/7B/params.json
|
||||
745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08 models/13B/consolidated.00.pth
|
||||
d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085 models/13B/consolidated.01.pth
|
||||
2b206e9b21fb1076f11cafc624e2af97c9e48ea09312a0962153acc20d45f808 models/13B/ggml-model-f16.bin
|
||||
4b69e4d6b6e3275230955997b90407fceca7e5ab3daf2e63a2c9e7270a8e1e3e models/13B/ggml-model-q4_0.bin
|
||||
eecb575d325d935157761172e2bf05984dad216eb2b06777b73463cf9b818bab models/13B/ggml-model-q4_0.bin
|
||||
d9581b5b88e5622532fe897c9f9b0e67a317d22dd27a6f90fa4ab8c6d23ccdbb models/13B/ggml-model-q4_1.bin
|
||||
8d55a2077317ec9a928c7851d6a43e08e51f7e9e08360f2a7a7e1deefea3134f models/13B/ggml-model-q4_2.bin
|
||||
75a218a47df03f5f96354656329864613abcb67779412b9bc2282b28c1c3cbaa models/13B/ggml-model-q4_2.bin
|
||||
4208cdec9788ffa48dc1a17af2c36a0299f5bf3eb0e2b87889dda7fad591fca3 models/13B/ggml-model-q4_3.bin
|
||||
4ab77bec4d4405ccb66a97b282574c89a94417e3c32e5f68f37e2876fc21322f models/13B/params.json
|
||||
e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067 models/30B/consolidated.00.pth
|
||||
@@ -18,9 +18,9 @@ e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067 models/30B/con
|
||||
24a87f01028cbd3a12de551dcedb712346c0b5cbdeff1454e0ddf2df9b675378 models/30B/consolidated.02.pth
|
||||
1adfcef71420886119544949767f6a56cb6339b4d5fcde755d80fe68b49de93b models/30B/consolidated.03.pth
|
||||
7e1b524061a9f4b27c22a12d6d2a5bf13b8ebbea73e99f218809351ed9cf7d37 models/30B/ggml-model-f16.bin
|
||||
7a679908ce31c9d6ae2e38d6059bcd4d0ad3a870cd58cc1c8f7b36f2b2f51c73 models/30B/ggml-model-q4_0.bin
|
||||
517b9e525742c42b5478a6280a4b41ec66f46298c57aba7f0453d491682fe42d models/30B/ggml-model-q4_0.bin
|
||||
7b75ac615fa369ee593493a7e6ef87542bf0350255db928b22c5a24f6d598bcd models/30B/ggml-model-q4_1.bin
|
||||
2c82b4954a94a6a284f452f6011c1e4f0d20362c194a0b1eb5737f5fd8a20fb3 models/30B/ggml-model-q4_2.bin
|
||||
aadbc9cf806313a55be570f62884eed289d30c313fac3b7838717e01bd553204 models/30B/ggml-model-q4_2.bin
|
||||
a6188660199dbcb8d5658abe7d89169869e50423494385830d9e6b330ea7fc33 models/30B/ggml-model-q4_3.bin
|
||||
2c07118ea98d69dbe7810d88520e30288fa994751b337f8fca02b171955f44cb models/30B/params.json
|
||||
135c563f6b3938114458183afb01adc9a63bef3d8ff7cccc3977e5d3664ecafe models/65B/consolidated.00.pth
|
||||
@@ -32,9 +32,9 @@ a287c0dfe49081626567c7fe87f74cce5831f58e459b427b5e05567641f47b78 models/65B/con
|
||||
72b4eba67a1a3b18cb67a85b70f8f1640caae9b40033ea943fb166bd80a7b36b models/65B/consolidated.06.pth
|
||||
d27f5b0677d7ff129ceacd73fd461c4d06910ad7787cf217b249948c3f3bc638 models/65B/consolidated.07.pth
|
||||
60758f2384d74e423dffddfd020ffed9d3bb186ebc54506f9c4a787d0f5367b0 models/65B/ggml-model-f16.bin
|
||||
c671fe1bce71499ac732ec999770ebe53ac486623a7891e42c9dfdb6962d2c64 models/65B/ggml-model-q4_0.bin
|
||||
01672072136f8be6ca9d7cebe5f86ed316e8b85851b9fe3de951809233cea4f2 models/65B/ggml-model-q4_0.bin
|
||||
4743a28aac3e5f32a6e838a815f51d3779de44fbbe251d745251e66c23c5950f models/65B/ggml-model-q4_1.bin
|
||||
4a145a210c56982389b1ed34387e0590c3e0d7325fa9be4f2284fe4d244a3633 models/65B/ggml-model-q4_2.bin
|
||||
1b6f6588d0e2ecfe6c4d849088e48e5e3083466b962daa32e3261363e21fc5e9 models/65B/ggml-model-q4_2.bin
|
||||
305e91a4608b4f627b9b8ad5b4af75187d2684254bfd76dcb9db571618ef293c models/65B/ggml-model-q4_3.bin
|
||||
999ed1659b469ccc2a941714c0a9656fa571d17c9f7c8c7589817ca90edef51b models/65B/params.json
|
||||
9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 models/tokenizer.model
|
||||
|
||||
@@ -49,7 +49,12 @@ def translate_tensor_name(t: str) -> str:
 def write_file_header(fout: TextIO, params: Dict[str, Any]) -> None:
     fout.write(b"ggla"[::-1]) # magic (ggml lora)
     fout.write(struct.pack("i", 1)) # file version
-    fout.write(struct.pack("ii", params["r"], params["lora_alpha"]))
+    fout.write(struct.pack("i", params["r"]))
+    # https://opendelta.readthedocs.io/en/latest/modules/deltas.html says that `lora_alpha` is an int
+    # but some models ship a float value instead
+    # let's convert to int, but fail if lossless conversion is not possible
+    assert int(params["lora_alpha"]) == params["lora_alpha"], "cannot convert float to int losslessly"
+    fout.write(struct.pack("i", int(params["lora_alpha"])))
 
 
 def write_tensor_header(
@@ -89,7 +94,7 @@ if params["peft_type"] != "LORA":
     print(f"Error: unsupported adapter type {params['peft_type']}, expected LORA")
     sys.exit(1)
 
-if params["fan_in_fan_out"] == True:
+if params["fan_in_fan_out"] is True:
     print("Error: param fan_in_fan_out is not supported")
     sys.exit(1)
 
@@ -34,4 +34,5 @@ else()
     add_subdirectory(quantize-stats)
     add_subdirectory(perplexity)
     add_subdirectory(embedding)
+    add_subdirectory(save-load-state)
 endif()
@@ -156,10 +156,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.interactive = true;
         } else if (arg == "--embedding") {
             params.embedding = true;
-        } else if (arg == "--interactive-start") {
-            params.interactive = true;
         } else if (arg == "--interactive-first") {
-            params.interactive_start = true;
+            params.interactive_first = true;
         } else if (arg == "-ins" || arg == "--instruct") {
            params.instruct = true;
         } else if (arg == "--color") {
@@ -43,7 +43,7 @@ struct gpt_params {
     bool interactive = false; // interactive mode
 
     bool embedding = false; // get only sentence embedding
-    bool interactive_start = false; // wait for user input immediately
+    bool interactive_first = false; // wait for user input immediately
 
     bool instruct = false; // instruction mode (used for Alpaca models)
     bool ignore_eos = false; // do not stop generating after eos
@@ -1,3 +1,191 @@
|
||||
# main
|
||||
# llama.cpp/example/main
|
||||
|
||||
TODO
|
||||
This example program allows you to use various LLaMA language models in an easy and efficient way. It is specifically designed to work with the [llama.cpp](https://github.com/ggerganov/llama.cpp) project, which provides a plain C/C++ implementation with optional 4-bit quantization support for faster, lower memory inference, and is optimized for desktop CPUs. This program can be used to perform various inference tasks with LLaMA models, including generating text based on user-provided prompts and chat-like interactions with reverse prompts.
|
||||
|
||||
## Table of Contents
|
||||
|
||||
1. [Quick Start](#quick-start)
|
||||
2. [Common Options](#common-options)
|
||||
3. [Input Prompts](#input-prompts)
|
||||
4. [Interaction](#interaction)
|
||||
5. [Context Management](#context-management)
|
||||
6. [Generation Flags](#generation-flags)
|
||||
7. [Performance Tuning and Memory Options](#performance-tuning-and-memory-options)
|
||||
8. [Additional Options](#additional-options)
|
||||
|
||||
## Quick Start
|
||||
|
||||
To get started right away, run the following command, making sure to use the correct path for the model you have:
|
||||
|
||||
```bash
|
||||
./main -m models/7B/ggml-model.bin --prompt "Once upon a time"
|
||||
```
|
||||
|
||||
The following command generates "infinite" text from a starting prompt (you can use `Ctrl-C` to stop it):
|
||||
|
||||
```bash
|
||||
./main -m models/7B/ggml-model.bin --ignore-eos --n_predict -1 --keep -1 --prompt "Once upon a time"
|
||||
```
|
||||
|
||||
For an interactive experience, try this command:
|
||||
|
||||
```bash
|
||||
./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " --prompt $'User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:'
|
||||
```
|
||||
|
||||
Note that the newline characters in the prompt string above only work on Linux. On Windows, you will have to use the ``--file`` option (see below) to load a multi-line prompt from file instead.
|
||||
|
||||
## Common Options
|
||||
|
||||
In this section, we cover the most commonly used options for running the `main` program with the LLaMA models:
|
||||
|
||||
- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
|
||||
- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
|
||||
- `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models.
|
||||
- `-t N, --threads N`: Set the number of threads to use during computation. It is recommended to set this to the number of physical cores your CPU has.
|
||||
- `-n N, --n_predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
|
||||
- `-c N, --ctx_size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
|
||||
|
||||
## Input Prompts
|
||||
|
||||
The `main` program provides several ways to interact with the LLaMA models using input prompts:
|
||||
|
||||
- `--prompt PROMPT`: Provide a prompt directly as a command-line option.
|
||||
- `--file FNAME`: Provide a file containing a prompt or multiple prompts.
|
||||
- `--interactive-first`: Run the program in interactive mode and wait for input right away. (More on this below.)
|
||||
- `--random-prompt`: Start with a randomized prompt.
|
||||
|
||||
## Interaction
|
||||
|
||||
The `main` program offers a seamless way to interact with LLaMA models, allowing users to engage in real-time conversations or provide instructions for specific tasks. The interactive mode can be triggered using various options, including `--interactive`, `--interactive-first`, and `--instruct`.
|
||||
|
||||
In interactive mode, users can participate in text generation by injecting their input during the process. Users can press `Ctrl+C` at any time to interject and type their input, followed by pressing `Return` to submit it to the LLaMA model. To submit additional lines without finalizing input, users can end the current line with a backslash (`\`) and continue typing.
|
||||
|
||||
### Interaction Options
|
||||
|
||||
- `-i, --interactive`: Run the program in interactive mode, allowing users to engage in real-time conversations or provide specific instructions to the model.
|
||||
- `--interactive-first`: Run the program in interactive mode and immediately wait for user input before starting the text generation.
|
||||
- `-ins, --instruct`: Run the program in instruction mode, which is specifically designed to work with Alpaca models that excel in completing tasks based on user instructions.
|
||||
- `--color`: Enable colorized output to visually distinguish between prompts, user input, and generated text.
|
||||
|
||||
By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs.
|
||||
|
||||
### Reverse Prompts
|
||||
|
||||
Reverse prompts are a powerful way to create a chat-like experience with a LLaMA model by pausing the text generation when specific text strings are encountered:
|
||||
|
||||
- `-r PROMPT, --reverse-prompt PROMPT`: Specify one or multiple reverse prompts to pause text generation and switch to interactive mode. For example, `-r "User:"` can be used to jump back into the conversation whenever it's the user's turn to speak. This helps create a more interactive and conversational experience. However, the reverse prompt doesn't work when it ends with a space.
|
||||
|
||||
To overcome this limitation, you can use the `--in-prefix` flag to add a space or any other characters after the reverse prompt.
|
||||
|
||||
### In-Prefix
|
||||
|
||||
The `--in-prefix` flag adds a prefix to your input; primarily, this is used to insert a space after the reverse prompt. Here's an example of how to use the `--in-prefix` flag in conjunction with the `--reverse-prompt` flag:
|
||||
|
||||
```sh
|
||||
./main -r "User:" --in-prefix " "
|
||||
```
|
||||
|
||||
### Instruction Mode
|
||||
|
||||
Instruction mode is particularly useful when working with Alpaca models, which are designed to follow user instructions for specific tasks:
|
||||
|
||||
- `-ins, --instruct`: Enable instruction mode to leverage the capabilities of Alpaca models in completing tasks based on user-provided instructions.
|
||||
|
||||
Technical detail: the user's input is internally prefixed with the reverse prompt (or ``### Instruction:`` as the default), and followed by ``### Response:`` (except if you just press Return without any input, to keep generating a longer response).
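To make the wrapping concrete, here is a minimal sketch; this is an editorial illustration rather than code from this commit, and the exact whitespace used by the real `main` example may differ:

```cpp
#include <string>

// Sketch only: instruction mode surrounds each user input with the Alpaca-style
// markers described above (the prefix is also registered as a reverse prompt).
std::string wrap_instruction(const std::string & user_input) {
    const std::string prefix = "### Instruction:\n\n";
    const std::string suffix = "\n\n### Response:\n\n";  // skipped if the input is empty
    return prefix + user_input + suffix;
}
```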
|
||||
|
||||
By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs.
|
||||
|
||||
## Context Management
|
||||
|
||||
During text generation, LLaMA models have a limited context size, which means they can only consider a certain number of tokens from the input and generated text. When the context fills up, the model resets internally, potentially losing some information from the beginning of the conversation or instructions. Context management options help maintain continuity and coherence in these situations.
|
||||
|
||||
### Context Size
|
||||
|
||||
The `--ctx_size` option allows you to set the size of the prompt context used by the LLaMA models during text generation. A larger context size helps the model to better comprehend and generate responses for longer input or conversations.
|
||||
|
||||
- `-c N, --ctx_size N`: Set the size of the prompt context (default: 512). The LLaMA models were built with a context of 2048, which will yield the best results on longer input/inference. However, increasing the context size beyond 2048 may lead to unpredictable results.
|
||||
|
||||
### Keep Prompt
|
||||
|
||||
The `--keep` option allows users to retain the original prompt when the model runs out of context, ensuring a connection to the initial instruction or conversation topic is maintained.
|
||||
|
||||
- `--keep N`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context. By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt.
|
||||
|
||||
By utilizing context management options like `--ctx_size` and `--keep`, you can maintain a more coherent and consistent interaction with the LLaMA models, ensuring that the generated text remains relevant to the original prompt or conversation.
|
||||
|
||||
## Generation Flags
|
||||
|
||||
The following options are related to controlling the text generation process, influencing the diversity, creativity, and quality of the generated text. Understanding these options will help you fine-tune the output according to your needs:
|
||||
|
||||
### Number of Tokens to Predict
|
||||
|
||||
- `-n N, --n_predict N`: Set the number of tokens to predict when generating text (default: 128, -1 = infinity).
|
||||
|
||||
The `--n_predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text. A value of -1 will cause text to be generated without limit.
|
||||
|
||||
It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `n_predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the ``--ignore-eos`` parameter.
|
||||
|
||||
### RNG Seed
|
||||
|
||||
- `-s SEED, --seed SEED`: Set the random number generator (RNG) seed (default: -1).
|
||||
|
||||
The RNG seed is used to initialize the random number generator that influences the text generation process. By setting a specific seed value, you can obtain consistent and reproducible results across multiple runs with the same input and settings. This can be helpful for testing, debugging, or comparing the effects of different options on the generated text to see when they diverge. If the seed is set to a value less than or equal to 0, a random seed will be used, which will result in different outputs on each run.
|
||||
|
||||
### Temperature
|
||||
|
||||
- `--temp N`: Adjust the randomness of the generated text (default: 0.8).
|
||||
|
||||
Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.
|
||||
|
||||
Example usage: `--temp 0.8`
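As a rough sketch of the mechanism (illustrative only, not the project's sampler code), temperature rescales the logits before they are normalized into probabilities:

```cpp
#include <algorithm>
#include <cmath>
#include <vector>

// Illustration only: softmax with temperature. Higher temp flattens the
// distribution (more random); lower temp sharpens it (more deterministic).
std::vector<float> softmax_with_temperature(std::vector<float> logits, float temp) {
    const float max_logit = *std::max_element(logits.begin(), logits.end());
    float sum = 0.0f;
    for (float & l : logits) {
        l = std::exp((l - max_logit) / temp);  // subtract max for numerical stability
        sum += l;
    }
    for (float & l : logits) {
        l /= sum;  // normalize to probabilities
    }
    return logits;
}
```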
|
||||
|
||||
### Repeat Penalty
|
||||
|
||||
- `--repeat_penalty N`: Control the repetition of token sequences in the generated text (default: 1.1).
|
||||
|
||||
Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. The default value is 1.1.
|
||||
|
||||
Example usage: `--repeat_penalty 1.1`
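The idea can be sketched as follows (illustrative only; the penalty actually applied by `llama_sample_top_p_top_k` may differ in detail):

```cpp
#include <unordered_set>
#include <vector>

// Illustration only: make tokens that already appeared in the recent window
// (cf. --repeat_last_n) less likely. A penalty of 1.0 leaves logits unchanged.
void apply_repeat_penalty(std::vector<float> & logits,
                          const std::unordered_set<int> & recent_tokens,
                          float penalty) {
    for (int token : recent_tokens) {
        float & l = logits[token];
        // Shrink positive logits and push negative ones further down.
        l = (l > 0.0f) ? l / penalty : l * penalty;
    }
}
```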
|
||||
|
||||
### Top-K Sampling
|
||||
|
||||
- `--top_k N`: Limit the next token selection to the K most probable tokens (default: 40).
|
||||
|
||||
Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top_k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text. The default value is 40.
|
||||
|
||||
Example usage: `--top_k 40`
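Conceptually, top-k keeps only the k highest-scoring tokens before sampling. A minimal sketch (editorial illustration, not the repository's implementation):

```cpp
#include <algorithm>
#include <cmath>
#include <functional>
#include <vector>

// Illustration only: discard everything outside the k most probable tokens by
// sending their logits to -infinity; sampling then happens among the survivors.
void top_k_filter(std::vector<float> & logits, size_t k) {
    if (k == 0 || k >= logits.size()) return;  // nothing to filter
    std::vector<float> sorted = logits;
    std::nth_element(sorted.begin(), sorted.begin() + (k - 1), sorted.end(),
                     std::greater<float>());
    const float cutoff = sorted[k - 1];  // value of the k-th largest logit
    for (float & l : logits) {
        if (l < cutoff) {
            l = -INFINITY;
        }
    }
}
```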
|
||||
|
||||
### Top-P Sampling
|
||||
|
||||
- `--top_p N`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9).
|
||||
|
||||
Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. The default value is 0.9.
|
||||
|
||||
Example usage: `--top_p 0.9`
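A matching sketch for nucleus sampling (again illustrative, not the project's code): sort tokens by probability and keep the smallest prefix whose cumulative mass reaches `top_p`:

```cpp
#include <algorithm>
#include <numeric>
#include <vector>

// Illustration only: return the token indices that form the top-p "nucleus".
// The caller renormalizes their probabilities and samples among them.
std::vector<int> top_p_candidates(const std::vector<float> & probs, float top_p) {
    std::vector<int> idx(probs.size());
    std::iota(idx.begin(), idx.end(), 0);
    std::sort(idx.begin(), idx.end(),
              [&probs](int a, int b) { return probs[a] > probs[b]; });  // most likely first
    std::vector<int> kept;
    float cumulative = 0.0f;
    for (int i : idx) {
        kept.push_back(i);
        cumulative += probs[i];
        if (cumulative >= top_p) {
            break;  // nucleus reached
        }
    }
    return kept;
}
```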
|
||||
|
||||
By adjusting these options, you can control the diversity, quality, and creativity of the generated text to better suit your needs. You can experiment with different combinations of values to find the best settings for your specific use case.
|
||||
|
||||
## Performance Tuning and Memory Options
|
||||
|
||||
These options help improve the performance and memory usage of the LLaMA models:
|
||||
|
||||
- `-t N, --threads N`: Set the number of threads to use during computation. Using the correct number of threads can greatly improve performance. It is recommended to set this value to the number of CPU cores.
|
||||
- `--mlock`: Lock the model in memory, preventing it from being swapped out when mmaped. This can improve performance.
|
||||
- `--no-mmap`: Do not memory-map the model. This results in a slower load time but may reduce pageouts if you're not using `mlock`.
|
||||
- `--memory_f32`: Use 32 bit floats instead of 16 bit floats for memory key+value, allowing higher quality inference at the cost of memory.
|
||||
- `-b N, --batch_size N`: Set the batch size for prompt processing (default: 512). This large batch size benefits users who have BLAS installed and enabled it during the build. If you don't have BLAS enabled ("BLAS=0"), you can use a smaller number, such as 8, to see the prompt progress as it's evaluated in some situations.
|
||||
|
||||
For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-data--run).
|
||||
|
||||
By understanding and using these performance tuning settings, you can optimize the LLaMA model's behavior to achieve the best performance for your specific needs.
|
||||
|
||||
## Additional Options
|
||||
|
||||
These options provide extra functionality and customization when running the LLaMA models:
|
||||
|
||||
- `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated.
|
||||
- `--verbose-prompt`: Print the prompt before generating text.
|
||||
- `--mtest`: Test the model's functionality by running a series of tests to ensure it's working properly.
|
||||
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
|
||||
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
|
||||
|
||||
@@ -178,12 +178,12 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// in instruct mode, we inject a prefix and a suffix to each input by the user
|
||||
if (params.instruct) {
|
||||
params.interactive_start = true;
|
||||
params.interactive_first = true;
|
||||
params.antiprompt.push_back("### Instruction:\n\n");
|
||||
}
|
||||
|
||||
// enable interactive mode if reverse prompt or interactive start is specified
|
||||
if (params.antiprompt.size() != 0 || params.interactive_start) {
|
||||
if (params.antiprompt.size() != 0 || params.interactive_first) {
|
||||
params.interactive = true;
|
||||
}
|
||||
|
||||
@@ -246,7 +246,7 @@ int main(int argc, char ** argv) {
|
||||
#endif
|
||||
" - Press Return to return control to LLaMa.\n"
|
||||
" - If you want to submit another line, end your input in '\\'.\n\n");
|
||||
is_interacting = params.interactive_start;
|
||||
is_interacting = params.interactive_first;
|
||||
}
|
||||
|
||||
bool is_antiprompt = false;
|
||||
|
||||
@@ -2,8 +2,19 @@
|
||||
#include "llama.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <map>
|
||||
#include <string>
|
||||
|
||||
static const std::map<std::string, enum llama_ftype> LLAMA_FTYPE_MAP = {
|
||||
{"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0},
|
||||
{"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1},
|
||||
{"q4_2", LLAMA_FTYPE_MOSTLY_Q4_2},
|
||||
{"q4_3", LLAMA_FTYPE_MOSTLY_Q4_3},
|
||||
{"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0},
|
||||
{"q5_1", LLAMA_FTYPE_MOSTLY_Q5_1},
|
||||
{"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0},
|
||||
};
|
||||
|
||||
// usage:
|
||||
// ./quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type
|
||||
//
|
||||
@@ -12,10 +23,9 @@ int main(int argc, char ** argv) {
|
||||
|
||||
if (argc < 4) {
|
||||
fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type [nthread]\n", argv[0]);
|
||||
fprintf(stderr, " type = %d - q4_0\n", LLAMA_FTYPE_MOSTLY_Q4_0);
|
||||
fprintf(stderr, " type = %d - q4_1\n", LLAMA_FTYPE_MOSTLY_Q4_1);
|
||||
fprintf(stderr, " type = %d - q4_2\n", LLAMA_FTYPE_MOSTLY_Q4_2);
|
||||
fprintf(stderr, " type = %d - q4_3\n", LLAMA_FTYPE_MOSTLY_Q4_3);
|
||||
for (auto it = LLAMA_FTYPE_MAP.begin(); it != LLAMA_FTYPE_MAP.end(); it++) {
|
||||
fprintf(stderr, " type = \"%s\" or %d\n", it->first.c_str(), it->second);
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -29,7 +39,18 @@ int main(int argc, char ** argv) {
|
||||
const std::string fname_inp = argv[1];
|
||||
const std::string fname_out = argv[2];
|
||||
|
||||
const enum llama_ftype ftype = (enum llama_ftype)atoi(argv[3]);
|
||||
enum llama_ftype ftype;
|
||||
if (argv[3][0] == 'q') {
|
||||
auto it = LLAMA_FTYPE_MAP.find(argv[3]);
|
||||
if (it == LLAMA_FTYPE_MAP.end()) {
|
||||
fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, argv[3]);
|
||||
return 1;
|
||||
}
|
||||
ftype = it->second;
|
||||
} else {
|
||||
ftype = (enum llama_ftype)atoi(argv[3]);
|
||||
}
|
||||
|
||||
int nthread = argc > 4 ? atoi(argv[4]) : 0;
|
||||
|
||||
const int64_t t_main_start_us = ggml_time_us();
|
||||
|
||||
examples/save-load-state/CMakeLists.txt (new file, 4 lines)
@@ -0,0 +1,4 @@
|
||||
set(TARGET save-load-state)
|
||||
add_executable(${TARGET} save-load-state.cpp)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
examples/save-load-state/save-load-state.cpp (new file, 128 lines)
@@ -0,0 +1,128 @@
|
||||
#include <vector>
|
||||
#include <cstdio>
|
||||
#include <chrono>
|
||||
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
#include "llama.cpp"
|
||||
|
||||
using namespace std;
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
params.model = "models/llama-7B/ggml-model.bin";
|
||||
params.seed = 42;
|
||||
params.n_threads = 4;
|
||||
params.repeat_last_n = 64;
|
||||
params.prompt = "The quick brown fox";
|
||||
|
||||
if (gpt_params_parse(argc, argv, params) == false) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
auto lparams = llama_context_default_params();
|
||||
|
||||
lparams.n_ctx = params.n_ctx;
|
||||
lparams.n_parts = params.n_parts;
|
||||
lparams.seed = params.seed;
|
||||
lparams.f16_kv = params.memory_f16;
|
||||
lparams.use_mmap = params.use_mmap;
|
||||
lparams.use_mlock = params.use_mlock;
|
||||
|
||||
auto n_past = 0;
|
||||
auto last_n_tokens_data = vector<llama_token>(params.repeat_last_n, 0);
|
||||
|
||||
// init
|
||||
auto ctx = llama_init_from_file(params.model.c_str(), lparams);
|
||||
auto tokens = vector<llama_token>(params.n_ctx);
|
||||
auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), tokens.size(), true);
|
||||
|
||||
if (n_prompt_tokens < 1) {
|
||||
fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// evaluate prompt
|
||||
|
||||
llama_eval(ctx, tokens.data(), n_prompt_tokens, n_past, params.n_threads);
|
||||
|
||||
last_n_tokens_data.insert(last_n_tokens_data.end(), tokens.data(), tokens.data() + n_prompt_tokens);
|
||||
n_past += n_prompt_tokens;
|
||||
|
||||
// Save state (rng, logits, embedding and kv_cache) to file
|
||||
FILE *fp_write = fopen("dump_state.bin", "wb");
|
||||
auto state_size = llama_get_state_size(ctx);
|
||||
auto state_mem = new uint8_t[state_size];
|
||||
llama_copy_state_data(ctx, state_mem); // could also copy directly to memory mapped file
|
||||
fwrite(state_mem, 1, state_size, fp_write);
|
||||
fclose(fp_write);
|
||||
|
||||
// save state (last tokens)
|
||||
auto last_n_tokens_data_saved = vector<llama_token>(last_n_tokens_data);
|
||||
auto n_past_saved = n_past;
|
||||
|
||||
// first run
|
||||
printf("\n%s", params.prompt.c_str());
|
||||
for (auto i = 0; i < params.n_predict; i++) {
|
||||
auto next_token = llama_sample_top_p_top_k(
|
||||
ctx,
|
||||
&last_n_tokens_data.back() - params.repeat_last_n,
|
||||
params.repeat_last_n,
|
||||
40,
|
||||
1.0,
|
||||
1.0,
|
||||
1.1);
|
||||
auto next_token_str = llama_token_to_str(ctx, next_token);
|
||||
last_n_tokens_data.push_back(next_token);
|
||||
printf("%s", next_token_str);
|
||||
if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
|
||||
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
n_past += 1;
|
||||
}
|
||||
printf("\n\n");
|
||||
|
||||
// free old model
|
||||
llama_free(ctx);
|
||||
|
||||
// load new model
|
||||
|
||||
auto ctx2 = llama_init_from_file(params.model.c_str(), lparams);
|
||||
|
||||
// Load state (rng, logits, embedding and kv_cache) from file
|
||||
FILE *fp_read = fopen("dump_state.bin", "rb");
|
||||
auto state_size2 = llama_get_state_size(ctx2);
|
||||
if (state_size != state_size2) {
|
||||
fprintf(stderr, "\n%s : failed to validate state size\n", __func__);
|
||||
}
|
||||
fread(state_mem, 1, state_size, fp_read);
|
||||
llama_set_state_data(ctx2, state_mem); // could also read directly from memory mapped file
|
||||
fclose(fp_read);
|
||||
|
||||
// restore state (last tokens)
|
||||
last_n_tokens_data = last_n_tokens_data_saved;
|
||||
n_past = n_past_saved;
|
||||
|
||||
// second run
|
||||
for (auto i = 0; i < params.n_predict; i++) {
|
||||
auto next_token = llama_sample_top_p_top_k(
|
||||
ctx2,
|
||||
&last_n_tokens_data.back() - params.repeat_last_n,
|
||||
params.repeat_last_n,
|
||||
40,
|
||||
1.0,
|
||||
1.0,
|
||||
1.1);
|
||||
auto next_token_str = llama_token_to_str(ctx2, next_token);
|
||||
last_n_tokens_data.push_back(next_token);
|
||||
printf("%s", next_token_str);
|
||||
if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
|
||||
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
n_past += 1;
|
||||
}
|
||||
printf("\n\n");
|
||||
return 0;
|
||||
}
|
||||
@@ -30,9 +30,9 @@
|
||||
mv bin/* $out/bin/
|
||||
mv $out/bin/main $out/bin/llama
|
||||
|
||||
echo "#!${llama-python}/bin/python" > $out/bin/convert-pth-to-ggml
|
||||
cat ${./convert-pth-to-ggml.py} >> $out/bin/convert-pth-to-ggml
|
||||
chmod +x $out/bin/convert-pth-to-ggml
|
||||
echo "#!${llama-python}/bin/python" > $out/bin/convert.py
|
||||
cat ${./convert.py} >> $out/bin/convert.py
|
||||
chmod +x $out/bin/convert.py
|
||||
'';
|
||||
meta.mainProgram = "llama";
|
||||
};
|
||||
|
||||
ggml-cuda.cu (113 changes)
@@ -37,6 +37,30 @@ typedef struct {
|
||||
} block_q4_3;
|
||||
static_assert(sizeof(block_q4_3) == 2 * sizeof(ggml_fp16_t) + QK4_3 / 2, "wrong q4_3 block size/padding");
|
||||
|
||||
#define QK5_0 32
|
||||
typedef struct {
|
||||
__half d; // delta
|
||||
uint8_t qh[4]; // 5-th bit of quants
|
||||
uint8_t qs[QK5_0 / 2]; // nibbles / quants
|
||||
} block_q5_0;
|
||||
static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
|
||||
|
||||
#define QK5_1 32
|
||||
typedef struct {
|
||||
__half d; // delta
|
||||
__half m; // min
|
||||
uint32_t qh; // 5-th bit of quants
|
||||
uint8_t qs[QK5_1 / 2]; // nibbles / quants
|
||||
} block_q5_1;
|
||||
static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
|
||||
|
||||
#define QK8_0 32
|
||||
typedef struct {
|
||||
float d; // delta
|
||||
int8_t qs[QK8_0]; // quants
|
||||
} block_q8_0;
|
||||
static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
|
||||
|
||||
static __global__ void dequantize_block_q4_0(const void * vx, float * y) {
|
||||
const block_q4_0 * x = (const block_q4_0 *) vx;
|
||||
|
||||
@@ -131,6 +155,80 @@ static __global__ void dequantize_block_q4_3(const void * vx, float * y) {
|
||||
}
|
||||
}
|
||||
|
||||
static __global__ void dequantize_block_q5_0(const void * vx, float * y) {
|
||||
const block_q5_0 * x = (const block_q5_0 *) vx;
|
||||
|
||||
const int i = blockIdx.x;
|
||||
|
||||
const float d = x[i].d;
|
||||
|
||||
const uint8_t * pp = x[i].qs;
|
||||
|
||||
uint32_t qh;
|
||||
memcpy(&qh, x[i].qh, sizeof(qh));
|
||||
|
||||
for (int l = 0; l < QK5_0; l += 2) {
|
||||
const uint8_t vi = pp[l/2];
|
||||
|
||||
const int8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
|
||||
const int8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;
|
||||
|
||||
const int8_t vi0 = ((vi & 0xf) | vh0);
|
||||
const int8_t vi1 = ((vi >> 4) | vh1);
|
||||
|
||||
const float v0 = (vi0 - 16)*d;
|
||||
const float v1 = (vi1 - 16)*d;
|
||||
|
||||
y[i*QK5_0 + l + 0] = v0;
|
||||
y[i*QK5_0 + l + 1] = v1;
|
||||
}
|
||||
}
|
||||
|
||||
static __global__ void dequantize_block_q5_1(const void * vx, float * y) {
|
||||
const block_q5_1 * x = (const block_q5_1 *) vx;
|
||||
|
||||
const int i = blockIdx.x;
|
||||
|
||||
const float d = x[i].d;
|
||||
const float m = x[i].m;
|
||||
|
||||
const uint8_t * pp = x[i].qs;
|
||||
|
||||
const uint32_t qh = x[i].qh;
|
||||
|
||||
for (int l = 0; l < QK5_1; l += 2) {
|
||||
const uint8_t vi = pp[l/2];
|
||||
|
||||
const int8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
|
||||
const int8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;
|
||||
|
||||
const int8_t vi0 = (vi & 0xf) | vh0;
|
||||
const int8_t vi1 = (vi >> 4) | vh1;
|
||||
|
||||
const float v0 = vi0*d + m;
|
||||
const float v1 = vi1*d + m;
|
||||
|
||||
y[i*QK5_1 + l + 0] = v0;
|
||||
y[i*QK5_1 + l + 1] = v1;
|
||||
}
|
||||
}
|
||||
|
||||
static __global__ void dequantize_block_q8_0(const void * vx, float * y) {
|
||||
const block_q8_0 * x = (const block_q8_0 *) vx;
|
||||
|
||||
const int i = blockIdx.x;
|
||||
|
||||
const float d = x[i].d;
|
||||
|
||||
const int8_t * pp = x[i].qs;
|
||||
|
||||
for (int l = 0; l < QK8_0; l++) {
|
||||
const int8_t vi = pp[l];
|
||||
|
||||
y[i*QK8_0 + l] = vi*d;
|
||||
}
|
||||
}
|
||||
|
||||
void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
|
||||
const int nb = k / QK4_0;
|
||||
dequantize_block_q4_0<<<nb, 1, 0, stream>>>(vx, y);
|
||||
@@ -151,6 +249,21 @@ void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t st
|
||||
dequantize_block_q4_3<<<nb, 1, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
|
||||
const int nb = k / QK5_0;
|
||||
dequantize_block_q5_0<<<nb, 1, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
|
||||
const int nb = k / QK5_1;
|
||||
dequantize_block_q5_1<<<nb, 1, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
|
||||
const int nb = k / QK8_0;
|
||||
dequantize_block_q8_0<<<nb, 1, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
// buffer pool for cuda
|
||||
#define MAX_CUDA_BUFFERS 16
|
||||
|
||||
|
||||
@@ -35,6 +35,9 @@ void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t st
|
||||
void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
|
||||
void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream);
|
||||
void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t stream);
|
||||
void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
|
||||
void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
|
||||
void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
||||
llama.cpp (346 changes)
@@ -54,7 +54,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
|
||||
{ MODEL_7B, 512ull * MB },
|
||||
{ MODEL_13B, 512ull * MB },
|
||||
{ MODEL_30B, 512ull * MB },
|
||||
{ MODEL_65B, 512ull * MB },
|
||||
{ MODEL_65B, 1024ull * MB },
|
||||
};
|
||||
return _MEM_REQ_SCRATCH0;
|
||||
}
|
||||
@@ -65,7 +65,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
|
||||
{ MODEL_7B, 512ull * MB },
|
||||
{ MODEL_13B, 512ull * MB },
|
||||
{ MODEL_30B, 512ull * MB },
|
||||
{ MODEL_65B, 512ull * MB },
|
||||
{ MODEL_65B, 1024ull * MB },
|
||||
};
|
||||
return _MEM_REQ_SCRATCH1;
|
||||
}
|
||||
@@ -484,6 +484,9 @@ struct llama_file_loader {
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_TYPE_Q4_2:
|
||||
case GGML_TYPE_Q4_3:
|
||||
case GGML_TYPE_Q5_0:
|
||||
case GGML_TYPE_Q5_1:
|
||||
case GGML_TYPE_Q8_0:
|
||||
break;
|
||||
default: {
|
||||
throw format("unrecognized tensor type %u\n", shard.type);
|
||||
@@ -558,6 +561,9 @@ struct llama_file_saver {
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_TYPE_Q4_2:
|
||||
case GGML_TYPE_Q4_3:
|
||||
case GGML_TYPE_Q5_0:
|
||||
case GGML_TYPE_Q5_1:
|
||||
case GGML_TYPE_Q8_0:
|
||||
break;
|
||||
default: LLAMA_ASSERT(false);
|
||||
}
|
||||
@@ -848,6 +854,9 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
|
||||
return "mostly Q4_1, some F16";
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_3: return "mostly Q4_3";
|
||||
case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
|
||||
case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
|
||||
case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
|
||||
default: return "unknown, may not work";
|
||||
}
|
||||
}
|
||||
@@ -1585,6 +1594,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_3: quantized_type = GGML_TYPE_Q4_3; break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
|
||||
default: throw format("invalid output file type %d\n", ftype);
|
||||
};
|
||||
|
||||
@@ -2072,35 +2084,198 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the KV cache that will contain the context for the
|
||||
// ongoing prediction with the model.
|
||||
const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
|
||||
return ctx->model.kv_self.buf.addr;
|
||||
}
|
||||
|
||||
// Returns the size of the KV cache
|
||||
size_t llama_get_kv_cache_size(struct llama_context * ctx) {
|
||||
return ctx->model.kv_self.buf.size;
|
||||
}
|
||||
|
||||
int llama_get_kv_cache_token_count(struct llama_context * ctx) {
|
||||
return ctx->model.kv_self.n;
|
||||
}
|
||||
|
||||
// Sets the KV cache containing the current context for the model
|
||||
void llama_set_kv_cache(
|
||||
struct llama_context * ctx,
|
||||
const uint8_t * kv_cache,
|
||||
size_t n_size,
|
||||
int n_token_count) {
|
||||
// Make sure we have the same kv cache setup
|
||||
LLAMA_ASSERT(ctx->model.kv_self.buf.size == n_size);
|
||||
void * k_data = ctx->model.kv_self.k->data; // remember data pointers
|
||||
void * v_data = ctx->model.kv_self.v->data; // because their value is stored in buf and overwritten by memcpy
|
||||
memcpy(ctx->model.kv_self.buf.addr, kv_cache, n_size);
|
||||
ctx->model.kv_self.k->data = k_data; // restore correct data pointers
|
||||
ctx->model.kv_self.v->data = v_data;
|
||||
ctx->model.kv_self.n = n_token_count;
|
||||
#define LLAMA_MAX_RNG_STATE 64*1024
|
||||
|
||||
void llama_set_rng_seed(struct llama_context * ctx, int seed) {
|
||||
if (seed <= 0) {
|
||||
seed = time(NULL);
|
||||
}
|
||||
ctx->rng.seed(seed);
|
||||
}
|
||||
|
||||
// Returns the size of the state
|
||||
size_t llama_get_state_size(struct llama_context * ctx) {
|
||||
// we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
|
||||
// for reference, std::mt19937(1337) serializes to 6701 bytes.
|
||||
const size_t s_rng_size = sizeof(size_t);
|
||||
const size_t s_rng = LLAMA_MAX_RNG_STATE;
|
||||
const size_t s_logits_capacity = sizeof(size_t);
|
||||
const size_t s_logits_size = sizeof(size_t);
|
||||
const size_t s_logits = ctx->logits.capacity() * sizeof(float);
|
||||
const size_t s_embedding_size = sizeof(size_t);
|
||||
const size_t s_embedding = ctx->embedding.size() * sizeof(float);
|
||||
const size_t s_kv_size = sizeof(size_t);
|
||||
const size_t s_kv_ntok = sizeof(int);
|
||||
const size_t s_kv = ctx->model.kv_self.buf.size;
|
||||
|
||||
const size_t s_total = (
|
||||
+ s_rng_size
|
||||
+ s_rng
|
||||
+ s_logits_capacity
|
||||
+ s_logits_size
|
||||
+ s_logits
|
||||
+ s_embedding_size
|
||||
+ s_embedding
|
||||
+ s_kv_size
|
||||
+ s_kv_ntok
|
||||
+ s_kv
|
||||
);
|
||||
|
||||
return s_total;
|
||||
}
|
||||
|
||||
// Copies the state to the specified destination address
|
||||
size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
|
||||
uint8_t * out = dest;
|
||||
|
||||
// copy rng
|
||||
{
|
||||
std::stringstream rng_ss;
|
||||
rng_ss << ctx->rng;
|
||||
|
||||
const size_t rng_size = rng_ss.str().size();
|
||||
char rng_buf[LLAMA_MAX_RNG_STATE];
|
||||
|
||||
memset(&rng_buf[0], 0, LLAMA_MAX_RNG_STATE);
|
||||
memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size());
|
||||
|
||||
memcpy(out, &rng_size, sizeof(rng_size)); out += sizeof(rng_size);
|
||||
memcpy(out, &rng_buf[0], LLAMA_MAX_RNG_STATE); out += LLAMA_MAX_RNG_STATE;
|
||||
}
|
||||
|
||||
// copy logits
|
||||
{
|
||||
const size_t logits_cap = ctx->logits.capacity();
|
||||
const size_t logits_size = ctx->logits.size();
|
||||
|
||||
memcpy(out, &logits_cap, sizeof(logits_cap)); out += sizeof(logits_cap);
|
||||
memcpy(out, &logits_size, sizeof(logits_size)); out += sizeof(logits_size);
|
||||
|
||||
if (logits_size) {
|
||||
memcpy(out, ctx->logits.data(), logits_size * sizeof(float));
|
||||
}
|
||||
|
||||
out += logits_cap * sizeof(float);
|
||||
}
|
||||
|
||||
// copy embeddings
|
||||
{
|
||||
const size_t embedding_size = ctx->embedding.size();
|
||||
|
||||
memcpy(out, &embedding_size, sizeof(embedding_size)); out += sizeof(embedding_size);
|
||||
|
||||
if (embedding_size) {
|
||||
memcpy(out, ctx->embedding.data(), embedding_size * sizeof(float));
|
||||
out += embedding_size * sizeof(float);
|
||||
}
|
||||
}
|
||||
|
||||
// copy kv cache
|
||||
{
|
||||
const size_t kv_size = ctx->model.kv_self.buf.size;
|
||||
const int kv_ntok = llama_get_kv_cache_token_count(ctx);
|
||||
|
||||
memcpy(out, &kv_size, sizeof(kv_size)); out += sizeof(kv_size);
|
||||
memcpy(out, &kv_ntok, sizeof(kv_ntok)); out += sizeof(kv_ntok);
|
||||
|
||||
if (kv_size) {
|
||||
memcpy(out, ctx->model.kv_self.buf.addr, kv_size); out += kv_size;
|
||||
}
|
||||
}
|
||||
|
||||
const size_t written = out - dest;
|
||||
const size_t expected = llama_get_state_size(ctx);
|
||||
|
||||
LLAMA_ASSERT(written == expected);
|
||||
|
||||
return written;
|
||||
}

// Sets the state reading from the specified source address
size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
    const uint8_t * in = src;

    // set rng
    {
        size_t rng_size;
        char   rng_buf[LLAMA_MAX_RNG_STATE];

        memcpy(&rng_size,   in, sizeof(rng_size));    in += sizeof(rng_size);
        memcpy(&rng_buf[0], in, LLAMA_MAX_RNG_STATE); in += LLAMA_MAX_RNG_STATE;

        std::stringstream rng_ss;
        rng_ss.str(std::string(&rng_buf[0], rng_size));
        rng_ss >> ctx->rng;

        LLAMA_ASSERT(rng_ss.fail() == false);
    }

    // set logits
    {
        size_t logits_cap;
        size_t logits_size;

        memcpy(&logits_cap,  in, sizeof(logits_cap));  in += sizeof(logits_cap);
        memcpy(&logits_size, in, sizeof(logits_size)); in += sizeof(logits_size);

        LLAMA_ASSERT(ctx->logits.capacity() == logits_cap);

        if (logits_size) {
            ctx->logits.resize(logits_size);
            memcpy(ctx->logits.data(), in, logits_size * sizeof(float));
        }

        in += logits_cap * sizeof(float);
    }

    // set embeddings
    {
        size_t embedding_size;

        memcpy(&embedding_size, in, sizeof(embedding_size)); in += sizeof(embedding_size);

        LLAMA_ASSERT(ctx->embedding.capacity() == embedding_size);

        if (embedding_size) {
            memcpy(ctx->embedding.data(), in, embedding_size * sizeof(float));
            in += embedding_size * sizeof(float);
        }
    }

    // set kv cache
    {
        size_t kv_size;
        int    kv_ntok;

        memcpy(&kv_size, in, sizeof(kv_size)); in += sizeof(kv_size);
        memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok);

        if (kv_size) {
            LLAMA_ASSERT(ctx->model.kv_self.buf.size == kv_size);

            void * k_data = ctx->model.kv_self.k->data; // remember data pointers
            void * v_data = ctx->model.kv_self.v->data; // because their value is stored in buf and overwritten by memcpy

            memcpy(ctx->model.kv_self.buf.addr, in, kv_size); in += kv_size;

            ctx->model.kv_self.k->data = k_data; // restore correct data pointers
            ctx->model.kv_self.v->data = v_data;
        }

        ctx->model.kv_self.n = kv_ntok;
    }

    const size_t nread    = in - src;
    const size_t expected = llama_get_state_size(ctx);

    LLAMA_ASSERT(nread == expected);

    return nread;
}
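And the matching restore side, again only a sketch under the same assumptions (the buffer must come from a context with the same model and context size, since the logits capacity and kv cache size are asserted to match):

// Sketch: read a snapshot written by the save sketch above and hand it to llama_set_state_data.
#include <cstdio>
#include <vector>
#include "llama.h"

static bool load_session_sketch(struct llama_context * ctx, const char * path) {
    FILE * fp = std::fopen(path, "rb");
    if (!fp) {
        return false;
    }
    size_t written = 0;
    if (std::fread(&written, sizeof(written), 1, fp) != 1) {
        std::fclose(fp);
        return false;
    }
    std::vector<uint8_t> state(written);
    const bool ok = std::fread(state.data(), 1, written, fp) == written;
    std::fclose(fp);
    if (ok) {
        llama_set_state_data(ctx, state.data()); // restores rng, logits, embeddings and kv cache
    }
    return ok;
}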

int llama_eval(
@@ -2256,120 +2431,3 @@ std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_te
    return ctx->model.tensors_by_name;
}

// Returns the size of the state
size_t llama_get_state_size(struct llama_context * ctx) {
    // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
    // for reference, std::mt19937(1337) serializes to 6701 bytes.
    const size_t s_rng_size        = sizeof(size_t);
    const size_t s_rng             = 64*1024;
    const size_t s_logits_capacity = sizeof(size_t);
    const size_t s_logits_size     = sizeof(size_t);
    const size_t s_logits          = ctx->logits.capacity() * sizeof(float);
    const size_t s_embedding_size  = sizeof(size_t);
    const size_t s_embedding       = ctx->embedding.size() * sizeof(float);
    const size_t s_kv_size         = sizeof(size_t);
    const size_t s_kv_ntok         = sizeof(int);
    const size_t s_kv              = llama_get_kv_cache_size(ctx);
    const size_t s_total = (
        + s_rng_size
        + s_rng
        + s_logits_capacity
        + s_logits_size
        + s_logits
        + s_embedding_size
        + s_embedding
        + s_kv_size
        + s_kv_ntok
        + s_kv
    );
    return s_total;
}
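The 64*1024 reservation is deliberately generous; a quick standalone check (illustrative only, not part of the diff) confirms the serialized engine is roughly the 6701 bytes quoted in the comment:

#include <cstdio>
#include <random>
#include <sstream>

int main() {
    std::mt19937 rng(1337);
    std::stringstream ss;
    ss << rng;                                    // same operator<< used when copying the state
    std::printf("%zu bytes\n", ss.str().size());  // ~6701 bytes, far below the 64*1024 reserved
    return 0;
}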

// Copies the state to the specified destination address
size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
    std::stringstream rng_ss;
    rng_ss << ctx->rng;
    const size_t rng_size = rng_ss.str().size();
    char rng_buf[64*1024];
    memset(&rng_buf[0], 0, 64*1024);
    memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size());
    const size_t logits_capacity = ctx->logits.capacity();
    const size_t logits_size = ctx->logits.size();
    const size_t embedding_size = ctx->embedding.size();
    const size_t kv_size = llama_get_kv_cache_size(ctx);
    const int kv_ntok = llama_get_kv_cache_token_count(ctx);

    uint8_t * out = dest;
    memcpy(out, &rng_size, sizeof(size_t)); out += sizeof(size_t);
    memcpy(out, &rng_buf[0], 64*1024); out += 64*1024;
    memcpy(out, &logits_capacity, sizeof(size_t)); out += sizeof(size_t);
    memcpy(out, &logits_size, sizeof(size_t)); out += sizeof(size_t);
    if (logits_size) {
        memcpy(out, ctx->logits.data(), logits_size * sizeof(float));
    }
    out += logits_capacity * sizeof(float);
    memcpy(out, &embedding_size, sizeof(size_t)); out += sizeof(size_t);
    if (embedding_size) {
        memcpy(out, ctx->embedding.data(), embedding_size * sizeof(float)); out += embedding_size * sizeof(float);
    }
    memcpy(out, &kv_size, sizeof(size_t)); out += sizeof(size_t);
    memcpy(out, &kv_ntok, sizeof(int)); out += sizeof(int);
    if (kv_size) {
        memcpy(out, llama_get_kv_cache(ctx), kv_size); out += kv_size;
    }
    const size_t written = out - dest;
    const size_t expected = llama_get_state_size(ctx);
    LLAMA_ASSERT(written == expected);
    return written;
}

// Sets the state reading from the specified source address
size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
    size_t rng_size;
    char rng_buf[64*1024];
    std::stringstream rng_ss;

    const uint8_t * in = src;
    memcpy(&rng_size, in, sizeof(size_t)); in += sizeof(size_t);
    memcpy(&rng_buf[0], in, 64*1024); in += 64*1024;
    rng_ss.str(std::string(&rng_buf[0], rng_size));
    rng_ss >> ctx->rng;
    LLAMA_ASSERT(rng_ss.fail() == false);

    size_t logits_capacity;
    size_t logits_size;
    size_t embedding_size;
    size_t kv_size;
    int kv_ntok;

    memcpy(&logits_capacity, in, sizeof(size_t)); in += sizeof(size_t);
    memcpy(&logits_size, in, sizeof(size_t)); in += sizeof(size_t);
    LLAMA_ASSERT(ctx->logits.capacity() == logits_capacity);
    if (logits_size) {
        ctx->logits.resize(logits_size);
        memcpy(ctx->logits.data(), in, logits_size * sizeof(float));
    }
    in += logits_capacity * sizeof(float);
    memcpy(&embedding_size, in, sizeof(size_t)); in += sizeof(size_t);
    LLAMA_ASSERT(ctx->embedding.capacity() == embedding_size);
    if (embedding_size) {
        memcpy(ctx->embedding.data(), in, embedding_size * sizeof(float));
        in += embedding_size * sizeof(float);
    }
    memcpy(&kv_size, in, sizeof(size_t)); in += sizeof(size_t);
    memcpy(&kv_ntok, in, sizeof(int)); in += sizeof(int);
    if (kv_size) {
        LLAMA_ASSERT(ctx->model.kv_self.buf.size == kv_size);
        void * k_data = ctx->model.kv_self.k->data; // remember data pointers
        void * v_data = ctx->model.kv_self.v->data; // because their value is stored in buf and overwritten by memcpy
        memcpy(ctx->model.kv_self.buf.addr, in, kv_size);
        ctx->model.kv_self.k->data = k_data; // restore correct data pointers
        ctx->model.kv_self.v->data = v_data;
        in += kv_size;
    }
    ctx->model.kv_self.n = kv_ntok;
    const size_t nread = in - src;
    const size_t expected = llama_get_state_size(ctx);
    LLAMA_ASSERT(nread == expected);
    return nread;
}

18
llama.h
@@ -74,6 +74,9 @@ extern "C" {
        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
        LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
    };

    LLAMA_API struct llama_context_params llama_context_default_params();
@@ -112,22 +115,11 @@ extern "C" {
            const char * path_base_model,
            int n_threads);

    // Returns the KV cache that will contain the context for the
    // ongoing prediction with the model.
    LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);

    // Returns the size of the KV cache
    LLAMA_API size_t llama_get_kv_cache_size(struct llama_context * ctx);

    // Returns the number of tokens in the KV cache
    LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);

    // Sets the KV cache containing the current context for the model
    LLAMA_API void llama_set_kv_cache(
            struct llama_context * ctx,
            const uint8_t * kv_cache,
            size_t n_size,
            int n_token_count);
    // Sets the current rng seed.
    LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);

    // Returns the size in bytes of the state (rng, logits, embedding and kv_cache)
    LLAMA_API size_t llama_get_state_size(struct llama_context * ctx);

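The KV-cache-specific calls removed above are subsumed by the whole-state API; a small sketch (not from the diff, names are illustrative) of how llama_set_rng_seed combines with llama_set_state_data to replay a saved point deterministically:

// Sketch: rewind to a previously captured state and re-seed so the next sampling run repeats exactly.
// `state` is a buffer previously filled by llama_copy_state_data.
#include "llama.h"

static void rewind_and_reseed(struct llama_context * ctx, const uint8_t * state, int seed) {
    llama_set_state_data(ctx, state);
    llama_set_rng_seed(ctx, seed);
}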
6
scripts/sync-ggml.sh
Executable file
@@ -0,0 +1,6 @@
#!/bin/bash

cp -rpv ../ggml/src/ggml.c ./ggml.c
cp -rpv ../ggml/src/ggml-cuda.cu ./ggml-cuda.cu
cp -rpv ../ggml/src/ggml-cuda.h ./ggml-cuda.h
cp -rpv ../ggml/include/ggml/ggml.h ./ggml.h
@@ -36,7 +36,7 @@ float array_rmse(const float * a1, const float * a2, size_t n) {

// Total quantization error on test data
float total_quantization_error(quantize_fns_t & qfns, size_t test_size, const float * test_data) {
    std::vector<uint8_t> tmp_q(test_size);
    std::vector<uint8_t> tmp_q(2*test_size);
    std::vector<float> tmp_out(test_size);

    qfns.quantize_row_q(test_data, tmp_q.data(), test_size);
@@ -46,7 +46,7 @@ float total_quantization_error(quantize_fns_t & qfns, size_t test_size, const fl

// Total quantization error on test data
float reference_quantization_error(quantize_fns_t & qfns, size_t test_size, const float * test_data) {
    std::vector<uint8_t> tmp_q(test_size);
    std::vector<uint8_t> tmp_q(2*test_size);
    std::vector<float> tmp_out(test_size);
    std::vector<float> tmp_out_ref(test_size);

@@ -69,10 +69,10 @@ float dot_product(const float * a1, const float * a2, size_t test_size) {

// Total dot product error
float dot_product_error(quantize_fns_t & qfns, size_t test_size, const float * test_data1, const float *test_data2) {
    std::vector<uint8_t> tmp_q1(test_size);
    std::vector<uint8_t> tmp_q2(test_size*2);
    std::vector<uint8_t> tmp_q1(2*test_size);
    std::vector<uint8_t> tmp_q2(2*test_size);

    qfns.quantize_row_q(test_data1, tmp_q1.data(), test_size);
    qfns.quantize_row_q (test_data1, tmp_q1.data(), test_size);
    qfns.quantize_row_q_dot(test_data2, tmp_q2.data(), test_size);

    float result = INFINITY;
@@ -125,7 +125,7 @@ int main(int argc, char * argv[]) {
        failed = !(total_error < MAX_QUANTIZATION_TOTAL_ERROR);
        num_failed += failed;
        if (failed || verbose) {
            printf("%5s absolute quantization error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], total_error);
            printf("%5s absolute quantization error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], total_error);
        }

        const float reference_error = reference_quantization_error(qfns, test_size, test_data.data());
@@ -139,7 +139,7 @@ int main(int argc, char * argv[]) {
        failed = !(vec_dot_error < MAX_DOT_PRODUCT_ERROR);
        num_failed += failed;
        if (failed || verbose) {
            printf("%5s dot product error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], vec_dot_error);
            printf("%5s dot product error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], vec_dot_error);
        }
    }
}
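For context on why the scratch buffers above grow to 2*test_size (a rough sketch with assumed block sizes, not code from the diff): once formats that spend more than one byte per value are tested, a quantized row is no longer guaranteed to fit in test_size bytes, e.g. a Q8_0-style block of 32 values carrying a float scale:

#include <cstddef>
#include <cstdio>

int main() {
    const size_t test_size   = 32 * 128;                    // floats per test row (assumed)
    const size_t block_size  = 32;                          // values per quantization block (assumed)
    const size_t block_bytes = sizeof(float) + block_size;  // 4-byte scale + 1 byte per value
    const size_t q_bytes     = (test_size / block_size) * block_bytes;
    std::printf("%zu quantized bytes vs %zu-byte buffer\n", q_bytes, 2 * test_size);
    // 4608 vs 8192: more than test_size (4096) bytes, but comfortably inside 2*test_size.
    return 0;
}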