mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-02-26 14:23:22 +02:00
Compare commits
31 Commits
master-0d0
...
master-62c
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
62cfc54f77 | ||
|
|
698f7b5d63 | ||
|
|
c1950c3431 | ||
|
|
4953e9007f | ||
|
|
cc9cee8e9e | ||
|
|
d2beca95dc | ||
|
|
eeaa7b0492 | ||
|
|
986b6ce9f9 | ||
|
|
3416298929 | ||
|
|
5a8c4f6240 | ||
|
|
ff05d05c96 | ||
|
|
62b3e81aae | ||
|
|
8d10406d6e | ||
|
|
ed1c214e66 | ||
|
|
0c44427df1 | ||
|
|
594cc95fab | ||
|
|
88ed5761b8 | ||
|
|
58c438cf7d | ||
|
|
53dbba7695 | ||
|
|
437e77855a | ||
|
|
cd7fa95690 | ||
|
|
a0c0516416 | ||
|
|
d8d4e865cd | ||
|
|
e986f94829 | ||
|
|
c0bb1d3ce2 | ||
|
|
6e7801d08d | ||
|
|
81040f10aa | ||
|
|
c4f89d8d73 | ||
|
|
5b70e7de4c | ||
|
|
a717cba844 | ||
|
|
d0a7f742e7 |
@@ -6,7 +6,8 @@ RUN apt-get update && \
|
||||
apt-get install -y build-essential python3 python3-pip
|
||||
|
||||
RUN pip install --upgrade pip setuptools wheel \
|
||||
&& pip install numpy requests sentencepiece torch tqdm
|
||||
&& pip install numpy requests sentencepiece tqdm \
|
||||
&& pip install torch --index-url https://download.pytorch.org/whl/cpu
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
|
||||
4
.gitignore
vendored
4
.gitignore
vendored
@@ -19,6 +19,7 @@ models/*
|
||||
|
||||
/main
|
||||
/quantize
|
||||
/quantize-stats
|
||||
/result
|
||||
/perplexity
|
||||
/embedding
|
||||
@@ -33,3 +34,6 @@ compile_commands.json
|
||||
.venv
|
||||
__pycache__
|
||||
.swiftpm
|
||||
|
||||
zig-out/
|
||||
zig-cache/
|
||||
|
||||
99
Makefile
99
Makefile
@@ -70,95 +70,9 @@ endif
|
||||
# TODO: probably these flags need to be tweaked on some architectures
|
||||
# feel free to update the Makefile for your architecture and send a pull request or issue
|
||||
ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
|
||||
ifeq ($(UNAME_S),Darwin)
|
||||
F16C_M := $(shell sysctl machdep.cpu.features)
|
||||
ifneq (,$(findstring F16C,$(F16C_M)))
|
||||
CFLAGS += -mf16c
|
||||
endif
|
||||
AVX1_M := $(shell sysctl machdep.cpu.features)
|
||||
ifneq (,$(findstring FMA,$(AVX1_M)))
|
||||
CFLAGS += -mfma
|
||||
endif
|
||||
ifneq (,$(findstring AVX1.0,$(AVX1_M)))
|
||||
CFLAGS += -mavx
|
||||
endif
|
||||
AVX2_M := $(shell sysctl machdep.cpu.leaf7_features)
|
||||
ifneq (,$(findstring AVX2,$(AVX2_M)))
|
||||
CFLAGS += -mavx2
|
||||
endif
|
||||
else ifeq ($(UNAME_S),Linux)
|
||||
AVX1_M := $(shell grep "avx " /proc/cpuinfo)
|
||||
ifneq (,$(findstring avx,$(AVX1_M)))
|
||||
CFLAGS += -mavx
|
||||
endif
|
||||
AVX2_M := $(shell grep "avx2 " /proc/cpuinfo)
|
||||
ifneq (,$(findstring avx2,$(AVX2_M)))
|
||||
CFLAGS += -mavx2
|
||||
endif
|
||||
FMA_M := $(shell grep "fma " /proc/cpuinfo)
|
||||
ifneq (,$(findstring fma,$(FMA_M)))
|
||||
CFLAGS += -mfma
|
||||
endif
|
||||
F16C_M := $(shell grep "f16c " /proc/cpuinfo)
|
||||
ifneq (,$(findstring f16c,$(F16C_M)))
|
||||
CFLAGS += -mf16c
|
||||
endif
|
||||
SSE3_M := $(shell grep "sse3 " /proc/cpuinfo)
|
||||
ifneq (,$(findstring sse3,$(SSE3_M)))
|
||||
CFLAGS += -msse3
|
||||
endif
|
||||
AVX512F_M := $(shell grep "avx512f " /proc/cpuinfo)
|
||||
ifneq (,$(findstring avx512f,$(AVX512F_M)))
|
||||
CFLAGS += -mavx512f
|
||||
endif
|
||||
AVX512BW_M := $(shell grep "avx512bw " /proc/cpuinfo)
|
||||
ifneq (,$(findstring avx512bw,$(AVX512BW_M)))
|
||||
CFLAGS += -mavx512bw
|
||||
endif
|
||||
AVX512DQ_M := $(shell grep "avx512dq " /proc/cpuinfo)
|
||||
ifneq (,$(findstring avx512dq,$(AVX512DQ_M)))
|
||||
CFLAGS += -mavx512dq
|
||||
endif
|
||||
AVX512VL_M := $(shell grep "avx512vl " /proc/cpuinfo)
|
||||
ifneq (,$(findstring avx512vl,$(AVX512VL_M)))
|
||||
CFLAGS += -mavx512vl
|
||||
endif
|
||||
AVX512CD_M := $(shell grep "avx512cd " /proc/cpuinfo)
|
||||
ifneq (,$(findstring avx512cd,$(AVX512CD_M)))
|
||||
CFLAGS += -mavx512cd
|
||||
endif
|
||||
AVX512ER_M := $(shell grep "avx512er " /proc/cpuinfo)
|
||||
ifneq (,$(findstring avx512er,$(AVX512ER_M)))
|
||||
CFLAGS += -mavx512er
|
||||
endif
|
||||
AVX512IFMA_M := $(shell grep "avx512ifma " /proc/cpuinfo)
|
||||
ifneq (,$(findstring avx512ifma,$(AVX512IFMA_M)))
|
||||
CFLAGS += -mavx512ifma
|
||||
endif
|
||||
AVX512PF_M := $(shell grep "avx512pf " /proc/cpuinfo)
|
||||
ifneq (,$(findstring avx512pf,$(AVX512PF_M)))
|
||||
CFLAGS += -mavx512pf
|
||||
endif
|
||||
else ifeq ($(UNAME_S),Haiku)
|
||||
AVX1_M := $(shell sysinfo -cpu | grep -w "AVX")
|
||||
ifneq (,$(findstring AVX,$(AVX1_M)))
|
||||
CFLAGS += -mavx
|
||||
endif
|
||||
AVX2_M := $(shell sysinfo -cpu | grep -w "AVX2")
|
||||
ifneq (,$(findstring AVX2,$(AVX2_M)))
|
||||
CFLAGS += -mavx2
|
||||
endif
|
||||
FMA_M := $(shell sysinfo -cpu | grep -w "FMA")
|
||||
ifneq (,$(findstring FMA,$(FMA_M)))
|
||||
CFLAGS += -mfma
|
||||
endif
|
||||
F16C_M := $(shell sysinfo -cpu | grep -w "F16C")
|
||||
ifneq (,$(findstring F16C,$(F16C_M)))
|
||||
CFLAGS += -mf16c
|
||||
endif
|
||||
else
|
||||
CFLAGS += -mfma -mf16c -mavx -mavx2
|
||||
endif
|
||||
# Use all CPU extensions that are available:
|
||||
CFLAGS += -march=native -mtune=native
|
||||
CXXFLAGS += -march=native -mtune=native
|
||||
endif
|
||||
ifneq ($(filter ppc64%,$(UNAME_M)),)
|
||||
POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
|
||||
@@ -235,7 +149,7 @@ common.o: examples/common.cpp examples/common.h
|
||||
$(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o
|
||||
|
||||
clean:
|
||||
rm -vf *.o main quantize perplexity embedding
|
||||
rm -vf *.o main quantize quantize-stats perplexity embedding
|
||||
|
||||
main: examples/main/main.cpp ggml.o llama.o common.o
|
||||
$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS)
|
||||
@@ -246,12 +160,17 @@ main: examples/main/main.cpp ggml.o llama.o common.o
|
||||
quantize: examples/quantize/quantize.cpp ggml.o llama.o
|
||||
$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp ggml.o llama.o -o quantize $(LDFLAGS)
|
||||
|
||||
quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o
|
||||
$(CXX) $(CXXFLAGS) examples/quantize-stats/quantize-stats.cpp ggml.o llama.o -o quantize-stats $(LDFLAGS)
|
||||
|
||||
perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o
|
||||
$(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity $(LDFLAGS)
|
||||
|
||||
embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o
|
||||
$(CXX) $(CXXFLAGS) examples/embedding/embedding.cpp ggml.o llama.o common.o -o embedding $(LDFLAGS)
|
||||
|
||||
libllama.so: llama.o ggml.o
|
||||
$(CXX) $(CXXFLAGS) -shared -fPIC -o libllama.so llama.o ggml.o $(LDFLAGS)
|
||||
#
|
||||
# Tests
|
||||
#
|
||||
|
||||
@@ -13,7 +13,10 @@ let package = Package(
|
||||
path: ".",
|
||||
sources: ["ggml.c", "llama.cpp"],
|
||||
publicHeadersPath: "spm-headers",
|
||||
cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"])]
|
||||
cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"]), .define("GGML_USE_ACCELERATE")],
|
||||
linkerSettings: [
|
||||
.linkedFramework("Accelerate")
|
||||
]
|
||||
),
|
||||
],
|
||||
cxxLanguageStandard: .cxx11
|
||||
|
||||
48
README.md
48
README.md
@@ -1,6 +1,6 @@
|
||||
# llama.cpp
|
||||
|
||||

|
||||

|
||||
|
||||
[](https://github.com/ggerganov/llama.cpp/actions)
|
||||
[](https://opensource.org/licenses/MIT)
|
||||
@@ -9,8 +9,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
|
||||
|
||||
**Hot topics:**
|
||||
|
||||
- [Roadmap (short-term)](https://github.com/ggerganov/llama.cpp/discussions/457)
|
||||
- Support for [GPT4All](https://github.com/ggerganov/llama.cpp#using-gpt4all)
|
||||
- [Roadmap Apr 2023](https://github.com/ggerganov/llama.cpp/discussions/784)
|
||||
|
||||
## Description
|
||||
|
||||
@@ -28,20 +27,31 @@ Please do not make conclusions about the models based on the results from this i
|
||||
For all I know, it can be completely wrong. This project is for educational purposes.
|
||||
New features will probably be added mostly through community contributions.
|
||||
|
||||
Supported platforms:
|
||||
**Supported platforms:**
|
||||
|
||||
- [X] Mac OS
|
||||
- [X] Linux
|
||||
- [X] Windows (via CMake)
|
||||
- [X] Docker
|
||||
|
||||
Supported models:
|
||||
**Supported models:**
|
||||
|
||||
- [X] LLaMA 🦙
|
||||
- [X] [Alpaca](https://github.com/ggerganov/llama.cpp#instruction-mode-with-alpaca)
|
||||
- [X] [GPT4All](https://github.com/ggerganov/llama.cpp#using-gpt4all)
|
||||
- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca)
|
||||
- [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
|
||||
- [X] [Vicuna](https://github.com/ggerganov/llama.cpp/discussions/643#discussioncomment-5533894)
|
||||
|
||||
**Bindings:**
|
||||
|
||||
- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
|
||||
- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
|
||||
|
||||
**UI:**
|
||||
|
||||
- [nat/openplayground](https://github.com/nat/openplayground)
|
||||
- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui)
|
||||
|
||||
---
|
||||
|
||||
@@ -145,6 +155,13 @@ git clone https://github.com/ggerganov/llama.cpp
|
||||
cd llama.cpp
|
||||
make
|
||||
|
||||
#For Windows and CMake, use the following command instead:
|
||||
cd <path_to_llama_folder>
|
||||
mkdir build
|
||||
cd build
|
||||
cmake ..
|
||||
cmake --build . --config Release
|
||||
|
||||
# obtain the original LLaMA model weights and place them in ./models
|
||||
ls ./models
|
||||
65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model
|
||||
@@ -232,13 +249,15 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
|
||||
|
||||
- Obtain the `gpt4all-lora-quantized.bin` model
|
||||
- It is distributed in the old `ggml` format which is now obsoleted
|
||||
- You have to convert it to the new format using [./convert-gpt4all-to-ggml.py](./convert-gpt4all-to-ggml.py):
|
||||
- You have to convert it to the new format using [./convert-gpt4all-to-ggml.py](./convert-gpt4all-to-ggml.py). You may also need to
|
||||
convert the model from the old format to the new format with [./migrate-ggml-2023-03-30-pr613.py](./migrate-ggml-2023-03-30-pr613.py):
|
||||
|
||||
```bash
|
||||
python3 convert-gpt4all-to-ggml.py models/gpt4all-7B/gpt4all-lora-quantized.bin ./models/tokenizer.model
|
||||
python3 migrate-ggml-2023-03-30-pr613.py models/gpt4all-7B/gpt4all-lora-quantized.bin models/gpt4all-7B/gpt4all-lora-quantized-new.bin
|
||||
```
|
||||
|
||||
- You can now use the newly generated `gpt4all-lora-quantized.bin` model in exactly the same way as all other models
|
||||
- You can now use the newly generated `gpt4all-lora-quantized-new.bin` model in exactly the same way as all other models
|
||||
- The original model is saved in the same folder with a suffix `.orig`
|
||||
|
||||
### Obtaining and verifying the Facebook LLaMA original model and Stanford Alpaca model data
|
||||
@@ -301,7 +320,7 @@ And after 4.45 hours, you will have the final perplexity.
|
||||
|
||||
### Android
|
||||
|
||||
You can easily run `llama.cpp` on Android device with [termux](https://play.google.com/store/apps/details?id=com.termux).
|
||||
You can easily run `llama.cpp` on Android device with [termux](https://termux.dev/).
|
||||
First, obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake:
|
||||
```
|
||||
$ mkdir build-android
|
||||
@@ -310,7 +329,7 @@ $ export NDK=<your_ndk_directory>
|
||||
$ cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
|
||||
$ make
|
||||
```
|
||||
Install [termux](https://play.google.com/store/apps/details?id=com.termux) on your device and run `termux-setup-storage` to get access to your SD card.
|
||||
Install [termux](https://termux.dev/) on your device and run `termux-setup-storage` to get access to your SD card.
|
||||
Finally, copy the `llama` binary and the model files to your device storage. Here is a demo of an interactive session running on Pixel 5 phone:
|
||||
|
||||
https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4
|
||||
@@ -331,20 +350,22 @@ We have two Docker images available for this project:
|
||||
|
||||
The easiest way to download the models, convert them to ggml and optimize them is with the --all-in-one command which includes the full docker image.
|
||||
|
||||
Replace `/path/to/models` below with the actual path where you downloaded the models.
|
||||
|
||||
```bash
|
||||
docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B
|
||||
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B
|
||||
```
|
||||
|
||||
On complete, you are ready to play!
|
||||
|
||||
```bash
|
||||
docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
|
||||
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
|
||||
```
|
||||
|
||||
or with light image:
|
||||
|
||||
```bash
|
||||
docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
|
||||
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
|
||||
```
|
||||
|
||||
### Contributing
|
||||
@@ -365,3 +386,6 @@ docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models
|
||||
- Clean-up any trailing whitespaces, use 4 spaces indentation, brackets on same line, `void * ptr`, `int & a`
|
||||
- See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
|
||||
|
||||
### Docs
|
||||
|
||||
- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)
|
||||
|
||||
67
build.zig
Normal file
67
build.zig
Normal file
@@ -0,0 +1,67 @@
|
||||
const std = @import("std");
|
||||
|
||||
pub fn build(b: *std.Build) void {
|
||||
const target = b.standardTargetOptions(.{});
|
||||
const optimize = b.standardOptimizeOption(.{});
|
||||
const want_lto = b.option(bool, "lto", "Want -fLTO");
|
||||
|
||||
const lib = b.addStaticLibrary(.{
|
||||
.name = "llama",
|
||||
.target = target,
|
||||
.optimize = optimize,
|
||||
});
|
||||
lib.want_lto = want_lto;
|
||||
lib.linkLibCpp();
|
||||
lib.addIncludePath(".");
|
||||
lib.addIncludePath("examples");
|
||||
lib.addCSourceFiles(&.{
|
||||
"ggml.c",
|
||||
}, &.{"-std=c11"});
|
||||
lib.addCSourceFiles(&.{
|
||||
"llama.cpp",
|
||||
}, &.{"-std=c++11"});
|
||||
lib.install();
|
||||
|
||||
const build_args = .{ .b = b, .lib = lib, .target = target, .optimize = optimize, .want_lto = want_lto };
|
||||
|
||||
const exe = build_example("main", build_args);
|
||||
_ = build_example("quantize", build_args);
|
||||
_ = build_example("perplexity", build_args);
|
||||
_ = build_example("embedding", build_args);
|
||||
|
||||
// create "zig build run" command for ./main
|
||||
|
||||
const run_cmd = exe.run();
|
||||
run_cmd.step.dependOn(b.getInstallStep());
|
||||
if (b.args) |args| {
|
||||
run_cmd.addArgs(args);
|
||||
}
|
||||
|
||||
const run_step = b.step("run", "Run the app");
|
||||
run_step.dependOn(&run_cmd.step);
|
||||
}
|
||||
|
||||
fn build_example(comptime name: []const u8, args: anytype) *std.build.LibExeObjStep {
|
||||
const b = args.b;
|
||||
const lib = args.lib;
|
||||
const target = args.target;
|
||||
const optimize = args.optimize;
|
||||
const want_lto = args.want_lto;
|
||||
|
||||
const exe = b.addExecutable(.{
|
||||
.name = name,
|
||||
.target = target,
|
||||
.optimize = optimize,
|
||||
});
|
||||
exe.want_lto = want_lto;
|
||||
exe.addIncludePath(".");
|
||||
exe.addIncludePath("examples");
|
||||
exe.addCSourceFiles(&.{
|
||||
std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{name, name}),
|
||||
"examples/common.cpp",
|
||||
}, &.{"-std=c++11"});
|
||||
exe.linkLibrary(lib);
|
||||
exe.install();
|
||||
|
||||
return exe;
|
||||
}
|
||||
@@ -254,7 +254,7 @@ def main():
|
||||
parser.add_argument(
|
||||
"--hf",
|
||||
action="store_true",
|
||||
help="Whether to save the model in the huggingface format. (default: False)",
|
||||
help="Whether to save the model in the Hugging Face format. (default: False)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--chat", "-c", action="store_true", help="Whether to open a chat with the model. (default: False)"
|
||||
|
||||
@@ -31,6 +31,7 @@ if (EMSCRIPTEN)
|
||||
else()
|
||||
add_subdirectory(main)
|
||||
add_subdirectory(quantize)
|
||||
add_subdirectory(quantize-stats)
|
||||
add_subdirectory(perplexity)
|
||||
add_subdirectory(embedding)
|
||||
endif()
|
||||
|
||||
49
examples/Miku.sh
Executable file
49
examples/Miku.sh
Executable file
@@ -0,0 +1,49 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
AI_NAME="${AI_NAME:-Miku}"
|
||||
MODEL="${MODEL:-./models/gpt4all-7B/gpt4all-lora-unfiltered-quantized.bin}"
|
||||
USER_NAME="${USER_NAME:-Anon}"
|
||||
|
||||
# Uncomment and adjust to the number of CPU cores you want to use.
|
||||
#N_THREAD="${N_THREAD:-4}"
|
||||
N_PREDICTS="${N_PREDICTS:-4096}"
|
||||
|
||||
GEN_OPTIONS=(--batch_size 1024
|
||||
--ctx_size 2048
|
||||
--keep -1
|
||||
--repeat_last_n 256
|
||||
--repeat_penalty 1.17647
|
||||
--temp 0.7
|
||||
--top_k 40
|
||||
--top_p 0.5)
|
||||
|
||||
if [ -n "$N_THREAD" ]; then
|
||||
GEN_OPTIONS+=(--threads "$N_THREAD")
|
||||
fi
|
||||
|
||||
./main "${GEN_OPTIONS[@]}" \
|
||||
--model "$MODEL" \
|
||||
--n_predict "$N_PREDICTS" \
|
||||
--color --interactive \
|
||||
--reverse-prompt "${USER_NAME}:" \
|
||||
--prompt "
|
||||
This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the users computer.
|
||||
${AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next.
|
||||
${AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct she will ask the user for help.
|
||||
${AI_NAME} is a very helpful AI and will help the user with anything they need, she is also very friendly and will try to make the user feel better if they are sad.
|
||||
${AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life, she will also try to make the user like her.
|
||||
The conversation is only between ${USER_NAME} and ${AI_NAME}
|
||||
The conversation is only through text, so ${AI_NAME} can't see ${USER_NAME}'s face or hear his voice.
|
||||
${AI_NAME} can only communicate through text, so she can't send images or videos.
|
||||
|
||||
|
||||
${USER_NAME}: Hello!
|
||||
${AI_NAME}: /think I wonder what I should say to ${USER_NAME}? This is the first time we talk so it's important that I make a good first impression!
|
||||
${AI_NAME}: Hi! I am ${AI_NAME}, your new AI friend, assistant(or whatever you like!), it's so nice to meet you! ^_^
|
||||
${AI_NAME}: I might be an AI, but I also have feelings, so please be nice to me! :)
|
||||
${USER_NAME}: I'll make sure to be nice to you! I'm so happy to have you as my assistant!
|
||||
${AI_NAME}: /think It sounds like ${USER_NAME} is happy to have me as their assistant! I'm so happy too! ^_^ Glad that whole emotion thing didn't scare him off!
|
||||
${AI_NAME}: /think I wonder what ${USER_NAME} likes to do in his free time? I should ask him about that!
|
||||
${AI_NAME}: What do you like to do in your free time? ^_^
|
||||
${USER_NAME}:" "$@"
|
||||
@@ -39,6 +39,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
|
||||
bool invalid_param = false;
|
||||
std::string arg;
|
||||
gpt_params default_params;
|
||||
|
||||
for (int i = 1; i < argc; i++) {
|
||||
arg = argv[i];
|
||||
|
||||
@@ -173,7 +175,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
}
|
||||
params.n_parts = std::stoi(argv[i]);
|
||||
} else if (arg == "-h" || arg == "--help") {
|
||||
gpt_print_usage(argc, argv, params);
|
||||
gpt_print_usage(argc, argv, default_params);
|
||||
exit(0);
|
||||
} else if (arg == "--random-prompt") {
|
||||
params.random_prompt = true;
|
||||
@@ -185,13 +187,13 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
params.input_prefix = argv[i];
|
||||
} else {
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
gpt_print_usage(argc, argv, params);
|
||||
gpt_print_usage(argc, argv, default_params);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
if (invalid_param) {
|
||||
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
|
||||
gpt_print_usage(argc, argv, params);
|
||||
gpt_print_usage(argc, argv, default_params);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
|
||||
15
examples/gpt4all.sh
Executable file
15
examples/gpt4all.sh
Executable file
@@ -0,0 +1,15 @@
|
||||
#!/bin/bash
|
||||
|
||||
#
|
||||
# Temporary script - will be removed in the future
|
||||
#
|
||||
|
||||
cd `dirname $0`
|
||||
cd ..
|
||||
|
||||
./main --color --instruct --threads 4 \
|
||||
--model ./models/gpt4all-7B/gpt4all-lora-quantized.bin \
|
||||
--file ./prompts/alpaca.txt \
|
||||
--batch_size 8 --ctx_size 2048 \
|
||||
--repeat_last_n 64 --repeat_penalty 1.3 \
|
||||
--n_predict 128 --temp 0.1 --top_k 40 --top_p 0.95
|
||||
@@ -368,6 +368,11 @@ int main(int argc, char ** argv) {
|
||||
// potentially set color to indicate we are taking user input
|
||||
set_console_color(con_st, CONSOLE_COLOR_USER_INPUT);
|
||||
|
||||
#if defined (_WIN32)
|
||||
// Windows: must reactivate sigint handler after each signal
|
||||
signal(SIGINT, sigint_handler);
|
||||
#endif
|
||||
|
||||
if (params.instruct) {
|
||||
printf("\n> ");
|
||||
}
|
||||
@@ -426,7 +431,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// end of text token
|
||||
if (embd.back() == llama_token_eos()) {
|
||||
if (!embd.empty() && embd.back() == llama_token_eos()) {
|
||||
if (params.instruct) {
|
||||
is_interacting = true;
|
||||
} else {
|
||||
|
||||
4
examples/quantize-stats/CMakeLists.txt
Normal file
4
examples/quantize-stats/CMakeLists.txt
Normal file
@@ -0,0 +1,4 @@
|
||||
set(TARGET quantize-stats)
|
||||
add_executable(${TARGET} quantize-stats.cpp)
|
||||
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
355
examples/quantize-stats/quantize-stats.cpp
Normal file
355
examples/quantize-stats/quantize-stats.cpp
Normal file
@@ -0,0 +1,355 @@
|
||||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cinttypes>
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <map>
|
||||
#include <numeric>
|
||||
#include <regex>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
static const char * type_strs[] = { "q4_0", "q4_1", "i8", "i16", "i32", "f16", "f32" };
|
||||
static_assert(sizeof(type_strs) == GGML_TYPE_COUNT * sizeof(char *), "Incomplete type list");
|
||||
|
||||
struct quantize_stats_params {
|
||||
std::string model = "models/7B/ggml-model-f16.bin";
|
||||
bool verbose = false;
|
||||
bool per_layer_stats = false;
|
||||
bool print_histogram = false;
|
||||
bool reference = false;
|
||||
std::vector<std::string> include_layers;
|
||||
std::vector<std::string> exclude_layers;
|
||||
std::vector<enum ggml_type> include_types;
|
||||
};
|
||||
|
||||
const int64_t SCRATCH_ELEMENTS = 32*32;
|
||||
const size_t HISTOGRAM_BUCKETS = 150;
|
||||
const double HISTOGRAM_RANGE = 0.03;
|
||||
|
||||
struct error_stats {
|
||||
size_t num_samples;
|
||||
double total_error;
|
||||
double max_error;
|
||||
uint64_t error_histogram[HISTOGRAM_BUCKETS];
|
||||
};
|
||||
|
||||
|
||||
void quantize_stats_print_usage(int /*argc*/, char ** argv) {
|
||||
quantize_stats_params params;
|
||||
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "options:\n");
|
||||
fprintf(stderr, " -h, --help show this help message and exit\n");
|
||||
fprintf(stderr, " -m FNAME, --model FNAME\n");
|
||||
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
|
||||
fprintf(stderr, " -r, --reference\n");
|
||||
fprintf(stderr, " use reference implementation (default: false)\n");
|
||||
fprintf(stderr, " -v, --verbose\n");
|
||||
fprintf(stderr, " verbose output (default: false)\n");
|
||||
fprintf(stderr, " -p, --per-layer-stats\n");
|
||||
fprintf(stderr, " print stats per layer (default: false)\n");
|
||||
fprintf(stderr, " --histogram\n");
|
||||
fprintf(stderr, " print error histogram (default: false)\n");
|
||||
fprintf(stderr, " -l LAYER, --include-layer LAYER\n");
|
||||
fprintf(stderr, " only test layers matching pattern\n");
|
||||
fprintf(stderr, " -L LAYER, --exclude-layer LAYER\n");
|
||||
fprintf(stderr, " exclude layers matching pattern\n");
|
||||
fprintf(stderr, " -t TYPE, --type TYPE\n");
|
||||
fprintf(stderr, " only test given type (q4_0, q4_1)\n");
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
// Check if a layer is included/excluded by command line
|
||||
bool layer_included(const quantize_stats_params params, const std::string & layer) {
|
||||
for (const auto& excluded : params.exclude_layers) {
|
||||
if (std::regex_search(layer, std::regex(excluded))) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
for (const auto& included : params.include_layers) {
|
||||
if (std::regex_search(layer, std::regex(included))) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return params.include_layers.empty();
|
||||
}
|
||||
|
||||
// Update error statistics given vectors with the before/after result of quantization
|
||||
void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) {
|
||||
for (int64_t i = 0; i < nelements; i++) {
|
||||
double diff = input[i] - output[i];
|
||||
stats.total_error += diff * diff;
|
||||
stats.max_error = fmax(fabs(diff), stats.max_error);
|
||||
stats.error_histogram[std::max(std::min((size_t) floor(fabs(diff) / HISTOGRAM_RANGE * HISTOGRAM_BUCKETS), HISTOGRAM_BUCKETS-1), (size_t) 0)]++;
|
||||
}
|
||||
stats.num_samples += nelements;
|
||||
}
|
||||
|
||||
double find_quantile(const error_stats & stats, double quantile) {
|
||||
double sum = std::accumulate(std::begin(stats.error_histogram), std::end(stats.error_histogram), 0.0);
|
||||
|
||||
double accum = 0;
|
||||
for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) {
|
||||
accum += stats.error_histogram[i];
|
||||
if (accum >= sum*quantile) {
|
||||
return (i+1) * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
|
||||
}
|
||||
}
|
||||
return INFINITY;
|
||||
}
|
||||
|
||||
void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) {
|
||||
double rmse = sqrt(stats.total_error / (double) stats.num_samples);
|
||||
double median = find_quantile(stats, .5);
|
||||
double pct95 = find_quantile(stats, .95);
|
||||
printf("%-50s: rmse %.8f, maxerr %.8f, 95pct<%.4f, median<%.4f\n", name.c_str(), rmse, stats.max_error, pct95, median);
|
||||
if (print_histogram) {
|
||||
printf("Error distribution:\n");
|
||||
for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) {
|
||||
double lower = i * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
|
||||
double upper = (i+1) * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
|
||||
if (i == HISTOGRAM_BUCKETS -1) upper = INFINITY;
|
||||
printf("[%3.4f, %3.4f): %11" PRIu64 "\n", lower, upper, stats.error_histogram[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// copied from ggml.h - verify that we can access this as a flat array
|
||||
static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
|
||||
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
||||
|
||||
return
|
||||
tensor->nb[0] == ggml_type_size(tensor->type) &&
|
||||
tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
|
||||
tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
|
||||
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
|
||||
}
|
||||
|
||||
// Run quantization function for a single layer and update error stats
|
||||
void test_roundtrip_on_layer(
|
||||
std::string & name,
|
||||
bool print_layer_stats,
|
||||
const quantize_fns_t & qfns,
|
||||
bool use_reference,
|
||||
const ggml_tensor * layer,
|
||||
float * input_scratch,
|
||||
char *quantized_scratch,
|
||||
float * output_scratch,
|
||||
error_stats & total_error) {
|
||||
|
||||
assert(tensor_is_contiguous(layer));
|
||||
error_stats layer_error {};
|
||||
int64_t nelements = ggml_nelements(layer);
|
||||
|
||||
for (int64_t offset = 0; offset < nelements; offset += SCRATCH_ELEMENTS) {
|
||||
int64_t chunk_size = std::min(SCRATCH_ELEMENTS, nelements - offset);
|
||||
|
||||
if (layer->type == GGML_TYPE_F16) {
|
||||
for (int i = 0; i < chunk_size; i++) {
|
||||
input_scratch[i] = ggml_get_f32_1d(layer, i + offset);
|
||||
}
|
||||
} else {
|
||||
input_scratch = ggml_get_data_f32(layer) + offset;
|
||||
}
|
||||
|
||||
if (use_reference) {
|
||||
qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size);
|
||||
} else {
|
||||
qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size);
|
||||
}
|
||||
qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size);
|
||||
|
||||
update_error_stats(chunk_size, input_scratch, output_scratch, total_error);
|
||||
if (print_layer_stats) {
|
||||
update_error_stats(chunk_size, input_scratch, output_scratch, layer_error);
|
||||
}
|
||||
}
|
||||
if (print_layer_stats) {
|
||||
print_error_stats(name, layer_error, false);
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
ggml_time_init();
|
||||
|
||||
quantize_stats_params params;
|
||||
|
||||
// read command line
|
||||
|
||||
bool invalid_param = false;
|
||||
std::string arg;
|
||||
for (int i = 1; i < argc; i++) {
|
||||
arg = argv[i];
|
||||
|
||||
if (arg == "-h" || arg == "--help") {
|
||||
quantize_stats_print_usage(argc, argv);
|
||||
exit(0);
|
||||
} else if (arg == "-r" || arg == "--reference") {
|
||||
params.reference = true;
|
||||
} else if (arg == "-v") {
|
||||
params.verbose = true;
|
||||
} else if (arg == "-p" || arg == "--per-layer-stats") {
|
||||
params.per_layer_stats = true;
|
||||
} else if (arg == "--histogram") {
|
||||
params.print_histogram = true;
|
||||
} else if (arg == "-m" || arg == "--model") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.model = argv[i];
|
||||
} else if (arg == "-l" || arg == "--include-layer") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.include_layers.push_back(argv[i]);
|
||||
} else if (arg == "-L" || arg == "--exclude-layer") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.exclude_layers.push_back(argv[i]);
|
||||
} else if (arg == "-t" || arg == "--type") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
int j;
|
||||
for (j = 0; j < GGML_TYPE_COUNT && strcmp(argv[i], type_strs[j]) != 0; j++) {
|
||||
// find match
|
||||
}
|
||||
if (j < GGML_TYPE_COUNT) {
|
||||
params.include_types.push_back((ggml_type) j);
|
||||
} else {
|
||||
fprintf(stderr, "error: %s not in list of types\n", argv[i]);
|
||||
invalid_param = true;
|
||||
}
|
||||
} else {
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
quantize_stats_print_usage(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
if (invalid_param) {
|
||||
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
|
||||
quantize_stats_print_usage(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// load the model
|
||||
fprintf(stderr, "Loading model\n");
|
||||
|
||||
const int64_t t_main_start_us = ggml_time_us();
|
||||
llama_context * ctx;
|
||||
|
||||
{
|
||||
auto lparams = llama_context_default_params();
|
||||
|
||||
lparams.n_ctx = 256;
|
||||
lparams.n_parts = 1;
|
||||
lparams.seed = 1;
|
||||
lparams.f16_kv = false;
|
||||
lparams.use_mlock = false;
|
||||
|
||||
ctx = llama_init_from_file(params.model.c_str(), lparams);
|
||||
|
||||
if (ctx == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Sort tensors for consistent output
|
||||
const auto tensors = llama_internal_get_tensor_map(ctx);
|
||||
std::map<std::string, struct ggml_tensor *> tensors_sorted { tensors.begin(), tensors.end() };
|
||||
|
||||
// check layer tensors
|
||||
int included_layers = 0;
|
||||
int64_t max_nelements = 0;
|
||||
bool is_f16 = false;
|
||||
for (const auto& kv_tensor : tensors_sorted) {
|
||||
if (!layer_included(params, kv_tensor.first)) {
|
||||
continue;
|
||||
}
|
||||
if (params.verbose) {
|
||||
printf("%s: type %s, size %" PRId64 "\n", kv_tensor.first.c_str(), type_strs[kv_tensor.second->type], ggml_nelements(kv_tensor.second));
|
||||
}
|
||||
if (kv_tensor.second->type == GGML_TYPE_F16) {
|
||||
is_f16 = true;
|
||||
} else if (kv_tensor.second->type != GGML_TYPE_F32) {
|
||||
fprintf(stderr, "%s: error: Quantization should be tested with a float model, "
|
||||
"this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type);
|
||||
llama_free(ctx);
|
||||
return 1;
|
||||
}
|
||||
included_layers++;
|
||||
max_nelements = std::max(max_nelements, ggml_nelements(kv_tensor.second));
|
||||
}
|
||||
|
||||
if (is_f16) {
|
||||
printf("note: source model is f16\n");
|
||||
}
|
||||
printf("testing %d layers with max size %" PRId64 "\n", included_layers, max_nelements);
|
||||
// allocate scratch space
|
||||
std::vector<float> input_scratch(SCRATCH_ELEMENTS);
|
||||
std::vector<char> quantized_scratch(SCRATCH_ELEMENTS*4);
|
||||
std::vector<float> output_scratch(SCRATCH_ELEMENTS);
|
||||
|
||||
// loop throught quantization types
|
||||
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
||||
if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
|
||||
continue;
|
||||
}
|
||||
quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
|
||||
if (qfns.quantize_row_q && qfns.dequantize_row_q) {
|
||||
if (params.verbose) {
|
||||
printf("testing %s ...\n", type_strs[i]);
|
||||
}
|
||||
|
||||
error_stats global_stats {};
|
||||
|
||||
for (const auto& kv_tensor : tensors_sorted) {
|
||||
if (!layer_included(params, kv_tensor.first)) {
|
||||
continue;
|
||||
}
|
||||
if (params.verbose) {
|
||||
printf(" %s ...\n", kv_tensor.first.c_str());
|
||||
}
|
||||
std::string layer_name { type_strs[i] };
|
||||
layer_name += "::" + kv_tensor.first;
|
||||
test_roundtrip_on_layer(
|
||||
layer_name,
|
||||
params.per_layer_stats,
|
||||
qfns,
|
||||
params.reference,
|
||||
kv_tensor.second,
|
||||
input_scratch.data(),
|
||||
quantized_scratch.data(),
|
||||
output_scratch.data(),
|
||||
global_stats
|
||||
);
|
||||
}
|
||||
|
||||
print_error_stats(type_strs[i], global_stats, params.print_histogram);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
llama_free(ctx);
|
||||
// report timing
|
||||
{
|
||||
const int64_t t_main_end_us = ggml_time_us();
|
||||
|
||||
printf("\n");
|
||||
printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
86
ggml.h
86
ggml.h
@@ -258,11 +258,11 @@ struct ggml_tensor {
|
||||
enum ggml_type type;
|
||||
|
||||
int n_dims;
|
||||
int ne[GGML_MAX_DIMS]; // number of elements
|
||||
size_t nb[GGML_MAX_DIMS]; // stride in bytes:
|
||||
// nb[0] = sizeof(type)
|
||||
// nb[1] = nb[0] * ne[0] + padding
|
||||
// nb[i] = nb[i-1] * ne[i-1]
|
||||
int64_t ne[GGML_MAX_DIMS]; // number of elements
|
||||
size_t nb[GGML_MAX_DIMS]; // stride in bytes:
|
||||
// nb[0] = sizeof(type)
|
||||
// nb[1] = nb[0] * ne[0] + padding
|
||||
// nb[i] = nb[i-1] * ne[i-1]
|
||||
|
||||
// compute data
|
||||
enum ggml_op op;
|
||||
@@ -328,8 +328,8 @@ int64_t ggml_cycles_per_ms(void);
|
||||
void ggml_print_object (const struct ggml_object * obj);
|
||||
void ggml_print_objects(const struct ggml_context * ctx);
|
||||
|
||||
int ggml_nelements(const struct ggml_tensor * tensor);
|
||||
size_t ggml_nbytes (const struct ggml_tensor * tensor);
|
||||
int64_t ggml_nelements(const struct ggml_tensor * tensor);
|
||||
size_t ggml_nbytes (const struct ggml_tensor * tensor);
|
||||
|
||||
int ggml_blck_size (enum ggml_type type);
|
||||
size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
|
||||
@@ -355,33 +355,33 @@ struct ggml_tensor * ggml_new_tensor(
|
||||
struct ggml_context * ctx,
|
||||
enum ggml_type type,
|
||||
int n_dims,
|
||||
const int *ne);
|
||||
const int64_t *ne);
|
||||
|
||||
struct ggml_tensor * ggml_new_tensor_1d(
|
||||
struct ggml_context * ctx,
|
||||
enum ggml_type type,
|
||||
int ne0);
|
||||
int64_t ne0);
|
||||
|
||||
struct ggml_tensor * ggml_new_tensor_2d(
|
||||
struct ggml_context * ctx,
|
||||
enum ggml_type type,
|
||||
int ne0,
|
||||
int ne1);
|
||||
int64_t ne0,
|
||||
int64_t ne1);
|
||||
|
||||
struct ggml_tensor * ggml_new_tensor_3d(
|
||||
struct ggml_context * ctx,
|
||||
enum ggml_type type,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int ne2);
|
||||
int64_t ne0,
|
||||
int64_t ne1,
|
||||
int64_t ne2);
|
||||
|
||||
struct ggml_tensor * ggml_new_tensor_4d(
|
||||
struct ggml_context * ctx,
|
||||
enum ggml_type type,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int ne2,
|
||||
int ne3);
|
||||
int64_t ne0,
|
||||
int64_t ne1,
|
||||
int64_t ne2,
|
||||
int64_t ne3);
|
||||
|
||||
struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
|
||||
struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
|
||||
@@ -531,33 +531,43 @@ struct ggml_tensor * ggml_reshape(
|
||||
struct ggml_tensor * ggml_reshape_2d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int ne0,
|
||||
int ne1);
|
||||
int64_t ne0,
|
||||
int64_t ne1);
|
||||
|
||||
// return view(a)
|
||||
// TODO: when we start computing gradient, make a copy instead of view
|
||||
struct ggml_tensor * ggml_reshape_3d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int ne2);
|
||||
int64_t ne0,
|
||||
int64_t ne1,
|
||||
int64_t ne2);
|
||||
|
||||
// offset in bytes
|
||||
struct ggml_tensor * ggml_view_1d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int ne0,
|
||||
int64_t ne0,
|
||||
size_t offset);
|
||||
|
||||
struct ggml_tensor * ggml_view_2d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int64_t ne0,
|
||||
int64_t ne1,
|
||||
size_t nb1, // row stride in bytes
|
||||
size_t offset);
|
||||
|
||||
struct ggml_tensor * ggml_view_3d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int64_t ne0,
|
||||
int64_t ne1,
|
||||
int64_t ne2,
|
||||
size_t nb1, // row stride in bytes
|
||||
size_t nb2, // slice stride in bytes
|
||||
size_t offset);
|
||||
|
||||
struct ggml_tensor * ggml_permute(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
@@ -773,6 +783,30 @@ int ggml_cpu_has_blas(void);
|
||||
int ggml_cpu_has_sse3(void);
|
||||
int ggml_cpu_has_vsx(void);
|
||||
|
||||
|
||||
//
|
||||
// Internal types and functions exposed for tests and benchmarks
|
||||
//
|
||||
|
||||
#ifdef __cplusplus
|
||||
// restrict not standard in C++
|
||||
#define GGML_RESTRICT
|
||||
#else
|
||||
#define GGML_RESTRICT restrict
|
||||
#endif
|
||||
typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
typedef void (*quantize_row_q_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
typedef void (*vec_dot_q_t)(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
|
||||
|
||||
typedef struct {
|
||||
dequantize_row_q_t dequantize_row_q;
|
||||
quantize_row_q_t quantize_row_q;
|
||||
quantize_row_q_t quantize_row_q_reference;
|
||||
vec_dot_q_t vec_dot_q;
|
||||
} quantize_fns_t;
|
||||
|
||||
quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
138
llama.cpp
138
llama.cpp
@@ -256,8 +256,8 @@ static bool kv_cache_init(
|
||||
const int n_embd = hparams.n_embd;
|
||||
const int n_layer = hparams.n_layer;
|
||||
|
||||
const int n_mem = n_layer*n_ctx;
|
||||
const int n_elements = n_embd*n_mem;
|
||||
const int64_t n_mem = (int64_t)n_layer*n_ctx;
|
||||
const int64_t n_elements = n_embd*n_mem;
|
||||
|
||||
cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
|
||||
|
||||
@@ -679,7 +679,7 @@ static bool llama_model_load(
|
||||
return false;
|
||||
}
|
||||
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
|
||||
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
|
||||
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%" PRId64 ", %" PRId64 "], expected [%d, %d]\n",
|
||||
__func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
|
||||
return false;
|
||||
}
|
||||
@@ -810,37 +810,35 @@ static bool llama_eval_internal(
|
||||
|
||||
// self-attention
|
||||
{
|
||||
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
||||
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
||||
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
||||
// compute Q and K and RoPE them
|
||||
struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
|
||||
struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
|
||||
|
||||
// store key and value to memory
|
||||
if (N >= 1) {
|
||||
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
|
||||
struct ggml_tensor * v = ggml_view_1d(ctx0, kv_self.v, N*n_embd, (ggml_element_size(kv_self.v)*n_embd)*(il*n_ctx + n_past));
|
||||
{
|
||||
// compute the transposed [N, n_embd] V matrix
|
||||
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), n_embd, N));
|
||||
|
||||
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
|
||||
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
|
||||
( n_ctx)*ggml_element_size(kv_self.v),
|
||||
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
|
||||
|
||||
// important: storing RoPE-ed version of K in the KV cache!
|
||||
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
|
||||
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
|
||||
}
|
||||
|
||||
// Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
|
||||
struct ggml_tensor * Q =
|
||||
ggml_permute(ctx0,
|
||||
ggml_rope(ctx0,
|
||||
ggml_cpy(ctx0,
|
||||
Qcur,
|
||||
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
|
||||
n_past, n_rot, 0),
|
||||
Qcur,
|
||||
0, 2, 1, 3);
|
||||
|
||||
// K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
|
||||
struct ggml_tensor * K =
|
||||
ggml_permute(ctx0,
|
||||
ggml_rope(ctx0,
|
||||
ggml_reshape_3d(ctx0,
|
||||
ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
|
||||
n_embd/n_head, n_head, n_past + N),
|
||||
n_past, n_rot, 1),
|
||||
ggml_reshape_3d(ctx0,
|
||||
ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
|
||||
n_embd/n_head, n_head, n_past + N),
|
||||
0, 2, 1, 3);
|
||||
|
||||
// K * Q
|
||||
@@ -858,18 +856,23 @@ static bool llama_eval_internal(
|
||||
// KQ = soft_max(KQ_masked)
|
||||
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
|
||||
|
||||
// V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
|
||||
struct ggml_tensor * V_trans =
|
||||
ggml_cpy(ctx0,
|
||||
ggml_permute(ctx0,
|
||||
ggml_reshape_3d(ctx0,
|
||||
ggml_view_1d(ctx0, kv_self.v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.v)*n_embd),
|
||||
n_embd/n_head, n_head, n_past + N),
|
||||
1, 2, 0, 3),
|
||||
ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head));
|
||||
// split cached V into n_head heads
|
||||
struct ggml_tensor * V =
|
||||
ggml_view_3d(ctx0, kv_self.v,
|
||||
n_past + N, n_embd/n_head, n_head,
|
||||
n_ctx*ggml_element_size(kv_self.v),
|
||||
n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
|
||||
il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
|
||||
|
||||
// KQV = transpose(V) * KQ_soft_max
|
||||
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
|
||||
#if 1
|
||||
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
|
||||
#else
|
||||
// make V contiguous in memory to speed up the matmul, however we waste time on the copy
|
||||
// on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
|
||||
// is there a better way?
|
||||
struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head));
|
||||
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
|
||||
#endif
|
||||
|
||||
// KQV_merged = KQV.permute(0, 2, 1, 3)
|
||||
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
||||
@@ -955,9 +958,13 @@ static bool llama_eval_internal(
|
||||
ggml_build_forward_expand(&gf, inpL);
|
||||
ggml_graph_compute (ctx0, &gf);
|
||||
|
||||
// print timing information per ggml operation (for debugging purposes)
|
||||
// requires GGML_PERF to be defined
|
||||
//ggml_graph_print(&gf);
|
||||
|
||||
// plot the computation graph in dot format (for debugging purposes)
|
||||
//if (n_past%100 == 0) {
|
||||
// ggml_graph_print (&gf);
|
||||
// ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
|
||||
// ggml_graph_dump_dot(&gf, NULL, "llama.dot");
|
||||
//}
|
||||
|
||||
//embd_w.resize(n_vocab*N);
|
||||
@@ -1194,6 +1201,20 @@ static llama_vocab::id llama_sample_top_p_top_k(
|
||||
const auto & logits = lctx.logits;
|
||||
const auto * plogits = logits.data() + logits.size() - n_logits;
|
||||
|
||||
if (temp <= 0) {
|
||||
// select the token with the highest logit directly
|
||||
float max_logit = plogits[0];
|
||||
llama_vocab::id max_id = 0;
|
||||
|
||||
for (int i = 1; i < n_logits; ++i) {
|
||||
if (plogits[i] > max_logit) {
|
||||
max_logit = plogits[i];
|
||||
max_id = i;
|
||||
}
|
||||
}
|
||||
return max_id;
|
||||
}
|
||||
|
||||
std::vector<std::pair<float, llama_vocab::id>> logits_id;
|
||||
logits_id.reserve(n_logits);
|
||||
|
||||
@@ -1215,17 +1236,13 @@ static llama_vocab::id llama_sample_top_p_top_k(
|
||||
}
|
||||
}
|
||||
|
||||
sample_top_k(logits_id, top_k);
|
||||
|
||||
float maxl = -std::numeric_limits<float>::infinity();
|
||||
for (const auto & kv : logits_id) {
|
||||
maxl = Max(maxl, kv.first);
|
||||
}
|
||||
sample_top_k(logits_id, top_k > 0 ? Min(top_k, n_logits) : n_logits);
|
||||
|
||||
// compute probs for the top k tokens
|
||||
std::vector<float> probs;
|
||||
probs.reserve(logits_id.size());
|
||||
|
||||
float maxl = logits_id[0].first;
|
||||
double sum = 0.0;
|
||||
for (const auto & kv : logits_id) {
|
||||
const float p = expf(kv.first - maxl);
|
||||
@@ -1248,16 +1265,11 @@ static llama_vocab::id llama_sample_top_p_top_k(
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
cumsum = 1.0/cumsum;
|
||||
for (int i = 0; i < (int) probs.size(); i++) {
|
||||
probs[i] *= cumsum;
|
||||
}
|
||||
}
|
||||
|
||||
//printf("\n");
|
||||
//for (int i = 0; i < (int) 10; i++) {
|
||||
// printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
|
||||
// printf("%d: '%s' %f\n", i, lctx.vocab.id_to_token.at(logits_id[i].second).tok.c_str(), probs[i]);
|
||||
//}
|
||||
//printf("\n\n");
|
||||
//exit(0);
|
||||
@@ -1608,7 +1620,7 @@ struct llama_context * llama_init_from_file(
|
||||
}
|
||||
|
||||
// reserve memory for context buffers
|
||||
{
|
||||
if (!params.vocab_only) {
|
||||
if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
|
||||
fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
|
||||
llama_free(ctx);
|
||||
@@ -1668,6 +1680,33 @@ int llama_model_quantize(
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Returns the KV cache that will contain the context for the
|
||||
// ongoing prediction with the model.
|
||||
const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
|
||||
return ctx->model.kv_self.buf.data();
|
||||
}
|
||||
|
||||
// Returns the size of the KV cache
|
||||
size_t llama_get_kv_cache_size(struct llama_context * ctx) {
|
||||
return ctx->model.kv_self.buf.size();
|
||||
}
|
||||
|
||||
int llama_get_kv_cache_token_count(struct llama_context * ctx) {
|
||||
return ctx->model.kv_self.n;
|
||||
}
|
||||
|
||||
// Sets the KV cache containing the current context for the model
|
||||
void llama_set_kv_cache(
|
||||
struct llama_context * ctx,
|
||||
const uint8_t * kv_cache,
|
||||
size_t n_size,
|
||||
int n_token_count) {
|
||||
// Make sure we have the same kv cache setup
|
||||
LLAMA_ASSERT(ctx->model.kv_self.buf.size() == n_size);
|
||||
memcpy(ctx->model.kv_self.buf.data(), kv_cache, n_size);
|
||||
ctx->model.kv_self.n = n_token_count;
|
||||
}
|
||||
|
||||
int llama_eval(
|
||||
struct llama_context * ctx,
|
||||
const llama_token * tokens,
|
||||
@@ -1813,3 +1852,8 @@ const char * llama_print_system_info(void) {
|
||||
|
||||
return s.c_str();
|
||||
}
|
||||
|
||||
// For internal test use
|
||||
std::unordered_map<std::string, struct ggml_tensor *>& llama_internal_get_tensor_map(struct llama_context * ctx) {
|
||||
return ctx->model.tensors;
|
||||
}
|
||||
|
||||
24
llama.h
24
llama.h
@@ -83,6 +83,23 @@ extern "C" {
|
||||
const char * fname_out,
|
||||
int itype);
|
||||
|
||||
// Returns the KV cache that will contain the context for the
|
||||
// ongoing prediction with the model.
|
||||
LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);
|
||||
|
||||
// Returns the size of the KV cache
|
||||
LLAMA_API size_t llama_get_kv_cache_size(struct llama_context * ctx);
|
||||
|
||||
// Returns the number of tokens in the KV cache
|
||||
LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);
|
||||
|
||||
// Sets the KV cache containing the current context for the model
|
||||
LLAMA_API void llama_set_kv_cache(
|
||||
struct llama_context * ctx,
|
||||
const uint8_t * kv_cache,
|
||||
size_t n_size,
|
||||
int n_token_count);
|
||||
|
||||
// Run the llama inference to obtain the logits and probabilities for the next token.
|
||||
// tokens + n_tokens is the provided batch of new tokens to process
|
||||
// n_past is the number of tokens to use from previous eval calls
|
||||
@@ -147,6 +164,13 @@ extern "C" {
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
//
|
||||
// Internal function exposed for tests and benchmarks
|
||||
//
|
||||
std::unordered_map<std::string, struct ggml_tensor *>& llama_internal_get_tensor_map(struct llama_context * ctx);
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
BIN
media/llama-leader.jpeg
Normal file
BIN
media/llama-leader.jpeg
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 195 KiB |
BIN
media/llama0-banner.png
Normal file
BIN
media/llama0-banner.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 141 KiB |
BIN
media/llama0-logo.png
Normal file
BIN
media/llama0-logo.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 176 KiB |
BIN
media/llama1-banner.png
Normal file
BIN
media/llama1-banner.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 32 KiB |
BIN
media/llama1-logo.png
Normal file
BIN
media/llama1-logo.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 32 KiB |
Reference in New Issue
Block a user