mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-02-26 14:23:22 +02:00
Compare commits
53 Commits
master-0ba
...
master-698
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
698f7b5d63 | ||
|
|
c1950c3431 | ||
|
|
4953e9007f | ||
|
|
cc9cee8e9e | ||
|
|
d2beca95dc | ||
|
|
eeaa7b0492 | ||
|
|
986b6ce9f9 | ||
|
|
3416298929 | ||
|
|
5a8c4f6240 | ||
|
|
ff05d05c96 | ||
|
|
62b3e81aae | ||
|
|
8d10406d6e | ||
|
|
ed1c214e66 | ||
|
|
0c44427df1 | ||
|
|
594cc95fab | ||
|
|
88ed5761b8 | ||
|
|
58c438cf7d | ||
|
|
53dbba7695 | ||
|
|
437e77855a | ||
|
|
cd7fa95690 | ||
|
|
a0c0516416 | ||
|
|
d8d4e865cd | ||
|
|
e986f94829 | ||
|
|
c0bb1d3ce2 | ||
|
|
6e7801d08d | ||
|
|
81040f10aa | ||
|
|
c4f89d8d73 | ||
|
|
5b70e7de4c | ||
|
|
a717cba844 | ||
|
|
d0a7f742e7 | ||
|
|
0d054e292e | ||
|
|
3525899277 | ||
|
|
1d08882afa | ||
|
|
02c5b27e91 | ||
|
|
cbef542879 | ||
|
|
9733104be5 | ||
|
|
3df890aef4 | ||
|
|
ee0c40dd6d | ||
|
|
6f23ba5ee2 | ||
|
|
78ca9838ee | ||
|
|
a017390358 | ||
|
|
ac184d5147 | ||
|
|
276e5b7811 | ||
|
|
d68c5dc435 | ||
|
|
64bde3ffd4 | ||
|
|
c03ae8dca1 | ||
|
|
3bcc129ba8 | ||
|
|
a4755cf288 | ||
|
|
1f0414feec | ||
|
|
77efdf5a50 | ||
|
|
ed3c680bcd | ||
|
|
9cbc404ba6 | ||
|
|
b51c717d5c |
@@ -6,7 +6,8 @@ RUN apt-get update && \
|
||||
apt-get install -y build-essential python3 python3-pip
|
||||
|
||||
RUN pip install --upgrade pip setuptools wheel \
|
||||
&& pip install numpy requests sentencepiece torch tqdm
|
||||
&& pip install numpy requests sentencepiece tqdm \
|
||||
&& pip install torch --index-url https://download.pytorch.org/whl/cpu
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
|
||||
8
.github/workflows/build.yml
vendored
8
.github/workflows/build.yml
vendored
@@ -176,7 +176,13 @@ jobs:
|
||||
if: ${{ matrix.build == 'avx512' }}
|
||||
continue-on-error: true
|
||||
run: |
|
||||
echo "TODO: check avx512f"
|
||||
cd build
|
||||
$vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath)
|
||||
$msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim()))
|
||||
$cl = $(join-path $msvc 'bin\Hostx64\x64\cl.exe')
|
||||
echo 'int main(void){unsigned int a[4];__cpuid(a,7);return !(a[1]&65536);}' >> avx512f.c
|
||||
& $cl /O2 /GS- /kernel avx512f.c /link /nodefaultlib /entry:main
|
||||
.\avx512f.exe && echo "AVX512F: YES" && ( echo HAS_AVX512F=1 >> $env:GITHUB_ENV ) || echo "AVX512F: NO"
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
|
||||
4
.gitignore
vendored
4
.gitignore
vendored
@@ -22,6 +22,7 @@ models/*
|
||||
/result
|
||||
/perplexity
|
||||
/embedding
|
||||
/Pipfile
|
||||
|
||||
arm_neon.h
|
||||
compile_commands.json
|
||||
@@ -32,3 +33,6 @@ compile_commands.json
|
||||
.venv
|
||||
__pycache__
|
||||
.swiftpm
|
||||
|
||||
zig-out/
|
||||
zig-cache/
|
||||
|
||||
@@ -68,7 +68,9 @@ option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
|
||||
# Compile flags
|
||||
#
|
||||
|
||||
set(CMAKE_CXX_STANDARD 11)
|
||||
set(CMAKE_CXX_STANDARD_REQUIRED true)
|
||||
set(CMAKE_C_STANDARD 11)
|
||||
set(CMAKE_C_STANDARD_REQUIRED true)
|
||||
set(THREADS_PREFER_PTHREAD_FLAG ON)
|
||||
find_package(Threads REQUIRED)
|
||||
@@ -253,7 +255,7 @@ endif()
|
||||
#
|
||||
|
||||
if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
|
||||
enable_testing()
|
||||
include(CTest)
|
||||
add_subdirectory(tests)
|
||||
endif ()
|
||||
|
||||
|
||||
91
Makefile
91
Makefile
@@ -70,92 +70,9 @@ endif
|
||||
# TODO: probably these flags need to be tweaked on some architectures
|
||||
# feel free to update the Makefile for your architecture and send a pull request or issue
|
||||
ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
|
||||
ifeq ($(UNAME_S),Darwin)
|
||||
CFLAGS += -mf16c
|
||||
AVX1_M := $(shell sysctl machdep.cpu.features)
|
||||
ifneq (,$(findstring FMA,$(AVX1_M)))
|
||||
CFLAGS += -mfma
|
||||
endif
|
||||
ifneq (,$(findstring AVX1.0,$(AVX1_M)))
|
||||
CFLAGS += -mavx
|
||||
endif
|
||||
AVX2_M := $(shell sysctl machdep.cpu.leaf7_features)
|
||||
ifneq (,$(findstring AVX2,$(AVX2_M)))
|
||||
CFLAGS += -mavx2
|
||||
endif
|
||||
else ifeq ($(UNAME_S),Linux)
|
||||
AVX1_M := $(shell grep "avx " /proc/cpuinfo)
|
||||
ifneq (,$(findstring avx,$(AVX1_M)))
|
||||
CFLAGS += -mavx
|
||||
endif
|
||||
AVX2_M := $(shell grep "avx2 " /proc/cpuinfo)
|
||||
ifneq (,$(findstring avx2,$(AVX2_M)))
|
||||
CFLAGS += -mavx2
|
||||
endif
|
||||
FMA_M := $(shell grep "fma " /proc/cpuinfo)
|
||||
ifneq (,$(findstring fma,$(FMA_M)))
|
||||
CFLAGS += -mfma
|
||||
endif
|
||||
F16C_M := $(shell grep "f16c " /proc/cpuinfo)
|
||||
ifneq (,$(findstring f16c,$(F16C_M)))
|
||||
CFLAGS += -mf16c
|
||||
endif
|
||||
SSE3_M := $(shell grep "sse3 " /proc/cpuinfo)
|
||||
ifneq (,$(findstring sse3,$(SSE3_M)))
|
||||
CFLAGS += -msse3
|
||||
endif
|
||||
AVX512F_M := $(shell grep "avx512f " /proc/cpuinfo)
|
||||
ifneq (,$(findstring avx512f,$(AVX512F_M)))
|
||||
CFLAGS += -mavx512f
|
||||
endif
|
||||
AVX512BW_M := $(shell grep "avx512bw " /proc/cpuinfo)
|
||||
ifneq (,$(findstring avx512bw,$(AVX512BW_M)))
|
||||
CFLAGS += -mavx512bw
|
||||
endif
|
||||
AVX512DQ_M := $(shell grep "avx512dq " /proc/cpuinfo)
|
||||
ifneq (,$(findstring avx512dq,$(AVX512DQ_M)))
|
||||
CFLAGS += -mavx512dq
|
||||
endif
|
||||
AVX512VL_M := $(shell grep "avx512vl " /proc/cpuinfo)
|
||||
ifneq (,$(findstring avx512vl,$(AVX512VL_M)))
|
||||
CFLAGS += -mavx512vl
|
||||
endif
|
||||
AVX512CD_M := $(shell grep "avx512cd " /proc/cpuinfo)
|
||||
ifneq (,$(findstring avx512cd,$(AVX512CD_M)))
|
||||
CFLAGS += -mavx512cd
|
||||
endif
|
||||
AVX512ER_M := $(shell grep "avx512er " /proc/cpuinfo)
|
||||
ifneq (,$(findstring avx512er,$(AVX512ER_M)))
|
||||
CFLAGS += -mavx512er
|
||||
endif
|
||||
AVX512IFMA_M := $(shell grep "avx512ifma " /proc/cpuinfo)
|
||||
ifneq (,$(findstring avx512ifma,$(AVX512IFMA_M)))
|
||||
CFLAGS += -mavx512ifma
|
||||
endif
|
||||
AVX512PF_M := $(shell grep "avx512pf " /proc/cpuinfo)
|
||||
ifneq (,$(findstring avx512pf,$(AVX512PF_M)))
|
||||
CFLAGS += -mavx512pf
|
||||
endif
|
||||
else ifeq ($(UNAME_S),Haiku)
|
||||
AVX1_M := $(shell sysinfo -cpu | grep -w "AVX")
|
||||
ifneq (,$(findstring AVX,$(AVX1_M)))
|
||||
CFLAGS += -mavx
|
||||
endif
|
||||
AVX2_M := $(shell sysinfo -cpu | grep -w "AVX2")
|
||||
ifneq (,$(findstring AVX2,$(AVX2_M)))
|
||||
CFLAGS += -mavx2
|
||||
endif
|
||||
FMA_M := $(shell sysinfo -cpu | grep -w "FMA")
|
||||
ifneq (,$(findstring FMA,$(FMA_M)))
|
||||
CFLAGS += -mfma
|
||||
endif
|
||||
F16C_M := $(shell sysinfo -cpu | grep -w "F16C")
|
||||
ifneq (,$(findstring F16C,$(F16C_M)))
|
||||
CFLAGS += -mf16c
|
||||
endif
|
||||
else
|
||||
CFLAGS += -mfma -mf16c -mavx -mavx2
|
||||
endif
|
||||
# Use all CPU extensions that are available:
|
||||
CFLAGS += -march=native -mtune=native
|
||||
CXXFLAGS += -march=native -mtune=native
|
||||
endif
|
||||
ifneq ($(filter ppc64%,$(UNAME_M)),)
|
||||
POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
|
||||
@@ -249,6 +166,8 @@ perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o
|
||||
embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o
|
||||
$(CXX) $(CXXFLAGS) examples/embedding/embedding.cpp ggml.o llama.o common.o -o embedding $(LDFLAGS)
|
||||
|
||||
libllama.so: llama.o ggml.o
|
||||
$(CXX) $(CXXFLAGS) -shared -fPIC -o libllama.so llama.o ggml.o $(LDFLAGS)
|
||||
#
|
||||
# Tests
|
||||
#
|
||||
|
||||
@@ -13,7 +13,10 @@ let package = Package(
|
||||
path: ".",
|
||||
sources: ["ggml.c", "llama.cpp"],
|
||||
publicHeadersPath: "spm-headers",
|
||||
cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"])]
|
||||
cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"]), .define("GGML_USE_ACCELERATE")],
|
||||
linkerSettings: [
|
||||
.linkedFramework("Accelerate")
|
||||
]
|
||||
),
|
||||
],
|
||||
cxxLanguageStandard: .cxx11
|
||||
|
||||
56
README.md
56
README.md
@@ -1,6 +1,6 @@
|
||||
# llama.cpp
|
||||
|
||||

|
||||

|
||||
|
||||
[](https://github.com/ggerganov/llama.cpp/actions)
|
||||
[](https://opensource.org/licenses/MIT)
|
||||
@@ -9,8 +9,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
|
||||
|
||||
**Hot topics:**
|
||||
|
||||
- [Roadmap (short-term)](https://github.com/ggerganov/llama.cpp/discussions/457)
|
||||
- Support for [GPT4All](https://github.com/ggerganov/llama.cpp#using-gpt4all)
|
||||
- [Roadmap Apr 2023](https://github.com/ggerganov/llama.cpp/discussions/784)
|
||||
|
||||
## Description
|
||||
|
||||
@@ -28,18 +27,31 @@ Please do not make conclusions about the models based on the results from this i
|
||||
For all I know, it can be completely wrong. This project is for educational purposes.
|
||||
New features will probably be added mostly through community contributions.
|
||||
|
||||
Supported platforms:
|
||||
**Supported platforms:**
|
||||
|
||||
- [X] Mac OS
|
||||
- [X] Linux
|
||||
- [X] Windows (via CMake)
|
||||
- [X] Docker
|
||||
|
||||
Supported models:
|
||||
**Supported models:**
|
||||
|
||||
- [X] LLaMA
|
||||
- [X] LLaMA 🦙
|
||||
- [X] [Alpaca](https://github.com/ggerganov/llama.cpp#instruction-mode-with-alpaca)
|
||||
- [X] [GPT4All](https://github.com/ggerganov/llama.cpp#using-gpt4all)
|
||||
- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca)
|
||||
- [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
|
||||
- [X] [Vicuna](https://github.com/ggerganov/llama.cpp/discussions/643#discussioncomment-5533894)
|
||||
|
||||
**Bindings:**
|
||||
|
||||
- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
|
||||
- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
|
||||
|
||||
**UI:**
|
||||
|
||||
- [nat/openplayground](https://github.com/nat/openplayground)
|
||||
- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui)
|
||||
|
||||
---
|
||||
|
||||
@@ -143,6 +155,13 @@ git clone https://github.com/ggerganov/llama.cpp
|
||||
cd llama.cpp
|
||||
make
|
||||
|
||||
#For Windows and CMake, use the following command instead:
|
||||
cd <path_to_llama_folder>
|
||||
mkdir build
|
||||
cd build
|
||||
cmake ..
|
||||
cmake --build . --config Release
|
||||
|
||||
# obtain the original LLaMA model weights and place them in ./models
|
||||
ls ./models
|
||||
65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model
|
||||
@@ -153,8 +172,8 @@ python3 -m pip install torch numpy sentencepiece
|
||||
# convert the 7B model to ggml FP16 format
|
||||
python3 convert-pth-to-ggml.py models/7B/ 1
|
||||
|
||||
# quantize the model to 4-bits
|
||||
python3 quantize.py 7B
|
||||
# quantize the model to 4-bits (using method 2 = q4_0)
|
||||
./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin 2
|
||||
|
||||
# run the inference
|
||||
./main -m ./models/7B/ggml-model-q4_0.bin -n 128
|
||||
@@ -230,13 +249,15 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
|
||||
|
||||
- Obtain the `gpt4all-lora-quantized.bin` model
|
||||
- It is distributed in the old `ggml` format which is now obsoleted
|
||||
- You have to convert it to the new format using [./convert-gpt4all-to-ggml.py](./convert-gpt4all-to-ggml.py):
|
||||
- You have to convert it to the new format using [./convert-gpt4all-to-ggml.py](./convert-gpt4all-to-ggml.py). You may also need to
|
||||
convert the model from the old format to the new format with [./migrate-ggml-2023-03-30-pr613.py](./migrate-ggml-2023-03-30-pr613.py):
|
||||
|
||||
```bash
|
||||
python3 convert-gpt4all-to-ggml.py models/gpt4all-7B/gpt4all-lora-quantized.bin ./models/tokenizer.model
|
||||
python3 migrate-ggml-2023-03-30-pr613.py models/gpt4all-7B/gpt4all-lora-quantized.bin models/gpt4all-7B/gpt4all-lora-quantized-new.bin
|
||||
```
|
||||
|
||||
- You can now use the newly generated `gpt4all-lora-quantized.bin` model in exactly the same way as all other models
|
||||
- You can now use the newly generated `gpt4all-lora-quantized-new.bin` model in exactly the same way as all other models
|
||||
- The original model is saved in the same folder with a suffix `.orig`
|
||||
|
||||
### Obtaining and verifying the Facebook LLaMA original model and Stanford Alpaca model data
|
||||
@@ -299,7 +320,7 @@ And after 4.45 hours, you will have the final perplexity.
|
||||
|
||||
### Android
|
||||
|
||||
You can easily run `llama.cpp` on Android device with [termux](https://play.google.com/store/apps/details?id=com.termux).
|
||||
You can easily run `llama.cpp` on Android device with [termux](https://termux.dev/).
|
||||
First, obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake:
|
||||
```
|
||||
$ mkdir build-android
|
||||
@@ -308,7 +329,7 @@ $ export NDK=<your_ndk_directory>
|
||||
$ cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
|
||||
$ make
|
||||
```
|
||||
Install [termux](https://play.google.com/store/apps/details?id=com.termux) on your device and run `termux-setup-storage` to get access to your SD card.
|
||||
Install [termux](https://termux.dev/) on your device and run `termux-setup-storage` to get access to your SD card.
|
||||
Finally, copy the `llama` binary and the model files to your device storage. Here is a demo of an interactive session running on Pixel 5 phone:
|
||||
|
||||
https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4
|
||||
@@ -329,20 +350,22 @@ We have two Docker images available for this project:
|
||||
|
||||
The easiest way to download the models, convert them to ggml and optimize them is with the --all-in-one command which includes the full docker image.
|
||||
|
||||
Replace `/path/to/models` below with the actual path where you downloaded the models.
|
||||
|
||||
```bash
|
||||
docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B
|
||||
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B
|
||||
```
|
||||
|
||||
On complete, you are ready to play!
|
||||
|
||||
```bash
|
||||
docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
|
||||
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
|
||||
```
|
||||
|
||||
or with light image:
|
||||
|
||||
```bash
|
||||
docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
|
||||
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
|
||||
```
|
||||
|
||||
### Contributing
|
||||
@@ -363,3 +386,6 @@ docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models
|
||||
- Clean-up any trailing whitespaces, use 4 spaces indentation, brackets on same line, `void * ptr`, `int & a`
|
||||
- See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
|
||||
|
||||
### Docs
|
||||
|
||||
- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)
|
||||
|
||||
67
build.zig
Normal file
67
build.zig
Normal file
@@ -0,0 +1,67 @@
|
||||
const std = @import("std");
|
||||
|
||||
pub fn build(b: *std.Build) void {
|
||||
const target = b.standardTargetOptions(.{});
|
||||
const optimize = b.standardOptimizeOption(.{});
|
||||
const want_lto = b.option(bool, "lto", "Want -fLTO");
|
||||
|
||||
const lib = b.addStaticLibrary(.{
|
||||
.name = "llama",
|
||||
.target = target,
|
||||
.optimize = optimize,
|
||||
});
|
||||
lib.want_lto = want_lto;
|
||||
lib.linkLibCpp();
|
||||
lib.addIncludePath(".");
|
||||
lib.addIncludePath("examples");
|
||||
lib.addCSourceFiles(&.{
|
||||
"ggml.c",
|
||||
}, &.{"-std=c11"});
|
||||
lib.addCSourceFiles(&.{
|
||||
"llama.cpp",
|
||||
}, &.{"-std=c++11"});
|
||||
lib.install();
|
||||
|
||||
const build_args = .{ .b = b, .lib = lib, .target = target, .optimize = optimize, .want_lto = want_lto };
|
||||
|
||||
const exe = build_example("main", build_args);
|
||||
_ = build_example("quantize", build_args);
|
||||
_ = build_example("perplexity", build_args);
|
||||
_ = build_example("embedding", build_args);
|
||||
|
||||
// create "zig build run" command for ./main
|
||||
|
||||
const run_cmd = exe.run();
|
||||
run_cmd.step.dependOn(b.getInstallStep());
|
||||
if (b.args) |args| {
|
||||
run_cmd.addArgs(args);
|
||||
}
|
||||
|
||||
const run_step = b.step("run", "Run the app");
|
||||
run_step.dependOn(&run_cmd.step);
|
||||
}
|
||||
|
||||
fn build_example(comptime name: []const u8, args: anytype) *std.build.LibExeObjStep {
|
||||
const b = args.b;
|
||||
const lib = args.lib;
|
||||
const target = args.target;
|
||||
const optimize = args.optimize;
|
||||
const want_lto = args.want_lto;
|
||||
|
||||
const exe = b.addExecutable(.{
|
||||
.name = name,
|
||||
.target = target,
|
||||
.optimize = optimize,
|
||||
});
|
||||
exe.want_lto = want_lto;
|
||||
exe.addIncludePath(".");
|
||||
exe.addIncludePath("examples");
|
||||
exe.addCSourceFiles(&.{
|
||||
std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{name, name}),
|
||||
"examples/common.cpp",
|
||||
}, &.{"-std=c++11"});
|
||||
exe.linkLibrary(lib);
|
||||
exe.install();
|
||||
|
||||
return exe;
|
||||
}
|
||||
@@ -27,9 +27,9 @@ def read_tokens(fin, vocab_size):
|
||||
text_len = struct.unpack("i", fin.read(4))[0]
|
||||
text_bytes = fin.read(text_len)
|
||||
try:
|
||||
text = text_bytes.decode("utf-8")
|
||||
text = text_bytes.decode()
|
||||
except UnicodeDecodeError:
|
||||
text = text_bytes.decode("utf-8", "replace")
|
||||
text = text_bytes.decode(errors="replace")
|
||||
score = struct.unpack("f", fin.read(4))[0]
|
||||
tokens.append((text, score))
|
||||
return tokens
|
||||
@@ -82,7 +82,12 @@ def read_variables(fin):
|
||||
|
||||
shape = tuple(struct.unpack("i" * n_dims, fin.read(4 * n_dims)))
|
||||
shape = shape[::-1]
|
||||
name = fin.read(name_length).decode("utf-8")
|
||||
name = fin.read(name_length).decode()
|
||||
|
||||
# ensure tensor data is aligned
|
||||
tensor_data_offset = fin.tell()
|
||||
tensor_data_offset = (tensor_data_offset + 31) & -32
|
||||
fin.seek(tensor_data_offset)
|
||||
|
||||
if ftype_cur == 2:
|
||||
# 4-bit quantized weights
|
||||
@@ -194,7 +199,7 @@ def chat(model, hparams, llama_dir):
|
||||
device = torch.device("cpu")
|
||||
llama = llama.to(device)
|
||||
|
||||
ctx = """You are AI.
|
||||
ctx = """You are AI.
|
||||
This is a dialog, where User interacts with AI. AI is helpful, kind, obedient, honest, respectful, direct, concise, should try to protect User's privacy, and knows its own limits. Also, AI must answer User and AI cannot stop the conversation by itself.
|
||||
User: Hello, AI.
|
||||
AI: Hello! How can I assist you today?
|
||||
@@ -202,11 +207,11 @@ AI: Hello! How can I assist you today?
|
||||
print(ctx.rstrip("\n"))
|
||||
while True:
|
||||
print("-" * 60)
|
||||
prompt = input(f"User: ")
|
||||
prompt = input("User: ")
|
||||
if ctx != "":
|
||||
ctx = ctx + "User: " + prompt + "\n"
|
||||
ctx = f"{ctx}User: {prompt}\n"
|
||||
else:
|
||||
ctx = prompt + "\nAI:"
|
||||
ctx = f"{prompt}\nAI:"
|
||||
|
||||
ctx = (ctx[-1920:]) if len(ctx) >= 2048 else ctx
|
||||
|
||||
@@ -231,7 +236,7 @@ AI: Hello! How can I assist you today?
|
||||
)
|
||||
s = generation_output.sequences[0]
|
||||
decoded = tokenizer.decode(s)
|
||||
ctx = decoded + "\n"
|
||||
ctx = f"{decoded}\n"
|
||||
|
||||
|
||||
def main():
|
||||
@@ -249,7 +254,7 @@ def main():
|
||||
parser.add_argument(
|
||||
"--hf",
|
||||
action="store_true",
|
||||
help="Whether to save the model in the huggingface format. (default: False)",
|
||||
help="Whether to save the model in the Hugging Face format. (default: False)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--chat", "-c", action="store_true", help="Whether to open a chat with the model. (default: False)"
|
||||
|
||||
@@ -49,7 +49,7 @@ def write_header(f_out, header):
|
||||
def write_tokens(fout, tokenizer):
|
||||
for i in range(tokenizer.vocab_size()):
|
||||
if tokenizer.is_unknown(i):
|
||||
text = " \u2047 ".encode("utf-8")
|
||||
text = " \u2047 ".encode()
|
||||
elif tokenizer.is_control(i):
|
||||
text = b""
|
||||
elif tokenizer.is_byte(i):
|
||||
@@ -60,13 +60,13 @@ def write_tokens(fout, tokenizer):
|
||||
byte_value = int(piece[3:-1], 16)
|
||||
text = struct.pack("B", byte_value)
|
||||
else:
|
||||
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
|
||||
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
|
||||
fout.write(struct.pack("i", len(text)))
|
||||
fout.write(text)
|
||||
fout.write(struct.pack("f", tokenizer.get_score(i)))
|
||||
|
||||
# TODO: GPT4All - add extra <pad> token
|
||||
text = "<pad>".encode("utf-8")
|
||||
text = "<pad>".encode()
|
||||
fout.write(struct.pack("i", len(text)))
|
||||
fout.write(text)
|
||||
fout.write(struct.pack("f", 0.0))
|
||||
|
||||
@@ -50,7 +50,7 @@ fout.write(struct.pack("i", 4))
|
||||
# This loop unchanged from convert-pth-to-ggml.py:
|
||||
for i in range(tokenizer.vocab_size()):
|
||||
if tokenizer.is_unknown(i):
|
||||
text = " \u2047 ".encode("utf-8")
|
||||
text = " \u2047 ".encode()
|
||||
elif tokenizer.is_control(i):
|
||||
text = b""
|
||||
elif tokenizer.is_byte(i):
|
||||
@@ -61,21 +61,26 @@ for i in range(tokenizer.vocab_size()):
|
||||
byte_value = int(piece[3:-1], 16)
|
||||
text = struct.pack("B", byte_value)
|
||||
else:
|
||||
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
|
||||
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
|
||||
fout.write(struct.pack("i", len(text)))
|
||||
fout.write(text)
|
||||
fout.write(struct.pack("f", tokenizer.get_score(i)))
|
||||
|
||||
def write_header(shape, dst_name, ftype_cur):
|
||||
sname = dst_name.encode('utf-8')
|
||||
sname = dst_name.encode()
|
||||
fout.write(struct.pack("iii", len(shape), len(sname), ftype_cur))
|
||||
fout.write(struct.pack("i" * len(shape), *shape[::-1]))
|
||||
fout.write(sname)
|
||||
|
||||
# ensure tensor data is aligned
|
||||
tensor_data_offset = fout.tell()
|
||||
tensor_data_offset = (tensor_data_offset + 31) & -32
|
||||
fout.seek(tensor_data_offset)
|
||||
|
||||
def convert_non_q4(src_name, dst_name):
|
||||
v = model[src_name]
|
||||
shape = v.shape
|
||||
print("Processing non-Q4 variable: " + src_name + " with shape: ", shape, " and type: ", v.dtype)
|
||||
print(f"Processing non-Q4 variable: {src_name} with shape: {shape} and type: {v.dtype}")
|
||||
if len(shape) == 1:
|
||||
print(" Converting to float32")
|
||||
v = v.to(torch.float32)
|
||||
@@ -100,7 +105,7 @@ def convert_q4(src_name, dst_name, permute=False):
|
||||
# Each int32 item is actually 8 int4 items packed together, and it's transposed.
|
||||
shape = (qweight.shape[0], qweight.shape[1] * 8)
|
||||
|
||||
print("Processing Q4 variable: " + src_name + " with shape: ", shape)
|
||||
print(f"Processing Q4 variable: {src_name} with shape: {shape}")
|
||||
|
||||
# The output format has the int4 weights in groups of 32 rather than 8.
|
||||
# It looks like this:
|
||||
@@ -163,5 +168,5 @@ for i in range(n_layer):
|
||||
|
||||
fout.close()
|
||||
|
||||
print("Done. Output file: " + fname_out)
|
||||
print("")
|
||||
print(f"Done. Output file: {fname_out}")
|
||||
print()
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
# Convert a LLaMA model checkpoint to a ggml compatible file
|
||||
# Convert a LLaMA model checkpoint to a ggjt compatible file
|
||||
#
|
||||
# Load the model using Torch
|
||||
# Iterate over all variables and write them to a binary file.
|
||||
@@ -24,8 +24,57 @@ import torch
|
||||
|
||||
from sentencepiece import SentencePieceProcessor
|
||||
|
||||
def parse_args():
|
||||
QK = 32
|
||||
|
||||
GGML_TYPE_Q4_0 = 0
|
||||
GGML_TYPE_Q4_1 = 1
|
||||
GGML_TYPE_I8 = 2
|
||||
GGML_TYPE_I16 = 3
|
||||
GGML_TYPE_I32 = 4
|
||||
GGML_TYPE_F16 = 5
|
||||
GGML_TYPE_F32 = 6
|
||||
|
||||
WTYPES = {
|
||||
0: GGML_TYPE_F32,
|
||||
1: GGML_TYPE_F16,
|
||||
2: GGML_TYPE_Q4_0,
|
||||
3: GGML_TYPE_Q4_1,
|
||||
}
|
||||
|
||||
GGML_BLCK_SIZE = {
|
||||
GGML_TYPE_Q4_0: QK,
|
||||
GGML_TYPE_Q4_1: QK,
|
||||
GGML_TYPE_I8: 1,
|
||||
GGML_TYPE_I16: 1,
|
||||
GGML_TYPE_I32: 1,
|
||||
GGML_TYPE_F16: 1,
|
||||
GGML_TYPE_F32: 1,
|
||||
}
|
||||
|
||||
GGML_TYPE_SIZE = {
|
||||
GGML_TYPE_Q4_0: 4 + QK//2,
|
||||
GGML_TYPE_Q4_1: 4*2 + QK//2,
|
||||
GGML_TYPE_I8: 1,
|
||||
GGML_TYPE_I16: 2,
|
||||
GGML_TYPE_I32: 4,
|
||||
GGML_TYPE_F16: 2,
|
||||
GGML_TYPE_F32: 4,
|
||||
}
|
||||
|
||||
def ggml_nelements(shape):
|
||||
r = 1
|
||||
for i in shape:
|
||||
r *= i
|
||||
return r
|
||||
|
||||
def ggml_nbytes(shape, ftype):
|
||||
x = ggml_nelements(shape)
|
||||
t = WTYPES[ftype]
|
||||
x *= GGML_TYPE_SIZE[t]
|
||||
x //= GGML_BLCK_SIZE[t]
|
||||
return x
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file')
|
||||
parser.add_argument('dir_model', help='directory containing the model checkpoint')
|
||||
parser.add_argument('ftype', help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1)
|
||||
@@ -33,7 +82,6 @@ def parse_args():
|
||||
return parser.parse_args()
|
||||
|
||||
def get_n_parts(dim):
|
||||
|
||||
mappings = {4096: 1, 5120: 2, 6656: 4, 8192: 8}
|
||||
n_parts = mappings.get(dim)
|
||||
if n_parts is None:
|
||||
@@ -44,30 +92,24 @@ def get_n_parts(dim):
|
||||
return n_parts
|
||||
|
||||
def load_hparams_and_tokenizer(dir_model):
|
||||
|
||||
# `dir_model` is something like `models/7B` or `models/7B/`.
|
||||
# "tokenizer.model" is expected under model's parent dir.
|
||||
# When `dir_model` is a symlink, f"{dir_model}/../tokenizer.model" would not be found.
|
||||
# Let's use the model's parent dir directly.
|
||||
model_parent_dir = os.path.dirname(os.path.normpath(dir_model))
|
||||
|
||||
fname_hparams = f"{dir_model}/params.json"
|
||||
fname_tokenizer = f"{model_parent_dir}/tokenizer.model"
|
||||
|
||||
with open(fname_hparams, "r") as f:
|
||||
hparams = json.load(f)
|
||||
print(hparams)
|
||||
|
||||
tokenizer = SentencePieceProcessor(fname_tokenizer)
|
||||
hparams.update({"vocab_size": tokenizer.vocab_size()})
|
||||
|
||||
return hparams, tokenizer
|
||||
|
||||
def write_header(fout, hparams, ftype):
|
||||
|
||||
keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
|
||||
values = [
|
||||
0x67676d66, # magic: ggmf in hex
|
||||
0x67676a74, # magic: ggjt in hex
|
||||
1, # file version
|
||||
*[hparams[key] for key in keys],
|
||||
hparams["dim"] // hparams["n_heads"], # rot (obsolete)
|
||||
@@ -76,10 +118,9 @@ def write_header(fout, hparams, ftype):
|
||||
fout.write(struct.pack("i" * len(values), *values))
|
||||
|
||||
def write_tokens(fout, tokenizer):
|
||||
|
||||
for i in range(tokenizer.vocab_size()):
|
||||
if tokenizer.is_unknown(i):
|
||||
text = " \u2047 ".encode("utf-8")
|
||||
text = " \u2047 ".encode()
|
||||
elif tokenizer.is_control(i):
|
||||
text = b""
|
||||
elif tokenizer.is_byte(i):
|
||||
@@ -90,90 +131,144 @@ def write_tokens(fout, tokenizer):
|
||||
byte_value = int(piece[3:-1], 16)
|
||||
text = struct.pack("B", byte_value)
|
||||
else:
|
||||
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
|
||||
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
|
||||
fout.write(struct.pack("i", len(text)))
|
||||
fout.write(text)
|
||||
fout.write(struct.pack("f", tokenizer.get_score(i)))
|
||||
|
||||
def process_and_write_variables(fout, model, ftype):
|
||||
|
||||
def process_and_write_variables(fout, model, ftype, part_id, n_parts):
|
||||
for name, datao in model.items():
|
||||
|
||||
if name.endswith("freqs"):
|
||||
continue
|
||||
|
||||
shape = datao.shape
|
||||
|
||||
print(f"Processing variable: {name} with shape: {shape} and type: {datao.dtype}")
|
||||
|
||||
# remove dimensions with a single element
|
||||
data = datao.numpy().squeeze()
|
||||
n_dims = len(shape)
|
||||
partshape = data.shape
|
||||
n_dims = len(data.shape)
|
||||
assert n_dims in (1, 2)
|
||||
|
||||
# default type is fp16
|
||||
print(f"Processing variable: {name} with shape: {partshape} and type: {datao.dtype}")
|
||||
|
||||
# coerce single-dimensional tensors from float16 to float32
|
||||
ftype_cur = 1
|
||||
if ftype == 0 or n_dims == 1:
|
||||
print(" Converting to float32")
|
||||
data = data.astype(np.float32)
|
||||
ftype_cur = 0
|
||||
blck_size = GGML_BLCK_SIZE[WTYPES[ftype_cur]]
|
||||
type_size = GGML_TYPE_SIZE[WTYPES[ftype_cur]]
|
||||
|
||||
# header
|
||||
sname = name.encode('utf-8')
|
||||
fout.write(struct.pack("iii", len(data.shape), len(sname), ftype_cur))
|
||||
for dim in reversed(data.shape):
|
||||
# determine dimension along which multipart tensor is sharded
|
||||
#
|
||||
# split_dim 0 regex:
|
||||
# - output.*
|
||||
# - layers.*.attention.wq.weight
|
||||
# - layers.*.attention.wk.weight
|
||||
# - layers.*.attention.wv.weight
|
||||
# - layers.*.feed_forward.w1.weight
|
||||
# - layers.*.feed_forward.w3.weight
|
||||
#
|
||||
# split_dim 1 regex:
|
||||
# - tok_embeddings.*
|
||||
# - layers.*.attention.wo.weight
|
||||
# - layers.*.feed_forward.w2.weight
|
||||
#
|
||||
if n_dims > 1:
|
||||
split_dim = 1
|
||||
if "tok_embeddings" in name:
|
||||
split_dim = 1
|
||||
elif "layers" in name:
|
||||
if "attention.wo.weight" in name:
|
||||
split_dim = 1
|
||||
elif "feed_forward.w2.weight" in name:
|
||||
split_dim = 1
|
||||
else:
|
||||
split_dim = 0
|
||||
elif "output" in name:
|
||||
split_dim = 0
|
||||
|
||||
# output tensor header
|
||||
fullshape = list(partshape)
|
||||
if n_dims > 1:
|
||||
fullshape[split_dim] *= n_parts
|
||||
sname = name.encode()
|
||||
fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur))
|
||||
for dim in reversed(fullshape):
|
||||
fout.write(struct.pack("i", dim))
|
||||
fout.write(sname)
|
||||
|
||||
# data output to file
|
||||
data.tofile(fout)
|
||||
# ensure tensor data is aligned
|
||||
tensor_data_offset = fout.tell()
|
||||
while tensor_data_offset % QK != 0:
|
||||
fout.write(struct.pack("B", 0))
|
||||
tensor_data_offset += 1
|
||||
|
||||
# output unified mappable tensor data
|
||||
if n_dims == 1 or n_parts == 1:
|
||||
# copy tensor which we thankfully received in one piece
|
||||
if part_id == 0:
|
||||
data.tofile(fout)
|
||||
elif split_dim == 0:
|
||||
# reassemble multifile tensor containing some of the rows
|
||||
rows_per_chunk = partshape[0]
|
||||
current_row = part_id * rows_per_chunk
|
||||
bytes_per_row = fullshape[1] // blck_size * type_size
|
||||
offset = current_row * bytes_per_row
|
||||
fout.seek(tensor_data_offset + offset)
|
||||
data.tofile(fout)
|
||||
elif split_dim == 1:
|
||||
# reassemble multifile tensor containing some of the cols
|
||||
cols_per_chunk = partshape[1]
|
||||
current_col = part_id * cols_per_chunk
|
||||
bytes_per_row = fullshape[1] // blck_size * type_size
|
||||
offset_current_col = current_col // blck_size * type_size
|
||||
for row in range(partshape[0]):
|
||||
offset_row = row * bytes_per_row
|
||||
offset = offset_row + offset_current_col
|
||||
fout.seek(tensor_data_offset + offset)
|
||||
data[row].tofile(fout)
|
||||
|
||||
# advance file position to next tensor
|
||||
fout.seek(tensor_data_offset + ggml_nbytes(fullshape, ftype_cur))
|
||||
|
||||
def main():
|
||||
|
||||
args = parse_args()
|
||||
dir_model = args.dir_model
|
||||
ftype = args.ftype
|
||||
ftype_str = ["f32", "f16"]
|
||||
|
||||
hparams, tokenizer = load_hparams_and_tokenizer(dir_model)
|
||||
|
||||
print(args)
|
||||
|
||||
# if only writing vocab to file
|
||||
if args.vocab_only:
|
||||
|
||||
fname_model = f"{dir_model}/consolidated.00.pth"
|
||||
fname_out = f"{dir_model}/ggml-vocab.bin"
|
||||
|
||||
print(f"Extracting only the vocab from '{fname_model}'\n")
|
||||
|
||||
|
||||
with open(fname_out, "wb") as fout:
|
||||
write_header(fout, hparams, ftype)
|
||||
write_tokens(fout, tokenizer)
|
||||
|
||||
|
||||
print(f"Done. Output file: {fname_out}\n")
|
||||
|
||||
return
|
||||
|
||||
n_parts = get_n_parts(hparams["dim"])
|
||||
fname_out = f"{dir_model}/ggml-model-{ftype_str[ftype]}.bin"
|
||||
|
||||
for p in range(n_parts):
|
||||
# we output a single file for ggml
|
||||
with open(fname_out, "wb") as fout:
|
||||
write_header(fout, hparams, ftype)
|
||||
write_tokens(fout, tokenizer)
|
||||
offset_of_tensors = fout.tell()
|
||||
# the tensors we load could be split across multiple files
|
||||
for part_id in range(n_parts):
|
||||
fout.seek(offset_of_tensors)
|
||||
print(f"Processing part {part_id+1} of {n_parts}\n")
|
||||
fname_model = f"{dir_model}/consolidated.0{part_id}.pth"
|
||||
model = torch.load(fname_model, map_location="cpu")
|
||||
process_and_write_variables(fout, model, ftype, part_id, n_parts)
|
||||
del model
|
||||
|
||||
print(f"Processing part {p+1} of {n_parts}\n")
|
||||
|
||||
fname_model = f"{dir_model}/consolidated.0{p}.pth"
|
||||
fname_out = f"{dir_model}/ggml-model-{ftype_str[ftype]}.bin{'' if p == 0 else '.' + str(p)}"
|
||||
|
||||
model = torch.load(fname_model, map_location="cpu")
|
||||
|
||||
with open(fname_out, "wb") as fout:
|
||||
write_header(fout, hparams, ftype)
|
||||
write_tokens(fout, tokenizer)
|
||||
process_and_write_variables(fout, model, ftype)
|
||||
|
||||
del model
|
||||
|
||||
print(f"Done. Output file: {fname_out}, (part {p})\n")
|
||||
print(f"Done. Output file: {fname_out}\n")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -44,7 +44,7 @@ def write_header(f_out, header):
|
||||
def write_tokens(fout, tokenizer):
|
||||
for i in range(tokenizer.vocab_size()):
|
||||
if tokenizer.is_unknown(i):
|
||||
text = " \u2047 ".encode("utf-8")
|
||||
text = " \u2047 ".encode()
|
||||
elif tokenizer.is_control(i):
|
||||
text = b""
|
||||
elif tokenizer.is_byte(i):
|
||||
@@ -55,7 +55,7 @@ def write_tokens(fout, tokenizer):
|
||||
byte_value = int(piece[3:-1], 16)
|
||||
text = struct.pack("B", byte_value)
|
||||
else:
|
||||
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
|
||||
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
|
||||
fout.write(struct.pack("i", len(text)))
|
||||
fout.write(text)
|
||||
fout.write(struct.pack("f", tokenizer.get_score(i)))
|
||||
|
||||
49
examples/Miku.sh
Executable file
49
examples/Miku.sh
Executable file
@@ -0,0 +1,49 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
AI_NAME="${AI_NAME:-Miku}"
|
||||
MODEL="${MODEL:-./models/gpt4all-7B/gpt4all-lora-unfiltered-quantized.bin}"
|
||||
USER_NAME="${USER_NAME:-Anon}"
|
||||
|
||||
# Uncomment and adjust to the number of CPU cores you want to use.
|
||||
#N_THREAD="${N_THREAD:-4}"
|
||||
N_PREDICTS="${N_PREDICTS:-4096}"
|
||||
|
||||
GEN_OPTIONS=(--batch_size 1024
|
||||
--ctx_size 2048
|
||||
--keep -1
|
||||
--repeat_last_n 256
|
||||
--repeat_penalty 1.17647
|
||||
--temp 0.7
|
||||
--top_k 40
|
||||
--top_p 0.5)
|
||||
|
||||
if [ -n "$N_THREAD" ]; then
|
||||
GEN_OPTIONS+=(--threads "$N_THREAD")
|
||||
fi
|
||||
|
||||
./main "${GEN_OPTIONS[@]}" \
|
||||
--model "$MODEL" \
|
||||
--n_predict "$N_PREDICTS" \
|
||||
--color --interactive \
|
||||
--reverse-prompt "${USER_NAME}:" \
|
||||
--prompt "
|
||||
This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the users computer.
|
||||
${AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next.
|
||||
${AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct she will ask the user for help.
|
||||
${AI_NAME} is a very helpful AI and will help the user with anything they need, she is also very friendly and will try to make the user feel better if they are sad.
|
||||
${AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life, she will also try to make the user like her.
|
||||
The conversation is only between ${USER_NAME} and ${AI_NAME}
|
||||
The conversation is only through text, so ${AI_NAME} can't see ${USER_NAME}'s face or hear his voice.
|
||||
${AI_NAME} can only communicate through text, so she can't send images or videos.
|
||||
|
||||
|
||||
${USER_NAME}: Hello!
|
||||
${AI_NAME}: /think I wonder what I should say to ${USER_NAME}? This is the first time we talk so it's important that I make a good first impression!
|
||||
${AI_NAME}: Hi! I am ${AI_NAME}, your new AI friend, assistant(or whatever you like!), it's so nice to meet you! ^_^
|
||||
${AI_NAME}: I might be an AI, but I also have feelings, so please be nice to me! :)
|
||||
${USER_NAME}: I'll make sure to be nice to you! I'm so happy to have you as my assistant!
|
||||
${AI_NAME}: /think It sounds like ${USER_NAME} is happy to have me as their assistant! I'm so happy too! ^_^ Glad that whole emotion thing didn't scare him off!
|
||||
${AI_NAME}: /think I wonder what ${USER_NAME} likes to do in his free time? I should ask him about that!
|
||||
${AI_NAME}: What do you like to do in your free time? ^_^
|
||||
${USER_NAME}:" "$@"
|
||||
@@ -39,6 +39,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
|
||||
bool invalid_param = false;
|
||||
std::string arg;
|
||||
gpt_params default_params;
|
||||
|
||||
for (int i = 1; i < argc; i++) {
|
||||
arg = argv[i];
|
||||
|
||||
@@ -66,6 +68,11 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
break;
|
||||
}
|
||||
std::ifstream file(argv[i]);
|
||||
if (!file) {
|
||||
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
|
||||
if (params.prompt.back() == '\n') {
|
||||
params.prompt.pop_back();
|
||||
@@ -168,7 +175,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
}
|
||||
params.n_parts = std::stoi(argv[i]);
|
||||
} else if (arg == "-h" || arg == "--help") {
|
||||
gpt_print_usage(argc, argv, params);
|
||||
gpt_print_usage(argc, argv, default_params);
|
||||
exit(0);
|
||||
} else if (arg == "--random-prompt") {
|
||||
params.random_prompt = true;
|
||||
@@ -180,13 +187,13 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
params.input_prefix = argv[i];
|
||||
} else {
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
gpt_print_usage(argc, argv, params);
|
||||
gpt_print_usage(argc, argv, default_params);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
if (invalid_param) {
|
||||
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
|
||||
gpt_print_usage(argc, argv, params);
|
||||
gpt_print_usage(argc, argv, default_params);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
|
||||
15
examples/gpt4all.sh
Executable file
15
examples/gpt4all.sh
Executable file
@@ -0,0 +1,15 @@
|
||||
#!/bin/bash
|
||||
|
||||
#
|
||||
# Temporary script - will be removed in the future
|
||||
#
|
||||
|
||||
cd `dirname $0`
|
||||
cd ..
|
||||
|
||||
./main --color --instruct --threads 4 \
|
||||
--model ./models/gpt4all-7B/gpt4all-lora-quantized.bin \
|
||||
--file ./prompts/alpaca.txt \
|
||||
--batch_size 8 --ctx_size 2048 \
|
||||
--repeat_last_n 64 --repeat_penalty 1.3 \
|
||||
--n_predict 128 --temp 0.1 --top_k 40 --top_p 0.95
|
||||
@@ -368,6 +368,11 @@ int main(int argc, char ** argv) {
|
||||
// potentially set color to indicate we are taking user input
|
||||
set_console_color(con_st, CONSOLE_COLOR_USER_INPUT);
|
||||
|
||||
#if defined (_WIN32)
|
||||
// Windows: must reactivate sigint handler after each signal
|
||||
signal(SIGINT, sigint_handler);
|
||||
#endif
|
||||
|
||||
if (params.instruct) {
|
||||
printf("\n> ");
|
||||
}
|
||||
@@ -426,7 +431,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// end of text token
|
||||
if (embd.back() == llama_token_eos()) {
|
||||
if (!embd.empty() && embd.back() == llama_token_eos()) {
|
||||
if (params.instruct) {
|
||||
is_interacting = true;
|
||||
} else {
|
||||
|
||||
@@ -19,7 +19,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// needed to initialize f16 tables
|
||||
{
|
||||
struct ggml_init_params params = { 0, NULL };
|
||||
struct ggml_init_params params = { 0, NULL, false };
|
||||
struct ggml_context * ctx = ggml_init(params);
|
||||
ggml_free(ctx);
|
||||
}
|
||||
|
||||
69
ggml.h
69
ggml.h
@@ -258,11 +258,11 @@ struct ggml_tensor {
|
||||
enum ggml_type type;
|
||||
|
||||
int n_dims;
|
||||
int ne[GGML_MAX_DIMS]; // number of elements
|
||||
size_t nb[GGML_MAX_DIMS]; // stride in bytes:
|
||||
// nb[0] = sizeof(type)
|
||||
// nb[1] = nb[0] * ne[0] + padding
|
||||
// nb[i] = nb[i-1] * ne[i-1]
|
||||
int64_t ne[GGML_MAX_DIMS]; // number of elements
|
||||
size_t nb[GGML_MAX_DIMS]; // stride in bytes:
|
||||
// nb[0] = sizeof(type)
|
||||
// nb[1] = nb[0] * ne[0] + padding
|
||||
// nb[i] = nb[i-1] * ne[i-1]
|
||||
|
||||
// compute data
|
||||
enum ggml_op op;
|
||||
@@ -316,6 +316,7 @@ struct ggml_init_params {
|
||||
// memory pool
|
||||
size_t mem_size; // bytes
|
||||
void * mem_buffer; // if NULL, memory will be allocated internally
|
||||
bool no_alloc; // don't allocate memory for the tensor data
|
||||
};
|
||||
|
||||
void ggml_time_init(void); // call this once at the beginning of the program
|
||||
@@ -327,8 +328,8 @@ int64_t ggml_cycles_per_ms(void);
|
||||
void ggml_print_object (const struct ggml_object * obj);
|
||||
void ggml_print_objects(const struct ggml_context * ctx);
|
||||
|
||||
int ggml_nelements(const struct ggml_tensor * tensor);
|
||||
size_t ggml_nbytes (const struct ggml_tensor * tensor);
|
||||
int64_t ggml_nelements(const struct ggml_tensor * tensor);
|
||||
size_t ggml_nbytes (const struct ggml_tensor * tensor);
|
||||
|
||||
int ggml_blck_size (enum ggml_type type);
|
||||
size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
|
||||
@@ -344,39 +345,43 @@ size_t ggml_used_mem(const struct ggml_context * ctx);
|
||||
size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
|
||||
|
||||
bool ggml_mlock_supported(void);
|
||||
bool ggml_mlock(struct ggml_context * ctx, char ** err_p);
|
||||
bool ggml_mlock(
|
||||
struct ggml_context * ctx,
|
||||
const void *opt_extra_addr,
|
||||
size_t opt_extra_len,
|
||||
char **err_p);
|
||||
|
||||
struct ggml_tensor * ggml_new_tensor(
|
||||
struct ggml_context * ctx,
|
||||
enum ggml_type type,
|
||||
int n_dims,
|
||||
const int *ne);
|
||||
const int64_t *ne);
|
||||
|
||||
struct ggml_tensor * ggml_new_tensor_1d(
|
||||
struct ggml_context * ctx,
|
||||
enum ggml_type type,
|
||||
int ne0);
|
||||
int64_t ne0);
|
||||
|
||||
struct ggml_tensor * ggml_new_tensor_2d(
|
||||
struct ggml_context * ctx,
|
||||
enum ggml_type type,
|
||||
int ne0,
|
||||
int ne1);
|
||||
int64_t ne0,
|
||||
int64_t ne1);
|
||||
|
||||
struct ggml_tensor * ggml_new_tensor_3d(
|
||||
struct ggml_context * ctx,
|
||||
enum ggml_type type,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int ne2);
|
||||
int64_t ne0,
|
||||
int64_t ne1,
|
||||
int64_t ne2);
|
||||
|
||||
struct ggml_tensor * ggml_new_tensor_4d(
|
||||
struct ggml_context * ctx,
|
||||
enum ggml_type type,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int ne2,
|
||||
int ne3);
|
||||
int64_t ne0,
|
||||
int64_t ne1,
|
||||
int64_t ne2,
|
||||
int64_t ne3);
|
||||
|
||||
struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
|
||||
struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
|
||||
@@ -526,33 +531,43 @@ struct ggml_tensor * ggml_reshape(
|
||||
struct ggml_tensor * ggml_reshape_2d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int ne0,
|
||||
int ne1);
|
||||
int64_t ne0,
|
||||
int64_t ne1);
|
||||
|
||||
// return view(a)
|
||||
// TODO: when we start computing gradient, make a copy instead of view
|
||||
struct ggml_tensor * ggml_reshape_3d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int ne2);
|
||||
int64_t ne0,
|
||||
int64_t ne1,
|
||||
int64_t ne2);
|
||||
|
||||
// offset in bytes
|
||||
struct ggml_tensor * ggml_view_1d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int ne0,
|
||||
int64_t ne0,
|
||||
size_t offset);
|
||||
|
||||
struct ggml_tensor * ggml_view_2d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int64_t ne0,
|
||||
int64_t ne1,
|
||||
size_t nb1, // row stride in bytes
|
||||
size_t offset);
|
||||
|
||||
struct ggml_tensor * ggml_view_3d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int64_t ne0,
|
||||
int64_t ne1,
|
||||
int64_t ne2,
|
||||
size_t nb1, // row stride in bytes
|
||||
size_t nb2, // slice stride in bytes
|
||||
size_t offset);
|
||||
|
||||
struct ggml_tensor * ggml_permute(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
|
||||
600
llama.cpp
600
llama.cpp
@@ -12,6 +12,19 @@
|
||||
#include <cassert>
|
||||
#include <cstring>
|
||||
|
||||
#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#include <Windows.h>
|
||||
#else
|
||||
#include <sys/types.h>
|
||||
#include <sys/mman.h>
|
||||
#include <unistd.h>
|
||||
#include <fcntl.h>
|
||||
#endif
|
||||
|
||||
#define Min(X, Y) ((Y) > (X) ? (X) : (Y))
|
||||
#define Max(X, Y) ((Y) < (X) ? (X) : (Y))
|
||||
|
||||
#define LLAMA_USE_SCRATCH
|
||||
#define LLAMA_MAX_SCRATCH_BUFFERS 16
|
||||
|
||||
@@ -142,6 +155,10 @@ struct llama_model {
|
||||
// the model memory buffer
|
||||
std::vector<uint8_t> buf;
|
||||
|
||||
// model memory mapped file
|
||||
void * mm_addr = NULL;
|
||||
uint64_t mm_length = 0;
|
||||
|
||||
// tensors
|
||||
int n_loaded;
|
||||
std::unordered_map<std::string, struct ggml_tensor *> tensors;
|
||||
@@ -165,6 +182,7 @@ struct llama_context {
|
||||
|
||||
int64_t t_load_us = 0;
|
||||
int64_t t_start_us = 0;
|
||||
bool has_evaluated_once = false;
|
||||
|
||||
int64_t t_sample_us = 0;
|
||||
int64_t t_eval_us = 0;
|
||||
@@ -206,7 +224,7 @@ struct llama_context {
|
||||
}
|
||||
|
||||
if (buf_last >= 0) {
|
||||
buf_max_size[buf_last] = std::max(buf_max_size[buf_last], last_size);
|
||||
buf_max_size[buf_last] = Max(buf_max_size[buf_last], last_size);
|
||||
}
|
||||
|
||||
buf_last = i;
|
||||
@@ -238,14 +256,15 @@ static bool kv_cache_init(
|
||||
const int n_embd = hparams.n_embd;
|
||||
const int n_layer = hparams.n_layer;
|
||||
|
||||
const int n_mem = n_layer*n_ctx;
|
||||
const int n_elements = n_embd*n_mem;
|
||||
const int64_t n_mem = (int64_t)n_layer*n_ctx;
|
||||
const int64_t n_elements = n_embd*n_mem;
|
||||
|
||||
cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
|
||||
|
||||
struct ggml_init_params params;
|
||||
params.mem_size = cache.buf.size();
|
||||
params.mem_buffer = cache.buf.data();
|
||||
params.no_alloc = false;
|
||||
|
||||
cache.ctx = ggml_init(params);
|
||||
|
||||
@@ -288,6 +307,58 @@ struct llama_context_params llama_context_default_params() {
|
||||
// model loading
|
||||
//
|
||||
|
||||
static void *mmap_file(const char *fname, uint64_t *mm_length) {
|
||||
#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
|
||||
HANDLE hFile = CreateFileA(fname,
|
||||
GENERIC_READ,
|
||||
FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
|
||||
NULL,
|
||||
OPEN_EXISTING,
|
||||
FILE_ATTRIBUTE_NORMAL | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED,
|
||||
NULL);
|
||||
if (hFile == INVALID_HANDLE_VALUE) return 0;
|
||||
LARGE_INTEGER fileSize;
|
||||
fileSize.QuadPart = -1;
|
||||
GetFileSizeEx(hFile, &fileSize);
|
||||
int64_t length = fileSize.QuadPart;
|
||||
HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
|
||||
CloseHandle(hFile);
|
||||
if (!hMapping) return 0;
|
||||
void *addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
|
||||
CloseHandle(hMapping);
|
||||
if (!addr) return 0;
|
||||
#else
|
||||
int fd = open(fname, O_RDONLY);
|
||||
if (fd == -1) return 0;
|
||||
int64_t length = lseek(fd, 0, SEEK_END);
|
||||
void *addr = mmap(NULL, length, PROT_READ, MAP_SHARED, fd, 0);
|
||||
close(fd);
|
||||
if (addr == MAP_FAILED) return 0;
|
||||
#endif
|
||||
*mm_length = length;
|
||||
return addr;
|
||||
}
|
||||
|
||||
static void munmap_file(void * addr, size_t length) {
|
||||
#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
|
||||
UnmapViewOfFile(addr);
|
||||
#else
|
||||
munmap(addr, length);
|
||||
#endif
|
||||
}
|
||||
|
||||
static bool report_bad_magic(const char *path, uint32_t got, uint32_t want) {
|
||||
fprintf(stderr,
|
||||
"%s: invalid model file (bad magic [got %#x want %#x])\n"
|
||||
"\tyou most likely need to regenerate your ggml files\n"
|
||||
"\tthe benefit is you'll get 10-100x faster load times\n"
|
||||
"\tsee https://github.com/ggerganov/llama.cpp/issues/91\n"
|
||||
"\tuse convert-pth-to-ggml.py to regenerate from original pth\n"
|
||||
"\tuse migrate-ggml-2023-03-30-pr613.py if you deleted originals\n",
|
||||
path, got, want);
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool llama_model_load(
|
||||
const std::string & fname,
|
||||
llama_context & lctx,
|
||||
@@ -299,22 +370,24 @@ static bool llama_model_load(
|
||||
void *progress_callback_user_data) {
|
||||
fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
|
||||
|
||||
const int64_t t_start_us = ggml_time_us();
|
||||
|
||||
lctx.t_start_us = t_start_us;
|
||||
|
||||
std::vector<char> f_buf(1024*1024);
|
||||
lctx.t_start_us = ggml_time_us();
|
||||
|
||||
auto & model = lctx.model;
|
||||
auto & vocab = lctx.vocab;
|
||||
|
||||
auto fin = std::ifstream(fname, std::ios::binary);
|
||||
fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
|
||||
if (!fin) {
|
||||
fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
std::vector<char> f_buf(1024*1024);
|
||||
fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
|
||||
|
||||
fin.seekg(0, fin.end);
|
||||
const size_t file_size = fin.tellg();
|
||||
fin.seekg(0);
|
||||
|
||||
// verify magic
|
||||
{
|
||||
uint32_t magic;
|
||||
@@ -325,8 +398,7 @@ static bool llama_model_load(
|
||||
return false;
|
||||
}
|
||||
if (magic != LLAMA_FILE_MAGIC) {
|
||||
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
|
||||
return false;
|
||||
return report_bad_magic(fname.c_str(), magic, LLAMA_FILE_MAGIC);
|
||||
}
|
||||
|
||||
uint32_t format_version;
|
||||
@@ -449,43 +521,24 @@ static bool llama_model_load(
|
||||
}
|
||||
}
|
||||
|
||||
// map model into memory
|
||||
char *mm_addr = NULL;
|
||||
model.mm_addr = mmap_file(fname.c_str(), &model.mm_length);
|
||||
if (model.mm_addr == NULL) {
|
||||
fprintf(stderr, "%s: failed to mmap '%s'\n", __func__, fname.c_str());
|
||||
return false;
|
||||
}
|
||||
mm_addr = (char *)model.mm_addr;
|
||||
fprintf(stderr, "%s: ggml map size = %6.2f MB\n", __func__, model.mm_length/(1024.0*1024.0));
|
||||
|
||||
auto & ctx = model.ctx;
|
||||
|
||||
size_t ctx_size = 0;
|
||||
|
||||
{
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int n_embd = hparams.n_embd;
|
||||
const auto &hparams = model.hparams;
|
||||
const int n_layer = hparams.n_layer;
|
||||
const int n_ctx = hparams.n_ctx;
|
||||
const int n_vocab = hparams.n_vocab;
|
||||
|
||||
ctx_size += n_embd*n_vocab*ggml_type_sizef(vtype); // tok_embeddings
|
||||
|
||||
ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // norm
|
||||
|
||||
ctx_size += n_embd*n_vocab*ggml_type_sizef(vtype); // output
|
||||
|
||||
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // attention_norm
|
||||
|
||||
ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wq
|
||||
ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wk
|
||||
ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wv
|
||||
ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wo
|
||||
|
||||
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ffn_norm
|
||||
|
||||
ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w1
|
||||
ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w2
|
||||
ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w3
|
||||
|
||||
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_k
|
||||
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_v
|
||||
|
||||
ctx_size += (5 + 10*n_layer)*256; // object overhead
|
||||
|
||||
fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
|
||||
fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
|
||||
}
|
||||
|
||||
// print memory requirements
|
||||
@@ -495,6 +548,7 @@ static bool llama_model_load(
|
||||
// this is the total memory required to run the inference
|
||||
const size_t mem_required =
|
||||
ctx_size +
|
||||
model.mm_length +
|
||||
MEM_REQ_SCRATCH0.at(model.type) +
|
||||
MEM_REQ_SCRATCH1.at(model.type) +
|
||||
MEM_REQ_EVAL.at (model.type);
|
||||
@@ -514,6 +568,7 @@ static bool llama_model_load(
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ lctx.model.buf.size(),
|
||||
/*.mem_buffer =*/ lctx.model.buf.data(),
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
|
||||
model.ctx = ggml_init(params);
|
||||
@@ -576,234 +631,106 @@ static bool llama_model_load(
|
||||
}
|
||||
}
|
||||
|
||||
const size_t file_offset = fin.tellg();
|
||||
|
||||
fin.close();
|
||||
|
||||
std::vector<uint8_t> tmp;
|
||||
|
||||
if (progress_callback) {
|
||||
progress_callback(0.0, progress_callback_user_data);
|
||||
}
|
||||
|
||||
for (int i = 0; i < n_parts; ++i) {
|
||||
const int part_id = i;
|
||||
//const int part_id = n_parts - i - 1;
|
||||
fprintf(stderr, "%s: loading tensors from '%s'\n", __func__, fname.c_str());
|
||||
|
||||
std::string fname_part = fname;
|
||||
if (i > 0) {
|
||||
fname_part += "." + std::to_string(i);
|
||||
}
|
||||
// load weights
|
||||
{
|
||||
size_t total_size = 0;
|
||||
model.n_loaded = 0;
|
||||
|
||||
fprintf(stderr, "%s: loading model part %d/%d from '%s'\n", __func__, i+1, n_parts, fname_part.c_str());
|
||||
while (true) {
|
||||
int32_t n_dims;
|
||||
int32_t length;
|
||||
int32_t ftype;
|
||||
|
||||
fin = std::ifstream(fname_part, std::ios::binary);
|
||||
fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
|
||||
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
|
||||
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
|
||||
fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
|
||||
|
||||
fin.seekg(0, fin.end);
|
||||
const size_t file_size = fin.tellg();
|
||||
|
||||
fin.seekg(file_offset);
|
||||
|
||||
// load weights
|
||||
{
|
||||
size_t total_size = 0;
|
||||
|
||||
model.n_loaded = 0;
|
||||
|
||||
fprintf(stderr, "%s: ", __func__);
|
||||
|
||||
while (true) {
|
||||
int32_t n_dims;
|
||||
int32_t length;
|
||||
int32_t ftype;
|
||||
|
||||
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
|
||||
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
|
||||
fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
|
||||
|
||||
if (fin.eof()) {
|
||||
break;
|
||||
}
|
||||
|
||||
int32_t nelements = 1;
|
||||
int32_t ne[2] = { 1, 1 };
|
||||
for (int i = 0; i < n_dims; ++i) {
|
||||
fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
|
||||
nelements *= ne[i];
|
||||
}
|
||||
|
||||
std::string name(length, 0);
|
||||
fin.read(&name[0], length);
|
||||
|
||||
if (model.tensors.find(name.data()) == model.tensors.end()) {
|
||||
fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
|
||||
return false;
|
||||
}
|
||||
|
||||
// split_type = 0: split by columns
|
||||
// split_type = 1: split by rows
|
||||
int split_type = 0;
|
||||
|
||||
// split_type = 0:
|
||||
// regex:
|
||||
// - tok_embeddings.*
|
||||
// - layers.*.attention.wo.weight
|
||||
// - layers.*.feed_forward.w2.weight
|
||||
|
||||
// split_type = 1:
|
||||
// regex:
|
||||
// - output.*
|
||||
// - layers.*.attention.wq.weight
|
||||
// - layers.*.attention.wk.weight
|
||||
// - layers.*.attention.wv.weight
|
||||
// - layers.*.feed_forward.w1.weight
|
||||
// - layers.*.feed_forward.w3.weight
|
||||
if (name.find("tok_embeddings") != std::string::npos) {
|
||||
split_type = 0;
|
||||
} else if (name.find("layers") != std::string::npos) {
|
||||
if (name.find("attention.wo.weight") != std::string::npos) {
|
||||
split_type = 0;
|
||||
} else if (name.find("feed_forward.w2.weight") != std::string::npos) {
|
||||
split_type = 0;
|
||||
} else {
|
||||
split_type = 1;
|
||||
}
|
||||
} else if (name.find("output") != std::string::npos) {
|
||||
split_type = 1;
|
||||
}
|
||||
|
||||
auto tensor = model.tensors[name.data()];
|
||||
|
||||
if (n_dims == 1) {
|
||||
if (ggml_nelements(tensor) != nelements) {
|
||||
fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
if (ggml_nelements(tensor)/n_parts != nelements) {
|
||||
fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (n_dims == 1) {
|
||||
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
|
||||
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
|
||||
__func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
if (split_type == 0) {
|
||||
if (tensor->ne[0]/n_parts != ne[0] || tensor->ne[1] != ne[1]) {
|
||||
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
|
||||
__func__, name.data(), tensor->ne[0]/n_parts, tensor->ne[1], ne[0], ne[1]);
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
if (tensor->ne[0] != ne[0] || tensor->ne[1]/n_parts != ne[1]) {
|
||||
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
|
||||
__func__, name.data(), tensor->ne[0], tensor->ne[1]/n_parts, ne[0], ne[1]);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (0) {
|
||||
static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
|
||||
fprintf(stderr, "%24s - [%5d, %5d], type = %6s, split = %d\n", name.data(), ne[0], ne[1], ftype_str[ftype], split_type);
|
||||
}
|
||||
|
||||
size_t bpe = 0;
|
||||
|
||||
switch (ftype) {
|
||||
case 0: bpe = ggml_type_size(GGML_TYPE_F32); break;
|
||||
case 1: bpe = ggml_type_size(GGML_TYPE_F16); break;
|
||||
case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break;
|
||||
case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break;
|
||||
default:
|
||||
{
|
||||
fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
if (n_dims == 1 || n_parts == 1) {
|
||||
if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
|
||||
fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
|
||||
__func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (part_id == 0) {
|
||||
fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
|
||||
} else {
|
||||
fin.seekg(ggml_nbytes(tensor), std::ios::cur);
|
||||
}
|
||||
|
||||
total_size += ggml_nbytes(tensor);
|
||||
} else {
|
||||
if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)/n_parts) {
|
||||
fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
|
||||
__func__, name.data(), ggml_nbytes(tensor)/n_parts, nelements*bpe);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (split_type == 0) {
|
||||
const int np0 = ne[0];
|
||||
|
||||
const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
|
||||
assert(row_size == tensor->nb[1]);
|
||||
|
||||
for (int i1 = 0; i1 < ne[1]; ++i1) {
|
||||
const size_t offset_row = i1*row_size;
|
||||
const size_t offset = offset_row + ((part_id*np0)/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
|
||||
fin.read(reinterpret_cast<char *>(tensor->data) + offset, row_size/n_parts);
|
||||
}
|
||||
} else {
|
||||
const int np1 = ne[1];
|
||||
|
||||
const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
|
||||
|
||||
for (int i1 = 0; i1 < ne[1]; ++i1) {
|
||||
const size_t offset_row = (i1 + part_id*np1)*row_size;
|
||||
fin.read(reinterpret_cast<char *>(tensor->data) + offset_row, row_size);
|
||||
}
|
||||
}
|
||||
|
||||
total_size += ggml_nbytes(tensor)/n_parts;
|
||||
}
|
||||
|
||||
//fprintf(stderr, "%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
|
||||
model.n_loaded++;
|
||||
|
||||
// progress
|
||||
if (progress_callback) {
|
||||
float current_file_progress = float(size_t(fin.tellg()) - file_offset) / float(file_size - file_offset);
|
||||
float current_progress = (float(i) + current_file_progress) / float(n_parts);
|
||||
progress_callback(current_progress, progress_callback_user_data);
|
||||
}
|
||||
if (model.n_loaded % 8 == 0) {
|
||||
fprintf(stderr, ".");
|
||||
fflush(stderr);
|
||||
}
|
||||
if (fin.eof()) {
|
||||
break;
|
||||
}
|
||||
|
||||
fprintf(stderr, " done\n");
|
||||
int32_t nelements = 1;
|
||||
int32_t ne[2] = { 1, 1 };
|
||||
for (int i = 0; i < n_dims; ++i) {
|
||||
fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
|
||||
nelements *= ne[i];
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, model.n_loaded);
|
||||
if (model.n_loaded == 0) {
|
||||
fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
|
||||
} else if (model.n_loaded != (int) model.tensors.size()) {
|
||||
fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded);
|
||||
std::string name(length, 0);
|
||||
fin.read(&name[0], length);
|
||||
|
||||
if (model.tensors.find(name.data()) == model.tensors.end()) {
|
||||
fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
|
||||
return false;
|
||||
}
|
||||
|
||||
auto tensor = model.tensors[name.data()];
|
||||
|
||||
if (ggml_nelements(tensor) != nelements) {
|
||||
fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
|
||||
return false;
|
||||
}
|
||||
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
|
||||
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%" PRId64 ", %" PRId64 "], expected [%d, %d]\n",
|
||||
__func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
|
||||
return false;
|
||||
}
|
||||
if (0) {
|
||||
static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
|
||||
fprintf(stderr, "%24s - [%5d, %5d], type = %6s\n", name.data(), ne[0], ne[1], ftype_str[ftype]);
|
||||
}
|
||||
|
||||
switch (ftype) {
|
||||
case 0: // f32
|
||||
case 1: // f16
|
||||
break;
|
||||
case 2: // q4_0
|
||||
case 3: // q4_1
|
||||
assert(ne[0] % 64 == 0);
|
||||
break;
|
||||
default:
|
||||
fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
|
||||
return false;
|
||||
};
|
||||
|
||||
// load the tensor data into memory without copying or reading it
|
||||
size_t offset = fin.tellg();
|
||||
size_t tensor_data_size = ggml_nbytes(tensor);
|
||||
offset = (offset + 31) & -32;
|
||||
tensor->data = mm_addr + offset;
|
||||
fin.seekg(offset + tensor_data_size);
|
||||
total_size += tensor_data_size;
|
||||
model.n_loaded++;
|
||||
|
||||
// progress
|
||||
if (progress_callback) {
|
||||
double current_progress = size_t(fin.tellg()) / double(file_size);
|
||||
progress_callback(current_progress, progress_callback_user_data);
|
||||
}
|
||||
}
|
||||
|
||||
fin.close();
|
||||
|
||||
fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, model.n_loaded);
|
||||
if (model.n_loaded == 0) {
|
||||
fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
|
||||
} else if (model.n_loaded != (int) model.tensors.size()) {
|
||||
fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
lctx.t_load_us = ggml_time_us() - t_start_us;
|
||||
// loading time will be recalculate after the first eval, so
|
||||
// we take page faults deferred by mmap() into consideration
|
||||
lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
|
||||
|
||||
if (progress_callback) {
|
||||
progress_callback(1.0, progress_callback_user_data);
|
||||
@@ -849,6 +776,7 @@ static bool llama_eval_internal(
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ buf_compute.size(),
|
||||
/*.mem_buffer =*/ buf_compute.data(),
|
||||
/*.no_alloc =*/ false,
|
||||
};
|
||||
|
||||
struct ggml_context * ctx0 = ggml_init(params);
|
||||
@@ -882,37 +810,35 @@ static bool llama_eval_internal(
|
||||
|
||||
// self-attention
|
||||
{
|
||||
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
||||
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
||||
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
||||
// compute Q and K and RoPE them
|
||||
struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
|
||||
struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
|
||||
|
||||
// store key and value to memory
|
||||
if (N >= 1) {
|
||||
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
|
||||
struct ggml_tensor * v = ggml_view_1d(ctx0, kv_self.v, N*n_embd, (ggml_element_size(kv_self.v)*n_embd)*(il*n_ctx + n_past));
|
||||
{
|
||||
// compute the transposed [N, n_embd] V matrix
|
||||
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), n_embd, N));
|
||||
|
||||
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
|
||||
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
|
||||
( n_ctx)*ggml_element_size(kv_self.v),
|
||||
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
|
||||
|
||||
// important: storing RoPE-ed version of K in the KV cache!
|
||||
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
|
||||
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
|
||||
}
|
||||
|
||||
// Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
|
||||
struct ggml_tensor * Q =
|
||||
ggml_permute(ctx0,
|
||||
ggml_rope(ctx0,
|
||||
ggml_cpy(ctx0,
|
||||
Qcur,
|
||||
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
|
||||
n_past, n_rot, 0),
|
||||
Qcur,
|
||||
0, 2, 1, 3);
|
||||
|
||||
// K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
|
||||
struct ggml_tensor * K =
|
||||
ggml_permute(ctx0,
|
||||
ggml_rope(ctx0,
|
||||
ggml_reshape_3d(ctx0,
|
||||
ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
|
||||
n_embd/n_head, n_head, n_past + N),
|
||||
n_past, n_rot, 1),
|
||||
ggml_reshape_3d(ctx0,
|
||||
ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
|
||||
n_embd/n_head, n_head, n_past + N),
|
||||
0, 2, 1, 3);
|
||||
|
||||
// K * Q
|
||||
@@ -930,18 +856,23 @@ static bool llama_eval_internal(
|
||||
// KQ = soft_max(KQ_masked)
|
||||
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
|
||||
|
||||
// V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
|
||||
struct ggml_tensor * V_trans =
|
||||
ggml_cpy(ctx0,
|
||||
ggml_permute(ctx0,
|
||||
ggml_reshape_3d(ctx0,
|
||||
ggml_view_1d(ctx0, kv_self.v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.v)*n_embd),
|
||||
n_embd/n_head, n_head, n_past + N),
|
||||
1, 2, 0, 3),
|
||||
ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head));
|
||||
// split cached V into n_head heads
|
||||
struct ggml_tensor * V =
|
||||
ggml_view_3d(ctx0, kv_self.v,
|
||||
n_past + N, n_embd/n_head, n_head,
|
||||
n_ctx*ggml_element_size(kv_self.v),
|
||||
n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
|
||||
il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
|
||||
|
||||
// KQV = transpose(V) * KQ_soft_max
|
||||
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
|
||||
#if 1
|
||||
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
|
||||
#else
|
||||
// make V contiguous in memory to speed up the matmul, however we waste time on the copy
|
||||
// on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
|
||||
// is there a better way?
|
||||
struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head));
|
||||
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
|
||||
#endif
|
||||
|
||||
// KQV_merged = KQV.permute(0, 2, 1, 3)
|
||||
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
||||
@@ -1027,9 +958,13 @@ static bool llama_eval_internal(
|
||||
ggml_build_forward_expand(&gf, inpL);
|
||||
ggml_graph_compute (ctx0, &gf);
|
||||
|
||||
// print timing information per ggml operation (for debugging purposes)
|
||||
// requires GGML_PERF to be defined
|
||||
//ggml_graph_print(&gf);
|
||||
|
||||
// plot the computation graph in dot format (for debugging purposes)
|
||||
//if (n_past%100 == 0) {
|
||||
// ggml_graph_print (&gf);
|
||||
// ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
|
||||
// ggml_graph_dump_dot(&gf, NULL, "llama.dot");
|
||||
//}
|
||||
|
||||
//embd_w.resize(n_vocab*N);
|
||||
@@ -1126,7 +1061,7 @@ struct llama_tokenizer {
|
||||
size_t offs = 0;
|
||||
while (offs < text.size()) {
|
||||
llama_sp_symbol sym;
|
||||
size_t char_len = std::min(text.size() - offs, utf8_len(text[offs]));
|
||||
size_t char_len = Min(text.size() - offs, utf8_len(text[offs]));
|
||||
sym.text = text.c_str() + offs;
|
||||
sym.n = char_len;
|
||||
offs += char_len;
|
||||
@@ -1266,6 +1201,20 @@ static llama_vocab::id llama_sample_top_p_top_k(
|
||||
const auto & logits = lctx.logits;
|
||||
const auto * plogits = logits.data() + logits.size() - n_logits;
|
||||
|
||||
if (temp <= 0) {
|
||||
// select the token with the highest logit directly
|
||||
float max_logit = plogits[0];
|
||||
llama_vocab::id max_id = 0;
|
||||
|
||||
for (int i = 1; i < n_logits; ++i) {
|
||||
if (plogits[i] > max_logit) {
|
||||
max_logit = plogits[i];
|
||||
max_id = i;
|
||||
}
|
||||
}
|
||||
return max_id;
|
||||
}
|
||||
|
||||
std::vector<std::pair<float, llama_vocab::id>> logits_id;
|
||||
logits_id.reserve(n_logits);
|
||||
|
||||
@@ -1287,17 +1236,13 @@ static llama_vocab::id llama_sample_top_p_top_k(
|
||||
}
|
||||
}
|
||||
|
||||
sample_top_k(logits_id, top_k);
|
||||
|
||||
float maxl = -std::numeric_limits<float>::infinity();
|
||||
for (const auto & kv : logits_id) {
|
||||
maxl = std::max(maxl, kv.first);
|
||||
}
|
||||
sample_top_k(logits_id, top_k > 0 ? Min(top_k, n_logits) : n_logits);
|
||||
|
||||
// compute probs for the top k tokens
|
||||
std::vector<float> probs;
|
||||
probs.reserve(logits_id.size());
|
||||
|
||||
float maxl = logits_id[0].first;
|
||||
double sum = 0.0;
|
||||
for (const auto & kv : logits_id) {
|
||||
const float p = expf(kv.first - maxl);
|
||||
@@ -1320,16 +1265,11 @@ static llama_vocab::id llama_sample_top_p_top_k(
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
cumsum = 1.0/cumsum;
|
||||
for (int i = 0; i < (int) probs.size(); i++) {
|
||||
probs[i] *= cumsum;
|
||||
}
|
||||
}
|
||||
|
||||
//printf("\n");
|
||||
//for (int i = 0; i < (int) 10; i++) {
|
||||
// printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
|
||||
// printf("%d: '%s' %f\n", i, lctx.vocab.id_to_token.at(logits_id[i].second).tok.c_str(), probs[i]);
|
||||
//}
|
||||
//printf("\n\n");
|
||||
//exit(0);
|
||||
@@ -1385,8 +1325,7 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
return false;
|
||||
}
|
||||
if (magic != LLAMA_FILE_MAGIC) {
|
||||
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
|
||||
return false;
|
||||
return report_bad_magic(fname_inp.c_str(), magic, LLAMA_FILE_MAGIC);
|
||||
}
|
||||
|
||||
fout.write((char *) &magic, sizeof(magic));
|
||||
@@ -1452,8 +1391,8 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
fout.write((char *) &len, sizeof(len));
|
||||
|
||||
word.resize(len);
|
||||
finp.read ((char *) word.data(), len);
|
||||
fout.write((char *) word.data(), len);
|
||||
finp.read ((char *) &word[0], len);
|
||||
fout.write((char *) &word[0], len);
|
||||
|
||||
float score;
|
||||
finp.read ((char *) &score, sizeof(score));
|
||||
@@ -1503,6 +1442,13 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
std::string name(length, 0);
|
||||
finp.read (&name[0], length);
|
||||
|
||||
{
|
||||
// ensure tensor data is aligned
|
||||
uint64_t offset = finp.tellg();
|
||||
offset = (offset + 31) & -32;
|
||||
finp.seekg(offset);
|
||||
}
|
||||
|
||||
{
|
||||
static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
|
||||
printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
|
||||
@@ -1558,6 +1504,13 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
}
|
||||
fout.write(&name[0], length);
|
||||
|
||||
{
|
||||
// ensure tensor data is aligned
|
||||
uint64_t offset = fout.tellp();
|
||||
offset = (offset + 31) & -32;
|
||||
fout.seekp(offset);
|
||||
}
|
||||
|
||||
if (quantize) {
|
||||
printf("quantizing .. ");
|
||||
work.resize(nelements); // for quantization
|
||||
@@ -1655,7 +1608,10 @@ struct llama_context * llama_init_from_file(
|
||||
|
||||
if (params.use_mlock) {
|
||||
char *err;
|
||||
if (!ggml_mlock(ctx->model.ctx, &err)) {
|
||||
if (!ggml_mlock(ctx->model.ctx,
|
||||
ctx->model.mm_addr,
|
||||
ctx->model.mm_length,
|
||||
&err)) {
|
||||
fprintf(stderr, "%s\n", err);
|
||||
free(err);
|
||||
llama_free(ctx);
|
||||
@@ -1664,7 +1620,7 @@ struct llama_context * llama_init_from_file(
|
||||
}
|
||||
|
||||
// reserve memory for context buffers
|
||||
{
|
||||
if (!params.vocab_only) {
|
||||
if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
|
||||
fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
|
||||
llama_free(ctx);
|
||||
@@ -1705,6 +1661,10 @@ void llama_free(struct llama_context * ctx) {
|
||||
ggml_free(ctx->model.ctx);
|
||||
}
|
||||
|
||||
if (ctx->model.mm_addr) {
|
||||
munmap_file(ctx->model.mm_addr, ctx->model.mm_length);
|
||||
}
|
||||
|
||||
delete ctx;
|
||||
}
|
||||
|
||||
@@ -1720,6 +1680,33 @@ int llama_model_quantize(
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Returns the KV cache that will contain the context for the
|
||||
// ongoing prediction with the model.
|
||||
const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
|
||||
return ctx->model.kv_self.buf.data();
|
||||
}
|
||||
|
||||
// Returns the size of the KV cache
|
||||
size_t llama_get_kv_cache_size(struct llama_context * ctx) {
|
||||
return ctx->model.kv_self.buf.size();
|
||||
}
|
||||
|
||||
int llama_get_kv_cache_token_count(struct llama_context * ctx) {
|
||||
return ctx->model.kv_self.n;
|
||||
}
|
||||
|
||||
// Sets the KV cache containing the current context for the model
|
||||
void llama_set_kv_cache(
|
||||
struct llama_context * ctx,
|
||||
const uint8_t * kv_cache,
|
||||
size_t n_size,
|
||||
int n_token_count) {
|
||||
// Make sure we have the same kv cache setup
|
||||
LLAMA_ASSERT(ctx->model.kv_self.buf.size() == n_size);
|
||||
memcpy(ctx->model.kv_self.buf.data(), kv_cache, n_size);
|
||||
ctx->model.kv_self.n = n_token_count;
|
||||
}
|
||||
|
||||
int llama_eval(
|
||||
struct llama_context * ctx,
|
||||
const llama_token * tokens,
|
||||
@@ -1730,7 +1717,11 @@ int llama_eval(
|
||||
fprintf(stderr, "%s: failed to eval\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// get a more accurate load time, upon first eval
|
||||
if (!ctx->has_evaluated_once) {
|
||||
ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
|
||||
ctx->has_evaluated_once = true;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -1823,9 +1814,9 @@ llama_token llama_sample_top_p_top_k(
|
||||
void llama_print_timings(struct llama_context * ctx) {
|
||||
const int64_t t_end_us = ggml_time_us();
|
||||
|
||||
const int32_t n_sample = std::max(1, ctx->n_sample);
|
||||
const int32_t n_eval = std::max(1, ctx->n_eval);
|
||||
const int32_t n_p_eval = std::max(1, ctx->n_p_eval);
|
||||
const int32_t n_sample = Max(1, ctx->n_sample);
|
||||
const int32_t n_eval = Max(1, ctx->n_eval);
|
||||
const int32_t n_p_eval = Max(1, ctx->n_p_eval);
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
|
||||
@@ -1837,7 +1828,6 @@ void llama_print_timings(struct llama_context * ctx) {
|
||||
|
||||
void llama_reset_timings(struct llama_context * ctx) {
|
||||
ctx->t_start_us = ggml_time_us();
|
||||
|
||||
ctx->t_sample_us = ctx->n_sample = 0;
|
||||
ctx->t_eval_us = ctx->n_eval = 0;
|
||||
ctx->t_p_eval_us = ctx->n_p_eval = 0;
|
||||
|
||||
19
llama.h
19
llama.h
@@ -20,7 +20,7 @@
|
||||
#endif
|
||||
|
||||
#define LLAMA_FILE_VERSION 1
|
||||
#define LLAMA_FILE_MAGIC 0x67676d66 // 'ggmf' in hex
|
||||
#define LLAMA_FILE_MAGIC 0x67676a74 // 'ggjt' in hex
|
||||
#define LLAMA_FILE_MAGIC_UNVERSIONED 0x67676d6c // pre-versioned files
|
||||
|
||||
#ifdef __cplusplus
|
||||
@@ -83,6 +83,23 @@ extern "C" {
|
||||
const char * fname_out,
|
||||
int itype);
|
||||
|
||||
// Returns the KV cache that will contain the context for the
|
||||
// ongoing prediction with the model.
|
||||
LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);
|
||||
|
||||
// Returns the size of the KV cache
|
||||
LLAMA_API size_t llama_get_kv_cache_size(struct llama_context * ctx);
|
||||
|
||||
// Returns the number of tokens in the KV cache
|
||||
LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);
|
||||
|
||||
// Sets the KV cache containing the current context for the model
|
||||
LLAMA_API void llama_set_kv_cache(
|
||||
struct llama_context * ctx,
|
||||
const uint8_t * kv_cache,
|
||||
size_t n_size,
|
||||
int n_token_count);
|
||||
|
||||
// Run the llama inference to obtain the logits and probabilities for the next token.
|
||||
// tokens + n_tokens is the provided batch of new tokens to process
|
||||
// n_past is the number of tokens to use from previous eval calls
|
||||
|
||||
BIN
media/llama-leader.jpeg
Normal file
BIN
media/llama-leader.jpeg
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 195 KiB |
BIN
media/llama0-banner.png
Normal file
BIN
media/llama0-banner.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 141 KiB |
BIN
media/llama0-logo.png
Normal file
BIN
media/llama0-logo.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 176 KiB |
BIN
media/llama1-banner.png
Normal file
BIN
media/llama1-banner.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 32 KiB |
BIN
media/llama1-logo.png
Normal file
BIN
media/llama1-logo.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 32 KiB |
311
migrate-ggml-2023-03-30-pr613.py
Normal file
311
migrate-ggml-2023-03-30-pr613.py
Normal file
@@ -0,0 +1,311 @@
|
||||
# Migrate ggml file(s) with ggmf magic to ggml file with ggjt magic
|
||||
#
|
||||
# We caused a breaking change to the file format on 2023-03-30 in:
|
||||
# https://github.com/ggerganov/llama.cpp/pull/613
|
||||
#
|
||||
# (1) If you still have the Meta LLaMA .pth files, then close this
|
||||
# file now; you can just run `convert-pth-to-ggml.py` again to
|
||||
# migrate to the new format. The tool is easier to use too. It
|
||||
# isn't necessary anymore to manage split output files because
|
||||
# the new format always combines things into a single file.
|
||||
#
|
||||
# (2) If you deleted the Meta LLaMA .pth files due to save on disk
|
||||
# space, then this tool is intended to help you. Please check
|
||||
# out the instructions below.
|
||||
#
|
||||
# USAGE
|
||||
#
|
||||
# python migrate-ggml-2023-03-30-pr613.py INPUT OUTPUT
|
||||
#
|
||||
# PREREQUISITES
|
||||
#
|
||||
# pip install numpy
|
||||
# cd llama.cpp
|
||||
# make -j4
|
||||
#
|
||||
# EXAMPLE (7B MODEL)
|
||||
#
|
||||
# # you can replace all the 'f16' with 'q4_0' if you're using quantized weights
|
||||
# python migrate-ggml-2023-03-30-pr613.py models/7B/ggml-model-f16.bin models/7B/ggml-model-f16-ggjt.bin
|
||||
#
|
||||
# # check that it works
|
||||
# ./main -m models/7B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?'
|
||||
#
|
||||
# # you can delete the old files
|
||||
# rm -f models/7B/ggml-model-f16.bin
|
||||
# mv models/7B/ggml-model-f16-ggjt.bin models/7B/ggml-model-f16.bin
|
||||
#
|
||||
# EXAMPLE (13B MODEL)
|
||||
#
|
||||
# # you can replace all the 'f16' with 'q4_0' if you're using quantized weights
|
||||
# python migrate-ggml-2023-03-30-pr613.py models/13B/ggml-model-f16.bin models/13B/ggml-model-f16-ggjt.bin
|
||||
#
|
||||
# # check that it works
|
||||
# ./main -m models/13B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?'
|
||||
#
|
||||
# # you can delete the old files
|
||||
# rm -f models/13B/ggml-model-f16.bin*
|
||||
# mv models/13B/ggml-model-f16-ggjt.bin models/13B/ggml-model-f16.bin
|
||||
#
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import struct
|
||||
import numpy as np
|
||||
|
||||
QK = 32
|
||||
|
||||
GGML_TYPE_Q4_0 = 0
|
||||
GGML_TYPE_Q4_1 = 1
|
||||
GGML_TYPE_I8 = 2
|
||||
GGML_TYPE_I16 = 3
|
||||
GGML_TYPE_I32 = 4
|
||||
GGML_TYPE_F16 = 5
|
||||
GGML_TYPE_F32 = 6
|
||||
|
||||
WTYPE_NAMES = {
|
||||
0: "F32",
|
||||
1: "F16",
|
||||
2: "Q4_0",
|
||||
3: "Q4_1",
|
||||
}
|
||||
|
||||
WTYPES = {
|
||||
0: GGML_TYPE_F32,
|
||||
1: GGML_TYPE_F16,
|
||||
2: GGML_TYPE_Q4_0,
|
||||
3: GGML_TYPE_Q4_1,
|
||||
}
|
||||
|
||||
GGML_BLCK_SIZE = {
|
||||
GGML_TYPE_Q4_0: QK,
|
||||
GGML_TYPE_Q4_1: QK,
|
||||
GGML_TYPE_I8: 1,
|
||||
GGML_TYPE_I16: 1,
|
||||
GGML_TYPE_I32: 1,
|
||||
GGML_TYPE_F16: 1,
|
||||
GGML_TYPE_F32: 1,
|
||||
}
|
||||
|
||||
GGML_TYPE_SIZE = {
|
||||
GGML_TYPE_Q4_0: 4 + QK//2,
|
||||
GGML_TYPE_Q4_1: 4*2 + QK//2,
|
||||
GGML_TYPE_I8: 1,
|
||||
GGML_TYPE_I16: 2,
|
||||
GGML_TYPE_I32: 4,
|
||||
GGML_TYPE_F16: 2,
|
||||
GGML_TYPE_F32: 4,
|
||||
}
|
||||
|
||||
HPARAMS = [
|
||||
'magic', # int32
|
||||
'version', # int32
|
||||
'n_vocab', # int32
|
||||
'n_embd', # int32
|
||||
'n_mult', # int32
|
||||
'n_head', # int32
|
||||
'n_layer', # int32
|
||||
'n_rot', # int32
|
||||
'f16', # int32
|
||||
]
|
||||
|
||||
def read_hparams(fin):
|
||||
struct_fmt = "i" * len(HPARAMS)
|
||||
struct_size = struct.calcsize(struct_fmt)
|
||||
buf = fin.read(struct_size)
|
||||
ints = struct.unpack(struct_fmt, buf)
|
||||
hparams = dict(zip(HPARAMS, ints))
|
||||
return hparams
|
||||
|
||||
def write_hparams(fout, hparams):
|
||||
struct_fmt = "i" * len(HPARAMS)
|
||||
struct_size = struct.calcsize(struct_fmt)
|
||||
ints = [hparams[h] for h in HPARAMS]
|
||||
fout.write(struct.pack(struct_fmt, *ints))
|
||||
|
||||
def read_tokens(fin, hparams):
|
||||
tokens = []
|
||||
for i in range(hparams['n_vocab']):
|
||||
len_b = fin.read(4)
|
||||
(length,) = struct.unpack("i", len_b)
|
||||
word = fin.read(length)
|
||||
score_b = fin.read(4)
|
||||
(score,) = struct.unpack("f", score_b)
|
||||
tokens.append((word, score))
|
||||
return tokens
|
||||
|
||||
def write_tokens(fout, tokens):
|
||||
for word, score in tokens:
|
||||
fout.write(struct.pack("i", len(word)))
|
||||
fout.write(word)
|
||||
fout.write(struct.pack("f", score))
|
||||
|
||||
def ggml_nelements(shape):
|
||||
r = 1
|
||||
for i in shape:
|
||||
r *= i
|
||||
return r
|
||||
|
||||
def ggml_nbytes(shape, ftype):
|
||||
x = ggml_nelements(shape)
|
||||
t = WTYPES[ftype]
|
||||
x *= GGML_TYPE_SIZE[t]
|
||||
x //= GGML_BLCK_SIZE[t]
|
||||
return x
|
||||
|
||||
def copy_tensors(fin, fout, part_id, n_parts):
|
||||
while True:
|
||||
|
||||
b = fin.read(4)
|
||||
if not b: break
|
||||
(n_dims,) = struct.unpack("i", b)
|
||||
b = fin.read(4)
|
||||
(length,) = struct.unpack("i", b)
|
||||
b = fin.read(4)
|
||||
(ftype,) = struct.unpack("i", b)
|
||||
|
||||
assert n_dims in (1, 2)
|
||||
|
||||
partshape = list(range(n_dims))
|
||||
for i in range(n_dims):
|
||||
b = fin.read(4)
|
||||
partshape[i] = struct.unpack("i", b)[0]
|
||||
partshape = list(reversed(partshape))
|
||||
|
||||
name = fin.read(length)
|
||||
data = fin.read(ggml_nbytes(partshape, ftype))
|
||||
|
||||
blck_size = GGML_BLCK_SIZE[WTYPES[ftype]]
|
||||
type_size = GGML_TYPE_SIZE[WTYPES[ftype]]
|
||||
|
||||
print(f"Processing tensor {name} with shape: {partshape} and type: {WTYPE_NAMES[ftype]}")
|
||||
|
||||
# determine dimension along which multipart tensor is sharded
|
||||
#
|
||||
# split_dim 0 regex:
|
||||
# - output.*
|
||||
# - layers.*.attention.wq.weight
|
||||
# - layers.*.attention.wk.weight
|
||||
# - layers.*.attention.wv.weight
|
||||
# - layers.*.feed_forward.w1.weight
|
||||
# - layers.*.feed_forward.w3.weight
|
||||
#
|
||||
# split_dim 1 regex:
|
||||
# - tok_embeddings.*
|
||||
# - layers.*.attention.wo.weight
|
||||
# - layers.*.feed_forward.w2.weight
|
||||
#
|
||||
if n_dims > 1:
|
||||
split_dim = 1
|
||||
if b"tok_embeddings" in name:
|
||||
split_dim = 1
|
||||
elif b"layers" in name:
|
||||
if b"attention.wo.weight" in name:
|
||||
split_dim = 1
|
||||
elif b"feed_forward.w2.weight" in name:
|
||||
split_dim = 1
|
||||
else:
|
||||
split_dim = 0
|
||||
elif b"output" in name:
|
||||
split_dim = 0
|
||||
|
||||
# output tensor header
|
||||
fullshape = list(partshape)
|
||||
if n_dims > 1:
|
||||
fullshape[split_dim] *= n_parts
|
||||
fout.write(struct.pack("iii", n_dims, len(name), ftype))
|
||||
for dim in reversed(fullshape):
|
||||
fout.write(struct.pack("i", dim))
|
||||
fout.write(name)
|
||||
|
||||
# ensure tensor data is aligned
|
||||
tensor_data_offset = fout.tell()
|
||||
while tensor_data_offset % QK != 0:
|
||||
fout.write(struct.pack("B", 0))
|
||||
tensor_data_offset += 1
|
||||
|
||||
# output unified mappable tensor data
|
||||
if n_dims == 1 or n_parts == 1:
|
||||
# copy tensor which we thankfully received in one piece
|
||||
if part_id == 0:
|
||||
fout.write(data)
|
||||
elif split_dim == 0:
|
||||
# reassemble multifile tensor containing some of the rows
|
||||
rows_per_chunk = partshape[0]
|
||||
current_row = part_id * rows_per_chunk
|
||||
bytes_per_row = fullshape[1] // blck_size * type_size
|
||||
offset = current_row * bytes_per_row
|
||||
fout.seek(tensor_data_offset + offset)
|
||||
fout.write(data)
|
||||
elif split_dim == 1:
|
||||
# reassemble multifile tensor containing some of the cols
|
||||
cols_per_chunk = partshape[1]
|
||||
current_col = part_id * cols_per_chunk
|
||||
bpr = partshape[1] // blck_size * type_size
|
||||
bytes_per_row = fullshape[1] // blck_size * type_size
|
||||
offset_current_col = current_col // blck_size * type_size
|
||||
for row in range(partshape[0]):
|
||||
offset_row = row * bytes_per_row
|
||||
offset = offset_row + offset_current_col
|
||||
fout.seek(tensor_data_offset + offset)
|
||||
fout.write(data[row * bpr:row * bpr + bpr])
|
||||
|
||||
# advance file position to next tensor
|
||||
fout.seek(tensor_data_offset + ggml_nbytes(fullshape, ftype))
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser(description='Migrate from GGML to new GGJT file format')
|
||||
parser.add_argument('fin_path', help='your old ggml file (leave out the .1 .2 etc.)')
|
||||
parser.add_argument('fout_path', help='your new ggjt file name')
|
||||
return parser.parse_args()
|
||||
|
||||
def main():
|
||||
args = parse_args()
|
||||
assert args.fin_path
|
||||
assert args.fout_path
|
||||
assert args.fin_path != args.fout_path
|
||||
|
||||
with open(args.fin_path, "rb") as fin:
|
||||
hparams = read_hparams(fin)
|
||||
tokens = read_tokens(fin, hparams)
|
||||
|
||||
if hparams['magic'] == 0x67676a74: # ggjt
|
||||
print(f"{args.fin_path}: input ggml has already been converted to 'ggjt' magic\n")
|
||||
sys.exit(1)
|
||||
|
||||
if hparams['magic'] != 0x67676d66: # ggmf
|
||||
print(f"{args.fin_path}: input ggml file doesn't have expected 'ggmf' magic: {hparams['magic']:#x}\n")
|
||||
sys.exit(1)
|
||||
|
||||
hparams['magic'] = 0x67676a74 # ggjt
|
||||
|
||||
# count number of multipart files by convention
|
||||
n_parts = 1
|
||||
while True:
|
||||
if os.path.exists(f"{args.fin_path}.{n_parts}"):
|
||||
n_parts += 1
|
||||
else:
|
||||
break
|
||||
|
||||
# we output a single file for ggml
|
||||
with open(args.fout_path, "wb") as fout:
|
||||
write_hparams(fout, hparams)
|
||||
write_tokens(fout, tokens)
|
||||
offset_of_tensors = fout.tell()
|
||||
# the tensors we load could be split across multiple files
|
||||
for part_id in range(n_parts):
|
||||
fout.seek(offset_of_tensors)
|
||||
print(f"Processing part {part_id+1} of {n_parts}\n")
|
||||
fin_path = args.fin_path
|
||||
if part_id > 0:
|
||||
fin_path += f".{part_id}"
|
||||
with open(fin_path, "rb") as fin:
|
||||
read_tokens(fin, read_hparams(fin))
|
||||
copy_tensors(fin, fout, part_id, n_parts)
|
||||
|
||||
print(f"Done. Output file: {args.fout_path}\n")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Binary file not shown.
131
quantize.py
131
quantize.py
@@ -1,131 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
"""Script to execute the "quantize" script on a given set of models."""
|
||||
|
||||
import subprocess
|
||||
import argparse
|
||||
import glob
|
||||
import sys
|
||||
import os
|
||||
|
||||
|
||||
def main():
|
||||
"""Update the quantize binary name depending on the platform and parse
|
||||
the command line arguments and execute the script.
|
||||
"""
|
||||
|
||||
if "linux" in sys.platform or "darwin" in sys.platform:
|
||||
quantize_script_binary = "quantize"
|
||||
|
||||
elif "win32" in sys.platform or "cygwin" in sys.platform:
|
||||
quantize_script_binary = "quantize.exe"
|
||||
|
||||
else:
|
||||
print("WARNING: Unknown platform. Assuming a UNIX-like OS.\n")
|
||||
quantize_script_binary = "quantize"
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
prog='python3 quantize.py',
|
||||
description='This script quantizes the given models by applying the '
|
||||
f'"{quantize_script_binary}" script on them.'
|
||||
)
|
||||
parser.add_argument(
|
||||
'models', nargs='+', choices=('7B', '13B', '30B', '65B'),
|
||||
help='The models to quantize.'
|
||||
)
|
||||
parser.add_argument(
|
||||
'-r', '--remove-16', action='store_true', dest='remove_f16',
|
||||
help='Remove the f16 model after quantizing it.'
|
||||
)
|
||||
parser.add_argument(
|
||||
'-m', '--models-path', dest='models_path',
|
||||
default=os.path.join(os.getcwd(), "models"),
|
||||
help='Specify the directory where the models are located.'
|
||||
)
|
||||
parser.add_argument(
|
||||
'-q', '--quantize-script-path', dest='quantize_script_path',
|
||||
default=os.path.join(os.getcwd(), quantize_script_binary),
|
||||
help='Specify the path to the "quantize" script.'
|
||||
)
|
||||
|
||||
# TODO: Revise this code
|
||||
# parser.add_argument(
|
||||
# '-t', '--threads', dest='threads', type='int',
|
||||
# default=os.cpu_count(),
|
||||
# help='Specify the number of threads to use to quantize many models at '
|
||||
# 'once. Defaults to os.cpu_count().'
|
||||
# )
|
||||
|
||||
args = parser.parse_args()
|
||||
args.models_path = os.path.abspath(args.models_path)
|
||||
|
||||
if not os.path.isfile(args.quantize_script_path):
|
||||
print(
|
||||
f'The "{quantize_script_binary}" script was not found in the '
|
||||
"current location.\nIf you want to use it from another location, "
|
||||
"set the --quantize-script-path argument from the command line."
|
||||
)
|
||||
sys.exit(1)
|
||||
|
||||
for model in args.models:
|
||||
# The model is separated in various parts
|
||||
# (ggml-model-f16.bin, ggml-model-f16.bin.0, ggml-model-f16.bin.1...)
|
||||
f16_model_path_base = os.path.join(
|
||||
args.models_path, model, "ggml-model-f16.bin"
|
||||
)
|
||||
|
||||
if not os.path.isfile(f16_model_path_base):
|
||||
print(f'The file %s was not found' % f16_model_path_base)
|
||||
sys.exit(1)
|
||||
|
||||
f16_model_parts_paths = map(
|
||||
lambda filename: os.path.join(f16_model_path_base, filename),
|
||||
glob.glob(f"{f16_model_path_base}*")
|
||||
)
|
||||
|
||||
for f16_model_part_path in f16_model_parts_paths:
|
||||
if not os.path.isfile(f16_model_part_path):
|
||||
print(
|
||||
f"The f16 model {os.path.basename(f16_model_part_path)} "
|
||||
f"was not found in {args.models_path}{os.path.sep}{model}"
|
||||
". If you want to use it from another location, set the "
|
||||
"--models-path argument from the command line."
|
||||
)
|
||||
sys.exit(1)
|
||||
|
||||
__run_quantize_script(
|
||||
args.quantize_script_path, f16_model_part_path
|
||||
)
|
||||
|
||||
if args.remove_f16:
|
||||
os.remove(f16_model_part_path)
|
||||
|
||||
|
||||
# This was extracted to a top-level function for parallelization, if
|
||||
# implemented. See https://github.com/ggerganov/llama.cpp/pull/222/commits/f8db3d6cd91bf1a1342db9d29e3092bc12dd783c#r1140496406
|
||||
|
||||
def __run_quantize_script(script_path, f16_model_part_path):
|
||||
"""Run the quantize script specifying the path to it and the path to the
|
||||
f16 model to quantize.
|
||||
"""
|
||||
|
||||
new_quantized_model_path = f16_model_part_path.replace("f16", "q4_0")
|
||||
subprocess.run(
|
||||
[script_path, f16_model_part_path, new_quantized_model_path, "2"],
|
||||
check=True
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
try:
|
||||
main()
|
||||
|
||||
except subprocess.CalledProcessError:
|
||||
print("\nAn error ocurred while trying to quantize the models.")
|
||||
sys.exit(1)
|
||||
|
||||
except KeyboardInterrupt:
|
||||
sys.exit(0)
|
||||
|
||||
else:
|
||||
print("\nSuccesfully quantized all models.")
|
||||
Reference in New Issue
Block a user