Compare commits

..

76 Commits

Author SHA1 Message Date
Georgi Gerganov
986b6ce9f9 ggml, llama : avoid heavy V transpose + improvements (#775)
ggml :

- added ggml_view_3d()
- ggml_view_tensor() now inherits the stride too
- reimplement ggml_cpy() to account for dst stride
- no longer require tensor->data to be memory aligned

llama :

- compute RoPE on 32-bit tensors (should be more accurate)
- store RoPE-ed K in the KV cache
- store transposed V in the KV cache (significant speed-up)
- avoid unnecessary Q copy
2023-04-05 22:07:33 +03:00
Georgi Gerganov
3416298929 Update README.md 2023-04-05 19:54:30 +03:00
Ivan Stepanov
5a8c4f6240 llama : define non-positive top_k; top_k range check (#779)
* Define non-positive top_k; top_k range check

* minor : brackets

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-04-05 19:20:05 +03:00
at8u
ff05d05c96 miku.sh : add executable bit (#780) 2023-04-05 18:59:13 +03:00
Georgi Gerganov
62b3e81aae media : add logos and banners 2023-04-05 18:58:31 +03:00
Georgi Gerganov
8d10406d6e readme : change logo + add bindings + add uis + add wiki 2023-04-05 18:56:20 +03:00
iacore
ed1c214e66 zig : add build.zig (#773)
Co-authored-by: Locria Cyber <74560659+locriacyber@users.noreply.github.com>
2023-04-05 18:06:02 +03:00
Ivan Stepanov
0c44427df1 make : missing host optimizations in CXXFLAGS (#763) 2023-04-05 17:38:37 +03:00
Adithya Balaji
594cc95fab readme : update with CMake and windows example (#748)
* README: Update with CMake and windows example

* README: update with code-review for cmake build
2023-04-05 17:36:12 +03:00
at8u
88ed5761b8 examples : add Miku.sh (#724)
* Add Miku.sh to examples

* Add missing line to prompt in Miku.sh

* Add --keep param to Miku.sh

* Remove '[end_of_conversation]' line from Miku.sh

No longer is necessary.
2023-04-05 17:32:42 +03:00
Andrew Duffy
58c438cf7d Add Accelerate/BLAS when using Swift (#765) 2023-04-05 06:44:24 -04:00
mgroeber9110
53dbba7695 Windows: reactive sigint handler after each Ctrl-C (#736) 2023-04-03 18:00:55 +02:00
SebastianApel
437e77855a 10+% performance improvement of ggml_vec_dot_q4_0 on AVX2 (#654)
* Performance improvement of AVX2 code
* Fixed problem with MSVC compiler
* Reviewer comments: removed double semicolon, deleted empty line 1962
2023-04-03 09:52:28 +02:00
Ivan Stepanov
cd7fa95690 Define non-positive temperature behavior (#720) 2023-04-03 02:19:04 +02:00
bsilvereagle
a0c0516416 Remove torch GPU dependencies from the Docker.full image (#665)
By using `pip install torch --index-url https://download.pytorch.org/whl/cpu`
instead of `pip install torch` we can specify we want to install a CPU-only version
of PyTorch without any GPU dependencies. This reduces the size of the Docker image
from 7.32 GB to 1.62 GB
2023-04-03 00:13:03 +02:00
Thatcher Chamberlin
d8d4e865cd Add a missing step to the gpt4all instructions (#690)
`migrate-ggml-2023-03-30-pr613.py` is needed to get gpt4all running.
2023-04-02 12:48:57 +02:00
Christian Falch
e986f94829 Added api for getting/setting the kv_cache (#685)
The api provides access methods for retrieving the current memory buffer for the kv_cache and its token number.
It also contains a method for setting the kv_cache from a memory buffer.

This makes it possible to load/save history - maybe support --cache-prompt paramater as well?

Co-authored-by: Pavol Rusnak <pavol@rusnak.io>
2023-04-02 12:23:04 +02:00
Marian Cepok
c0bb1d3ce2 ggml : change ne to int64_t (#626) 2023-04-02 13:21:31 +03:00
Leonardo Neumann
6e7801d08d examples : add gpt4all script (#658) 2023-04-02 10:56:20 +03:00
Stephan Walter
81040f10aa llama : do not allocate KV cache for "vocab_only == true" (#682)
Fixes sanitizer CI
2023-04-02 10:18:53 +03:00
Fabian
c4f89d8d73 make : use -march=native -mtune=native on x86 (#609) 2023-04-02 10:17:05 +03:00
Murilo Santana
5b70e7de4c fix default params for examples/main (#697) 2023-04-02 04:41:12 +02:00
Ikko Eltociear Ashimine
a717cba844 py: huggingface -> Hugging Face (#686) 2023-04-01 18:38:18 +02:00
rimoliga
d0a7f742e7 readme: replace termux links with homepage, play store is deprecated (#680) 2023-04-01 16:57:30 +02:00
Slaren
0d054e292e Show error message when -f fails 2023-04-01 16:08:40 +02:00
Stephan Walter
3525899277 Enable -std= for cmake builds, fix warnings (#598) 2023-03-31 19:19:16 +00:00
slaren
1d08882afa Optimize AVX2 ggml_vec_dot_q4_0 (#642) 2023-03-31 15:55:52 +00:00
perserk
02c5b27e91 Add AVX acceleration (#617)
* ggml : add AVX quantize_row_q4_0()

* ggml : add AVX ggml_vec_dot_q4_0()

* ggml : refactor AVX part of ggml_vec_dot_q4_0()

https://github.com/ggerganov/llama.cpp/pull/617#issuecomment-1489985645
2023-03-31 13:55:44 +02:00
Pavol Rusnak
cbef542879 py : cleanup the code
- use f-strings where possible
- drop first param of encode/decode functions since "utf-8" is the default
2023-03-31 10:32:01 +02:00
Pavol Rusnak
9733104be5 drop quantize.py (now that models are using a single file) 2023-03-31 01:07:32 +02:00
Georgi Gerganov
3df890aef4 readme : update supported models 2023-03-30 22:31:54 +03:00
Justine Tunney
ee0c40dd6d Introduce GGML migration tool for new file format
If you deleted your old Meta LLaMA .pth files, then the
migrate-ggml-2023-03-30-pr613.py script will allow you to convert your
old ggml files into the new mmap()'able format.

See #613
2023-03-30 12:28:25 -07:00
Justine Tunney
6f23ba5ee2 Ensure --mlock works properly with mmap() support 2023-03-30 12:28:25 -07:00
Justine Tunney
78ca9838ee Make loading weights 10-100x faster
This is a breaking change that's going to give you three benefits:

1. Your inference commands should load 100x faster
2. You may be able to safely load models 2x larger
3. You can run many concurrent inference processes

This was accomplished by changing the file format so we can mmap()
weights directly into memory without having to read() or copy them
thereby ensuring the kernel can make its file cache pages directly
accessible to our inference processes; and secondly, that the file
cache pages are much less likely to get evicted (which would force
loads to hit disk) because they're no longer competing with memory
pages that were needlessly created by gigabytes of standard i/o.

The new file format supports single-file models like LLaMA 7b, and
it also supports multi-file models like LLaMA 13B. Our Python tool
now merges the foo.1, foo.2, etc. files back into a single file so
that the C++ code which maps it doesn't need to reshape data every
time. That's made llama.cpp so much simpler. Much of its load code
has now been deleted.

Furthermore, this change ensures that tensors are aligned properly
on a 32-byte boundary. That opens the door to seeing if we can get
additional performance gains on some microprocessors, by using ops
that require memory alignment.

Lastly note that both POSIX and the Windows platform are supported

Fixes #91
2023-03-30 12:28:25 -07:00
Slaren
a017390358 Initial windows support (untested) 2023-03-30 12:28:25 -07:00
Slaren
ac184d5147 Always initialize mm_addr and mm_length in llama_model 2023-03-30 12:28:25 -07:00
Slaren
276e5b7811 Unmap the file in llama_free 2023-03-30 12:28:25 -07:00
Slaren
d68c5dc435 Make mmap_file static 2023-03-30 12:28:25 -07:00
Slaren
64bde3ffd4 Fix ggml_init_params in quantize 2023-03-30 12:28:25 -07:00
Slaren
c03ae8dca1 Add mmap support for model files 2023-03-30 12:28:25 -07:00
Stephan Walter
3bcc129ba8 cmake : properly invoke CTest (#629) 2023-03-30 20:56:59 +03:00
Casey Primozic
a4755cf288 Remove unused variable (#607)
* It seems some new warning were added recently that exposed this.  I wrote the code that included this unused variable originally and it is indeed not needed.
2023-03-30 17:53:35 +00:00
david raistrick
1f0414feec make : fix darwin f16c flags check (#615)
...there was no check.  ported upstream from https://github.com/zanussbaum/gpt4all.cpp/pull/2 (I dont see any clean path for upstream patches)
2023-03-30 20:34:45 +03:00
Georgi Gerganov
77efdf5a50 ggml : fix NEON signs (close #620, #622) 2023-03-30 20:27:32 +03:00
slaren
ed3c680bcd Fix GGML_F32Cx8_STORE in AVX without F16C path (#619) 2023-03-30 11:16:30 +02:00
anzz1
9cbc404ba6 ci : re-enable AVX512 testing (Windows-MSVC) (#584)
* CI: Re-enable AVX512 testing (Windows-MSVC)

Now with 100% less base64 encoding

* plain __cpuid is enough here
2023-03-29 23:44:39 +03:00
Georgi Gerganov
b51c717d5c ggml : init time on first ggml_init() call 2023-03-29 22:15:34 +03:00
Georgi Gerganov
0ba76c1e73 llama : fix compile warnings when reading the vocab 2023-03-29 22:13:12 +03:00
Georgi Gerganov
cea1c85948 ggml : add ARM_NEON dequantize_row_q4_1() 2023-03-29 22:10:01 +03:00
Georgi Gerganov
f202ada131 ggml : add ARM_NEON quantize_row_q4_1() 2023-03-29 22:03:07 +03:00
Georgi Gerganov
3b44d30d9b ggml : add ARM_NEON ggml_vec_dot_q4_1() 2023-03-29 22:03:07 +03:00
Pavol Rusnak
61cbfff5c9 rename convert_ggml_to_pth.py -> convert-ggml-to-pth.py (#600)
to match filenames of other converters
2023-03-29 20:09:25 +02:00
Thérence
d9ad104440 Create chat-13B.bat (#592)
* Create chat-13B.bat

Same script than chat-13B.sh, but for windows users.
Tested and working on windows 10/11 v 22H2

* Apply suggestions from code review

---------

Co-authored-by: anzz1 <anzz1@live.com>
2023-03-29 20:21:09 +03:00
Georgi Gerganov
b467702b87 readme : fix typos 2023-03-29 19:38:31 +03:00
Georgi Gerganov
516d88e75c readme : add GPT4All instructions (close #588) 2023-03-29 19:37:20 +03:00
Georgi Gerganov
53635c081c py : add GPT4All conversion script
For now: copy-paste
Too much time for me to deduplicate the python code
2023-03-29 19:29:52 +03:00
Maël Kerbiriou
41318d708e llama : use the same threshold for OpenBLAS and ggml thread limiting (#577) 2023-03-29 19:10:07 +03:00
Tobias Lütke
a6956b25a1 add example of re-act pattern (#583)
* add example of re-act pattern

* spelling...

* fixed whitespace in reverse prompt issue
2023-03-29 10:10:24 -05:00
anzz1
83df5639eb Fix GCC warning about binary literal (#595)
0b10101010 -> 0xAA /* 0b10101010 */
2023-03-29 13:20:07 +00:00
anzz1
a5c42c4b13 Fix typo in llama.h (#593) 2023-03-29 13:19:29 +00:00
anzz1
5a5f8b1501 Enable Fused-Multiply-Add (FMA) and F16C/CVT16 vector extensions on MSVC (#375)
* Enable Fused-Multiply-Add (FMA) instructions on MSVC

__FMA__ macro does not exist in MSVC

* Enable F16C/CVT16 vector extensions on MSVC

__F16C__ macro does not exist in MSVC, but is implied with AVX2/AVX512

* MSVC cvt intrinsics

* Add __SSE3__ macro for MSVC too because why not

even though it's not currently used for anything when AVX is defined
2023-03-28 22:44:29 +03:00
anzz1
f1217055ea CI: fix subdirectory path globbing (#546)
- Changes in subdirectories will now be detecter properly
- (Windows-MSVC) AVX512 tests temporarily disabled
2023-03-28 22:43:25 +03:00
anzz1
7f4c5c6651 llama : fix linkage with mingw (#551)
* Revert 7e53955 (#542)

Still needs to be fixed properly

* Fix linking on mingw32
2023-03-28 21:23:09 +03:00
slaren
2a98bc18ea ggml : add AVX2 implementation of quantize_row_q4_1 (#515)
* Add AVX2 implementation of quantize_row_q4_1

* Actually use AVX2

* Make quantize_row_q4_1 static

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-03-28 21:06:03 +03:00
thement
d0aaff571c py : add temporary script to convert old ggml files to newer version (#539)
Co-authored-by: Jakub Horak <jakub.horak@ibawizard.net>
2023-03-28 20:55:42 +03:00
Tai Duc Nguyen
d0330fd783 py : add capabiliy to convert from ggml back to torch or hf format for further consumption/training/finetuning (#403) 2023-03-28 20:51:29 +03:00
Stephan Walter
99c5b27654 ggml : refactor quantized processing functions (#509)
* Refactor quantized processing functions

* ggml : minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-03-28 20:13:01 +03:00
DooWoong Lee (David)
692ce3164e py : removed unused model variable and verified that the code functions correctly with vocab_only setting. Also confirmed that the code works as expected after running with reduced memory usage due to deletion of no-longer-needed variable. (#547) 2023-03-28 20:02:34 +03:00
Georgi Gerganov
96f9c0506f ci : make ctest verbose, hopefully we see what is wrong with the sanitizer 2023-03-28 20:01:09 +03:00
Georgi Gerganov
d502bc7c9d tests : free llama context at the end of the test 2023-03-28 19:51:55 +03:00
Stephan Walter
436e561931 all : be more strict about converting float to double (#458)
* Be more strict about converting float to double

* Test equivalence of round, SILU implementations

Test module is commented out in CMakeLists.txt because the tests may
take a long time, depending on how much the compiler optimizes.

* Fix softmax in perplexity.cpp

* all : prefer float over double where appropriate

* perplexity : add <cmath>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-03-28 19:48:20 +03:00
Jed Fox
20e1e84884 deploy : add a Package.swift for SwiftPM support (#393)
* Add a Package.swift for SwiftPM support

* Swap from exclusions to allowlist
2023-03-28 19:39:01 +03:00
Stephan Walter
c1f885067c ggml : introduce structs for the q4 data blocks (#356)
* Introduce structs for the q4 data blocks

* ggml : rename quant struct variables + fix ARM_NEON

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-03-28 18:56:03 +03:00
Georgi Gerganov
e0670260fb gitignore : add "embedding" 2023-03-28 18:34:35 +03:00
dotpy314
28ba975aea Check the existence of f16_model_path_base in quantize.py (#574)
Co-authored-by: Jincheng Miao <jincheng.miao@gmail.com>
2023-03-28 18:06:28 +03:00
slaren
a6bdc47cba Fix usage of F16C intrinsics in AVX code (#563)
* Fix usage of F16C intrinsics in AVX code when F16C is not defined
2023-03-28 17:26:55 +03:00
43 changed files with 3046 additions and 1729 deletions

View File

@@ -6,7 +6,8 @@ RUN apt-get update && \
apt-get install -y build-essential python3 python3-pip
RUN pip install --upgrade pip setuptools wheel \
&& pip install numpy requests sentencepiece torch tqdm
&& pip install numpy requests sentencepiece tqdm \
&& pip install torch --index-url https://download.pytorch.org/whl/cpu
WORKDIR /app

View File

@@ -8,10 +8,10 @@ on:
required: true
type: boolean
push:
paths: ['.github/workflows/**', 'CMakeLists.txt', 'Makefile', '**.h', '*.c', '**.cpp']
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp']
pull_request:
types: [opened, synchronize, edited, reopened, review_requested, ready_for_review]
paths: ['CMakeLists.txt', 'Makefile', '**.h', '*.c', '**.cpp']
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp']
env:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
@@ -62,7 +62,7 @@ jobs:
id: cmake_test
run: |
cd build
ctest --output-on-failure
ctest --verbose
ubuntu-latest-cmake-sanitizer:
runs-on: ubuntu-latest
@@ -98,7 +98,7 @@ jobs:
id: cmake_test
run: |
cd build
ctest --output-on-failure
ctest --verbose
macOS-latest-make:
runs-on: macos-latest
@@ -143,7 +143,7 @@ jobs:
id: cmake_test
run: |
cd build
ctest --output-on-failure
ctest --verbose
windows-latest-cmake:
runs-on: windows-latest
@@ -177,15 +177,19 @@ jobs:
continue-on-error: true
run: |
cd build
Set-Content -Path .\avx512f.exe -Value ([Convert]::FromBase64String('TVqQAAMAAAAEAAAA//8AALgAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAyAAAAA4fug4AtAnNIbgBTM0hVGhpcyBwcm9ncmFtIGNhbm5vdCBiZSBydW4gaW4gRE9TIG1vZGUuDQ0KJAAAAAAAAAClmfXY4fibi+H4m4vh+JuL4fiai+P4m4si98aL4vibi7Xbq4vg+JuLUmljaOH4m4sAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABQRQAATAEBAGo6H2QAAAAAAAAAAOAADwELAQYAAAIAAAAAAAAAAAAADBAAAAAQAAAAIAAAAABAAAAQAAAAAgAABAAAAAAAAAAEAAAAAAAAAAAgAAAAAgAAAAAAAAMAAAAAABAAABAAAAAAEAAAEAAAAAAAABAAAAAAAAAAAAAAAFQQAAAoAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEAAADAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAC50ZXh0AAAAsgAAAAAQAAAAAgAAAAIAAAAAAAAAAAAAAAAAACAAAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACUEAAAiBAAAAAAAABVi+xRUVNTuAcAAAAPosHrEGaD4wGJXfxbg0X8MI1F+GoAUI1F/GoBUGr1/xUAEEAAUP8VBBBAAItF/FuDwND32BvAQMnDzMx8EAAAAAAAAAAAAACkEAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAlBAAAIgQAAAAAAAApANXcml0ZUZpbGUAuQFHZXRTdGRIYW5kbGUAAEtFUk5FTDMyLmRsbAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA==')) -AsByteStream
.\avx512f.exe && echo " AVX512F: YES" && ( echo HAS_AVX512F=1 >> $env:GITHUB_ENV ) || echo " AVX512F: NO"
$vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath)
$msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim()))
$cl = $(join-path $msvc 'bin\Hostx64\x64\cl.exe')
echo 'int main(void){unsigned int a[4];__cpuid(a,7);return !(a[1]&65536);}' >> avx512f.c
& $cl /O2 /GS- /kernel avx512f.c /link /nodefaultlib /entry:main
.\avx512f.exe && echo "AVX512F: YES" && ( echo HAS_AVX512F=1 >> $env:GITHUB_ENV ) || echo "AVX512F: NO"
- name: Test
id: cmake_test
if: ${{ matrix.build != 'avx512' || env.HAS_AVX512F == '1' }} # Test AVX-512 only when possible
run: |
cd build
ctest -C Release --output-on-failure
ctest -C Release --verbose
- name: Get commit hash
id: commit

10
.gitignore vendored
View File

@@ -5,6 +5,7 @@
.vscode/
.DS_Store
.build/
build/
build-em/
build-debug/
@@ -20,9 +21,18 @@ models/*
/quantize
/result
/perplexity
/embedding
/Pipfile
arm_neon.h
compile_commands.json
.envrc
.direnv/
.venv
__pycache__
.swiftpm
zig-out/
zig-cache/

View File

@@ -68,7 +68,9 @@ option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
# Compile flags
#
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED true)
set(CMAKE_C_STANDARD 11)
set(CMAKE_C_STANDARD_REQUIRED true)
set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED)
@@ -124,8 +126,9 @@ if (LLAMA_ALL_WARNINGS)
-Wall
-Wextra
-Wpedantic
-Wshadow
-Wcast-qual
-Wdouble-promotion
-Wshadow
-Wstrict-prototypes
-Wpointer-arith
-Wno-unused-function
@@ -135,6 +138,7 @@ if (LLAMA_ALL_WARNINGS)
-Wextra
-Wpedantic
-Wcast-qual
-Wno-unused-function
)
else()
# todo : msvc
@@ -251,7 +255,7 @@ endif()
#
if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
enable_testing()
include(CTest)
add_subdirectory(tests)
endif ()

View File

@@ -35,6 +35,10 @@ CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC
CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
LDFLAGS =
# warnings
CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -Wno-unused-function
CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
# OS specific
# TODO: support Windows
ifeq ($(UNAME_S),Linux)
@@ -66,92 +70,9 @@ endif
# TODO: probably these flags need to be tweaked on some architectures
# feel free to update the Makefile for your architecture and send a pull request or issue
ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
ifeq ($(UNAME_S),Darwin)
CFLAGS += -mf16c
AVX1_M := $(shell sysctl machdep.cpu.features)
ifneq (,$(findstring FMA,$(AVX1_M)))
CFLAGS += -mfma
endif
ifneq (,$(findstring AVX1.0,$(AVX1_M)))
CFLAGS += -mavx
endif
AVX2_M := $(shell sysctl machdep.cpu.leaf7_features)
ifneq (,$(findstring AVX2,$(AVX2_M)))
CFLAGS += -mavx2
endif
else ifeq ($(UNAME_S),Linux)
AVX1_M := $(shell grep "avx " /proc/cpuinfo)
ifneq (,$(findstring avx,$(AVX1_M)))
CFLAGS += -mavx
endif
AVX2_M := $(shell grep "avx2 " /proc/cpuinfo)
ifneq (,$(findstring avx2,$(AVX2_M)))
CFLAGS += -mavx2
endif
FMA_M := $(shell grep "fma " /proc/cpuinfo)
ifneq (,$(findstring fma,$(FMA_M)))
CFLAGS += -mfma
endif
F16C_M := $(shell grep "f16c " /proc/cpuinfo)
ifneq (,$(findstring f16c,$(F16C_M)))
CFLAGS += -mf16c
endif
SSE3_M := $(shell grep "sse3 " /proc/cpuinfo)
ifneq (,$(findstring sse3,$(SSE3_M)))
CFLAGS += -msse3
endif
AVX512F_M := $(shell grep "avx512f " /proc/cpuinfo)
ifneq (,$(findstring avx512f,$(AVX512F_M)))
CFLAGS += -mavx512f
endif
AVX512BW_M := $(shell grep "avx512bw " /proc/cpuinfo)
ifneq (,$(findstring avx512bw,$(AVX512BW_M)))
CFLAGS += -mavx512bw
endif
AVX512DQ_M := $(shell grep "avx512dq " /proc/cpuinfo)
ifneq (,$(findstring avx512dq,$(AVX512DQ_M)))
CFLAGS += -mavx512dq
endif
AVX512VL_M := $(shell grep "avx512vl " /proc/cpuinfo)
ifneq (,$(findstring avx512vl,$(AVX512VL_M)))
CFLAGS += -mavx512vl
endif
AVX512CD_M := $(shell grep "avx512cd " /proc/cpuinfo)
ifneq (,$(findstring avx512cd,$(AVX512CD_M)))
CFLAGS += -mavx512cd
endif
AVX512ER_M := $(shell grep "avx512er " /proc/cpuinfo)
ifneq (,$(findstring avx512er,$(AVX512ER_M)))
CFLAGS += -mavx512er
endif
AVX512IFMA_M := $(shell grep "avx512ifma " /proc/cpuinfo)
ifneq (,$(findstring avx512ifma,$(AVX512IFMA_M)))
CFLAGS += -mavx512ifma
endif
AVX512PF_M := $(shell grep "avx512pf " /proc/cpuinfo)
ifneq (,$(findstring avx512pf,$(AVX512PF_M)))
CFLAGS += -mavx512pf
endif
else ifeq ($(UNAME_S),Haiku)
AVX1_M := $(shell sysinfo -cpu | grep -w "AVX")
ifneq (,$(findstring AVX,$(AVX1_M)))
CFLAGS += -mavx
endif
AVX2_M := $(shell sysinfo -cpu | grep -w "AVX2")
ifneq (,$(findstring AVX2,$(AVX2_M)))
CFLAGS += -mavx2
endif
FMA_M := $(shell sysinfo -cpu | grep -w "FMA")
ifneq (,$(findstring FMA,$(FMA_M)))
CFLAGS += -mfma
endif
F16C_M := $(shell sysinfo -cpu | grep -w "F16C")
ifneq (,$(findstring F16C,$(F16C_M)))
CFLAGS += -mf16c
endif
else
CFLAGS += -mfma -mf16c -mavx -mavx2
endif
# Use all CPU extensions that are available:
CFLAGS += -march=native -mtune=native
CXXFLAGS += -march=native -mtune=native
endif
ifneq ($(filter ppc64%,$(UNAME_M)),)
POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)

23
Package.swift Normal file
View File

@@ -0,0 +1,23 @@
// swift-tools-version:5.3
import PackageDescription
let package = Package(
name: "llama",
products: [
.library(name: "llama", targets: ["llama"]),
],
targets: [
.target(
name: "llama",
path: ".",
sources: ["ggml.c", "llama.cpp"],
publicHeadersPath: "spm-headers",
cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"]), .define("GGML_USE_ACCELERATE")],
linkerSettings: [
.linkedFramework("Accelerate")
]
),
],
cxxLanguageStandard: .cxx11
)

View File

@@ -1,6 +1,6 @@
# llama.cpp
![llama](https://user-images.githubusercontent.com/1991296/227761327-6d83e30e-2200-41a6-bfbb-f575231c54f4.png)
![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)
[![Actions Status](https://github.com/ggerganov/llama.cpp/workflows/CI/badge.svg)](https://github.com/ggerganov/llama.cpp/actions)
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
@@ -9,10 +9,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
**Hot topics:**
- [Roadmap (short-term)](https://github.com/ggerganov/llama.cpp/discussions/457)
- New C-style API is now available: https://github.com/ggerganov/llama.cpp/pull/370
- Cache input prompts for faster initialization: https://github.com/ggerganov/llama.cpp/issues/64
- Create a `llama.cpp` logo: https://github.com/ggerganov/llama.cpp/issues/105
- [Roadmap Apr 2023](https://github.com/ggerganov/llama.cpp/discussions/784)
## Description
@@ -30,13 +27,32 @@ Please do not make conclusions about the models based on the results from this i
For all I know, it can be completely wrong. This project is for educational purposes.
New features will probably be added mostly through community contributions.
Supported platforms:
**Supported platforms:**
- [X] Mac OS
- [X] Linux
- [X] Windows (via CMake)
- [X] Docker
**Supported models:**
- [X] LLaMA 🦙
- [X] [Alpaca](https://github.com/ggerganov/llama.cpp#instruction-mode-with-alpaca)
- [X] [GPT4All](https://github.com/ggerganov/llama.cpp#using-gpt4all)
- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca)
- [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
- [X] [Vicuna](https://github.com/ggerganov/llama.cpp/discussions/643#discussioncomment-5533894)
**Bindings:**
- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
**UI:**
- [nat/openplayground](https://github.com/nat/openplayground)
- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui)
---
Here is a typical run using LLaMA-7B:
@@ -139,6 +155,13 @@ git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
make
#For Windows and CMake, use the following command instead:
cd <path_to_llama_folder>
mkdir build
cd build
cmake ..
cmake --build . --config Release
# obtain the original LLaMA model weights and place them in ./models
ls ./models
65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model
@@ -149,8 +172,8 @@ python3 -m pip install torch numpy sentencepiece
# convert the 7B model to ggml FP16 format
python3 convert-pth-to-ggml.py models/7B/ 1
# quantize the model to 4-bits
python3 quantize.py 7B
# quantize the model to 4-bits (using method 2 = q4_0)
./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin 2
# run the inference
./main -m ./models/7B/ggml-model-q4_0.bin -n 128
@@ -222,6 +245,21 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
>
```
### Using [GPT4All](https://github.com/nomic-ai/gpt4all)
- Obtain the `gpt4all-lora-quantized.bin` model
- It is distributed in the old `ggml` format which is now obsoleted
- You have to convert it to the new format using [./convert-gpt4all-to-ggml.py](./convert-gpt4all-to-ggml.py). You may also need to
convert the model from the old format to the new format with [./migrate-ggml-2023-03-30-pr613.py](./migrate-ggml-2023-03-30-pr613.py):
```bash
python3 convert-gpt4all-to-ggml.py models/gpt4all-7B/gpt4all-lora-quantized.bin ./models/tokenizer.model
python3 migrate-ggml-2023-03-30-pr613.py models/gpt4all-7B/gpt4all-lora-quantized.bin models/gpt4all-7B/gpt4all-lora-quantized-new.bin
```
- You can now use the newly generated `gpt4all-lora-quantized-new.bin` model in exactly the same way as all other models
- The original model is saved in the same folder with a suffix `.orig`
### Obtaining and verifying the Facebook LLaMA original model and Stanford Alpaca model data
- **Under no circumstances share IPFS, magnet links, or any other links to model downloads anywhere in this respository, including in issues, discussions or pull requests. They will be immediately deleted.**
@@ -282,7 +320,7 @@ And after 4.45 hours, you will have the final perplexity.
### Android
You can easily run `llama.cpp` on Android device with [termux](https://play.google.com/store/apps/details?id=com.termux).
You can easily run `llama.cpp` on Android device with [termux](https://termux.dev/).
First, obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake:
```
$ mkdir build-android
@@ -291,7 +329,7 @@ $ export NDK=<your_ndk_directory>
$ cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
$ make
```
Install [termux](https://play.google.com/store/apps/details?id=com.termux) on your device and run `termux-setup-storage` to get access to your SD card.
Install [termux](https://termux.dev/) on your device and run `termux-setup-storage` to get access to your SD card.
Finally, copy the `llama` binary and the model files to your device storage. Here is a demo of an interactive session running on Pixel 5 phone:
https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4
@@ -346,3 +384,6 @@ docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models
- Clean-up any trailing whitespaces, use 4 spaces indentation, brackets on same line, `void * ptr`, `int & a`
- See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
### Docs
- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)

62
build.zig Normal file
View File

@@ -0,0 +1,62 @@
const std = @import("std");
pub fn build(b: *std.Build) void {
const target = b.standardTargetOptions(.{});
const optimize = b.standardOptimizeOption(.{});
const lib = b.addStaticLibrary(.{
.name = "llama",
.target = target,
.optimize = optimize,
});
lib.linkLibCpp();
lib.addIncludePath(".");
lib.addIncludePath("examples");
lib.addCSourceFiles(&.{
"ggml.c",
}, &.{"-std=c11"});
lib.addCSourceFiles(&.{
"llama.cpp",
"examples/common.cpp",
}, &.{"-std=c++11"});
lib.install();
const build_args = .{ .b = b, .lib = lib, .target = target, .optimize = optimize };
const exe = build_example("main", build_args);
_ = build_example("quantize", build_args);
_ = build_example("perplexity", build_args);
_ = build_example("embedding", build_args);
// create "zig build run" command for ./main
const run_cmd = exe.run();
run_cmd.step.dependOn(b.getInstallStep());
if (b.args) |args| {
run_cmd.addArgs(args);
}
const run_step = b.step("run", "Run the app");
run_step.dependOn(&run_cmd.step);
}
fn build_example(comptime name: []const u8, args: anytype) *std.build.LibExeObjStep {
const b = args.b;
const lib = args.lib;
const target = args.target;
const optimize = args.optimize;
const exe = b.addExecutable(.{
.name = name,
.target = target,
.optimize = optimize,
});
exe.addIncludePath(".");
exe.addIncludePath("examples");
exe.addCSourceFiles(&.{
std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{name, name}),
}, &.{"-std=c++11"});
exe.linkLibrary(lib);
exe.install();
return exe;
}

299
convert-ggml-to-pth.py Normal file
View File

@@ -0,0 +1,299 @@
# Author: github.com/ductai199x
import argparse
import os
import struct
import numpy as np
import torch
from numba import njit
from tqdm.auto import tqdm
def read_header(fin):
values = struct.unpack("i" * 9, fin.read(4 * 9))
_, _, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype = values
return {
"vocab_size": vocab_size,
"dim": dim,
"multiple_of": multiple_of,
"n_heads": n_heads,
"n_layers": n_layers,
}, ftype
def read_tokens(fin, vocab_size):
tokens = []
for _ in range(vocab_size):
text_len = struct.unpack("i", fin.read(4))[0]
text_bytes = fin.read(text_len)
try:
text = text_bytes.decode()
except UnicodeDecodeError:
text = text_bytes.decode(errors="replace")
score = struct.unpack("f", fin.read(4))[0]
tokens.append((text, score))
return tokens
@njit
def dequantize_weights_numba(fin_data, n_rows, n_cols):
qk = 32
nb = n_cols // qk
bs = 4 + (qk // 2)
weights = np.zeros((n_rows, n_cols), dtype=np.float32)
data_pos = 0
for row in range(n_rows):
for block in range(nb):
d = np.frombuffer(fin_data[data_pos : data_pos + 4], dtype=np.float32)[0]
data_pos += 4
packed_values = fin_data[data_pos : data_pos + (qk // 2)]
data_pos += qk // 2
for i in range(qk // 2):
packed_value = packed_values[i]
v0 = np.float32((packed_value & 0b00001111) - 8) * d
v1 = np.float32((packed_value >> 4) - 8) * d
weights[row, block * qk + 2 * i] = v0
weights[row, block * qk + 2 * i + 1] = v1
return weights
def dequantize_weights(fin, n_rows, n_cols):
qk = 32
nb = n_cols // qk
data_size = n_rows * n_cols // 2 + n_rows * nb * 4
fin_data = fin.read(data_size)
return dequantize_weights_numba(fin_data, n_rows, n_cols)
def read_variables(fin):
model = {}
pbar = tqdm(total=os.path.getsize(fin.name), unit="B", unit_scale=True, desc="Reading variables")
while True:
start_pos = fin.tell()
try:
n_dims, name_length, ftype_cur = struct.unpack("iii", fin.read(4 * 3))
except struct.error:
break
shape = tuple(struct.unpack("i" * n_dims, fin.read(4 * n_dims)))
shape = shape[::-1]
name = fin.read(name_length).decode()
# ensure tensor data is aligned
tensor_data_offset = fin.tell()
tensor_data_offset = (tensor_data_offset + 31) & -32
fin.seek(tensor_data_offset)
if ftype_cur == 2:
# 4-bit quantized weights
dtype = np.uint8
data = dequantize_weights(fin, shape[0], shape[1])
data = data.reshape(shape)
elif ftype_cur == 0:
dtype = np.float32
data_size = np.prod(shape)
data = np.fromfile(fin, dtype=dtype, count=data_size).reshape(shape)
elif ftype_cur == 1:
dtype = np.float16
data_size = np.prod(shape)
data = np.fromfile(fin, dtype=dtype, count=data_size).reshape(shape)
model[name] = torch.tensor(data, dtype=torch.float32 if dtype == np.float32 else torch.float16)
pbar.update(fin.tell() - start_pos)
return model
def convert_to_hf_format(model, hparams):
# This works for llama 7B, need to test with other models
n_layers = hparams["n_layers"]
n_heads = hparams["n_heads"]
dim = hparams["dim"]
dims_per_head = dim // n_heads
base = 10000.0
inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))
# permute for sliced rotary
def permute(w):
return w.view(n_heads, dim // n_heads // 2, 2, dim).transpose(1, 2).reshape(dim, dim)
state_dict = {}
for layer_i in range(n_layers):
state_dict.update(
{
f"model.layers.{layer_i}.self_attn.q_proj.weight": permute(
model[f"layers.{layer_i}.attention.wq.weight"]
),
f"model.layers.{layer_i}.self_attn.k_proj.weight": permute(
model[f"layers.{layer_i}.attention.wk.weight"]
),
f"model.layers.{layer_i}.self_attn.v_proj.weight": model[
f"layers.{layer_i}.attention.wv.weight"
],
f"model.layers.{layer_i}.self_attn.o_proj.weight": model[
f"layers.{layer_i}.attention.wo.weight"
],
f"model.layers.{layer_i}.mlp.gate_proj.weight": model[
f"layers.{layer_i}.feed_forward.w1.weight"
],
f"model.layers.{layer_i}.mlp.down_proj.weight": model[
f"layers.{layer_i}.feed_forward.w2.weight"
],
f"model.layers.{layer_i}.mlp.up_proj.weight": model[
f"layers.{layer_i}.feed_forward.w3.weight"
],
f"model.layers.{layer_i}.input_layernorm.weight": model[
f"layers.{layer_i}.attention_norm.weight"
],
f"model.layers.{layer_i}.post_attention_layernorm.weight": model[
f"layers.{layer_i}.ffn_norm.weight"
],
}
)
state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq
state_dict.update(
{
"model.embed_tokens.weight": model["tok_embeddings.weight"],
"model.norm.weight": model["norm.weight"],
"lm_head.weight": model["output.weight"],
}
)
return state_dict
def chat(model, hparams, llama_dir):
from transformers import (GenerationConfig, LlamaForCausalLM,
LlamaTokenizer, StoppingCriteria,
StoppingCriteriaList)
from transformers.models.llama.configuration_llama import LlamaConfig
class StoppingCriteriaSub(StoppingCriteria):
def __init__(self):
super().__init__()
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, stops=[]):
print(tokenizer.decode(input_ids[0]), end="", flush=True)
if input_ids[0][-1] == 13:
return True
return False
config = LlamaConfig(
vocab_size=hparams["vocab_size"],
dim=hparams["dim"],
num_hidden_layers=hparams["n_layers"],
num_attention_heads=hparams["n_heads"],
)
llama = LlamaForCausalLM(config=config)
llama.load_state_dict(state_dict=model, strict=True)
tokenizer = LlamaTokenizer.from_pretrained(llama_dir)
device = torch.device("cpu")
llama = llama.to(device)
ctx = """You are AI.
This is a dialog, where User interacts with AI. AI is helpful, kind, obedient, honest, respectful, direct, concise, should try to protect User's privacy, and knows its own limits. Also, AI must answer User and AI cannot stop the conversation by itself.
User: Hello, AI.
AI: Hello! How can I assist you today?
"""
print(ctx.rstrip("\n"))
while True:
print("-" * 60)
prompt = input("User: ")
if ctx != "":
ctx = f"{ctx}User: {prompt}\n"
else:
ctx = f"{prompt}\nAI:"
ctx = (ctx[-1920:]) if len(ctx) >= 2048 else ctx
print("-" * 60)
if len(ctx.strip()) > 0:
input_ids = tokenizer(ctx, return_tensors="pt")["input_ids"].to(device)
generation_config = GenerationConfig(
temperature=0.8,
top_p=0.95,
top_k=50,
repetition_penalty=1.1764,
)
with torch.no_grad():
generation_output = llama.generate(
input_ids=input_ids,
generation_config=generation_config,
return_dict_in_generate=True,
output_scores=True,
max_length=2048,
do_sample=True,
stopping_criteria=StoppingCriteriaList([StoppingCriteriaSub()]),
)
s = generation_output.sequences[0]
decoded = tokenizer.decode(s)
ctx = f"{decoded}\n"
def main():
parser = argparse.ArgumentParser()
parser.add_argument(
"--input_dir", "-i", type=str, required=True, help="The input directory containing the ggml files."
)
parser.add_argument(
"--prefix",
"-p",
type=str,
required=True,
help="The prefix of the ggml files (ggml-model-f16 or ggml-model-q4_0).",
)
parser.add_argument(
"--hf",
action="store_true",
help="Whether to save the model in the Hugging Face format. (default: False)",
)
parser.add_argument(
"--chat", "-c", action="store_true", help="Whether to open a chat with the model. (default: False)"
)
args = parser.parse_args()
llama_dir = os.path.abspath(f"{args.input_dir}/../")
ggml_files = sorted(
[f"{args.input_dir}/{f}" for f in os.listdir(args.input_dir) if f.startswith(args.prefix)]
)
fin = open(ggml_files[0], "rb")
hparams, ftype = read_header(fin)
tokens = read_tokens(fin, hparams["vocab_size"])
model = read_variables(fin)
for f in tqdm(ggml_files[1:]):
fin = open(f, "rb")
read_header(fin)
read_tokens(fin, hparams["vocab_size"])
model.update(read_variables(fin))
if args.hf:
model = convert_to_hf_format(model, hparams)
pth_ckpt = {
"state_dict": model,
"hparams": hparams,
"tokens": tokens,
}
torch.save(pth_ckpt, f"{args.input_dir}/{args.prefix}-to-torch.pth")
if args.chat:
if not args.hf:
model = convert_to_hf_format(model, hparams)
chat(model, hparams, llama_dir)
if __name__ == "__main__":
main()

107
convert-gpt4all-to-ggml.py Normal file
View File

@@ -0,0 +1,107 @@
#!/usr/bin/env python3
#
# TODO: deduplicate GPT4All with convert-unversioned-ggml-to-ggml.py
#
# Original by https://github.com/eiz
# https://github.com/ggerganov/llama.cpp/issues/324#issuecomment-1476227818
import argparse
import glob
import os
import struct
import sys
from sentencepiece import SentencePieceProcessor
HPARAMS = keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
def parse_args():
parser = argparse.ArgumentParser(description='Upgrade a GPT4All model to the current format')
parser.add_argument('gpt4all_model', help='path to gpt4all-lora-quantized.bin')
parser.add_argument('tokenizer_model', help='path to LLaMA tokenizer.model file')
return parser.parse_args()
def read_header(f_in):
struct_fmt = "i" * (3 + len(HPARAMS))
struct_size = struct.calcsize(struct_fmt)
buf = f_in.read(struct_size)
return struct.unpack(struct_fmt, buf)
def write_header(f_out, header):
(magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype) = header
if magic != 0x67676d6c:
raise Exception('Invalid file magic. Must be an old style ggml file.')
values = [
0x67676d66, # magic: ggml in hex
1, # file version
vocab_size,
dim,
multiple_of,
n_heads,
n_layers,
rot,
ftype
]
f_out.write(struct.pack("i" * len(values), *values))
def write_tokens(fout, tokenizer):
for i in range(tokenizer.vocab_size()):
if tokenizer.is_unknown(i):
text = " \u2047 ".encode()
elif tokenizer.is_control(i):
text = b""
elif tokenizer.is_byte(i):
piece = tokenizer.id_to_piece(i)
if len(piece) != 6:
print(f"Invalid token: {piece}")
sys.exit(1)
byte_value = int(piece[3:-1], 16)
text = struct.pack("B", byte_value)
else:
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
fout.write(struct.pack("i", len(text)))
fout.write(text)
fout.write(struct.pack("f", tokenizer.get_score(i)))
# TODO: GPT4All - add extra <pad> token
text = "<pad>".encode()
fout.write(struct.pack("i", len(text)))
fout.write(text)
fout.write(struct.pack("f", 0.0))
def read_tokens(f_in, tokenizer):
for i in range(tokenizer.vocab_size()):
len_b = f_in.read(4)
(length,) = struct.unpack("i", len_b)
f_in.read(length)
def copy_all_data(f_out, f_in):
while True:
buf = f_in.read(1024 * 1024)
if not buf:
break
f_out.write(buf)
def convert_one_file(path_in, tokenizer):
path_tmp = f"{path_in}.tmp"
path_orig= f"{path_in}.orig"
print(f"converting {path_in}")
with open(path_in, "rb") as f_in, open(path_tmp, "wb") as f_out:
write_header(f_out, read_header(f_in))
read_tokens(f_in, tokenizer)
write_tokens(f_out, tokenizer)
copy_all_data(f_out, f_in)
os.rename(path_in, path_orig)
os.rename(path_tmp, path_in)
def main():
args = parse_args()
tokenizer = SentencePieceProcessor(args.tokenizer_model)
convert_one_file(args.gpt4all_model, tokenizer)
if __name__ == "__main__":
main()

View File

@@ -50,7 +50,7 @@ fout.write(struct.pack("i", 4))
# This loop unchanged from convert-pth-to-ggml.py:
for i in range(tokenizer.vocab_size()):
if tokenizer.is_unknown(i):
text = " \u2047 ".encode("utf-8")
text = " \u2047 ".encode()
elif tokenizer.is_control(i):
text = b""
elif tokenizer.is_byte(i):
@@ -61,21 +61,26 @@ for i in range(tokenizer.vocab_size()):
byte_value = int(piece[3:-1], 16)
text = struct.pack("B", byte_value)
else:
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
fout.write(struct.pack("i", len(text)))
fout.write(text)
fout.write(struct.pack("f", tokenizer.get_score(i)))
def write_header(shape, dst_name, ftype_cur):
sname = dst_name.encode('utf-8')
sname = dst_name.encode()
fout.write(struct.pack("iii", len(shape), len(sname), ftype_cur))
fout.write(struct.pack("i" * len(shape), *shape[::-1]))
fout.write(sname)
# ensure tensor data is aligned
tensor_data_offset = fout.tell()
tensor_data_offset = (tensor_data_offset + 31) & -32
fout.seek(tensor_data_offset)
def convert_non_q4(src_name, dst_name):
v = model[src_name]
shape = v.shape
print("Processing non-Q4 variable: " + src_name + " with shape: ", shape, " and type: ", v.dtype)
print(f"Processing non-Q4 variable: {src_name} with shape: {shape} and type: {v.dtype}")
if len(shape) == 1:
print(" Converting to float32")
v = v.to(torch.float32)
@@ -100,7 +105,7 @@ def convert_q4(src_name, dst_name, permute=False):
# Each int32 item is actually 8 int4 items packed together, and it's transposed.
shape = (qweight.shape[0], qweight.shape[1] * 8)
print("Processing Q4 variable: " + src_name + " with shape: ", shape)
print(f"Processing Q4 variable: {src_name} with shape: {shape}")
# The output format has the int4 weights in groups of 32 rather than 8.
# It looks like this:
@@ -163,5 +168,5 @@ for i in range(n_layer):
fout.close()
print("Done. Output file: " + fname_out)
print("")
print(f"Done. Output file: {fname_out}")
print()

View File

@@ -1,4 +1,4 @@
# Convert a LLaMA model checkpoint to a ggml compatible file
# Convert a LLaMA model checkpoint to a ggjt compatible file
#
# Load the model using Torch
# Iterate over all variables and write them to a binary file.
@@ -24,8 +24,57 @@ import torch
from sentencepiece import SentencePieceProcessor
def parse_args():
QK = 32
GGML_TYPE_Q4_0 = 0
GGML_TYPE_Q4_1 = 1
GGML_TYPE_I8 = 2
GGML_TYPE_I16 = 3
GGML_TYPE_I32 = 4
GGML_TYPE_F16 = 5
GGML_TYPE_F32 = 6
WTYPES = {
0: GGML_TYPE_F32,
1: GGML_TYPE_F16,
2: GGML_TYPE_Q4_0,
3: GGML_TYPE_Q4_1,
}
GGML_BLCK_SIZE = {
GGML_TYPE_Q4_0: QK,
GGML_TYPE_Q4_1: QK,
GGML_TYPE_I8: 1,
GGML_TYPE_I16: 1,
GGML_TYPE_I32: 1,
GGML_TYPE_F16: 1,
GGML_TYPE_F32: 1,
}
GGML_TYPE_SIZE = {
GGML_TYPE_Q4_0: 4 + QK//2,
GGML_TYPE_Q4_1: 4*2 + QK//2,
GGML_TYPE_I8: 1,
GGML_TYPE_I16: 2,
GGML_TYPE_I32: 4,
GGML_TYPE_F16: 2,
GGML_TYPE_F32: 4,
}
def ggml_nelements(shape):
r = 1
for i in shape:
r *= i
return r
def ggml_nbytes(shape, ftype):
x = ggml_nelements(shape)
t = WTYPES[ftype]
x *= GGML_TYPE_SIZE[t]
x //= GGML_BLCK_SIZE[t]
return x
def parse_args():
parser = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file')
parser.add_argument('dir_model', help='directory containing the model checkpoint')
parser.add_argument('ftype', help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1)
@@ -33,7 +82,6 @@ def parse_args():
return parser.parse_args()
def get_n_parts(dim):
mappings = {4096: 1, 5120: 2, 6656: 4, 8192: 8}
n_parts = mappings.get(dim)
if n_parts is None:
@@ -44,30 +92,24 @@ def get_n_parts(dim):
return n_parts
def load_hparams_and_tokenizer(dir_model):
# `dir_model` is something like `models/7B` or `models/7B/`.
# "tokenizer.model" is expected under model's parent dir.
# When `dir_model` is a symlink, f"{dir_model}/../tokenizer.model" would not be found.
# Let's use the model's parent dir directly.
model_parent_dir = os.path.dirname(os.path.normpath(dir_model))
fname_hparams = f"{dir_model}/params.json"
fname_tokenizer = f"{model_parent_dir}/tokenizer.model"
with open(fname_hparams, "r") as f:
hparams = json.load(f)
print(hparams)
tokenizer = SentencePieceProcessor(fname_tokenizer)
hparams.update({"vocab_size": tokenizer.vocab_size()})
return hparams, tokenizer
def write_header(fout, hparams, ftype):
keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
values = [
0x67676d66, # magic: ggmf in hex
0x67676a74, # magic: ggjt in hex
1, # file version
*[hparams[key] for key in keys],
hparams["dim"] // hparams["n_heads"], # rot (obsolete)
@@ -76,10 +118,9 @@ def write_header(fout, hparams, ftype):
fout.write(struct.pack("i" * len(values), *values))
def write_tokens(fout, tokenizer):
for i in range(tokenizer.vocab_size()):
if tokenizer.is_unknown(i):
text = " \u2047 ".encode("utf-8")
text = " \u2047 ".encode()
elif tokenizer.is_control(i):
text = b""
elif tokenizer.is_byte(i):
@@ -90,92 +131,144 @@ def write_tokens(fout, tokenizer):
byte_value = int(piece[3:-1], 16)
text = struct.pack("B", byte_value)
else:
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
fout.write(struct.pack("i", len(text)))
fout.write(text)
fout.write(struct.pack("f", tokenizer.get_score(i)))
def process_and_write_variables(fout, model, ftype):
def process_and_write_variables(fout, model, ftype, part_id, n_parts):
for name, datao in model.items():
if name.endswith("freqs"):
continue
shape = datao.shape
print(f"Processing variable: {name} with shape: {shape} and type: {datao.dtype}")
# remove dimensions with a single element
data = datao.numpy().squeeze()
n_dims = len(shape)
partshape = data.shape
n_dims = len(data.shape)
assert n_dims in (1, 2)
# default type is fp16
print(f"Processing variable: {name} with shape: {partshape} and type: {datao.dtype}")
# coerce single-dimensional tensors from float16 to float32
ftype_cur = 1
if ftype == 0 or n_dims == 1:
print(" Converting to float32")
data = data.astype(np.float32)
ftype_cur = 0
blck_size = GGML_BLCK_SIZE[WTYPES[ftype_cur]]
type_size = GGML_TYPE_SIZE[WTYPES[ftype_cur]]
# header
sname = name.encode('utf-8')
fout.write(struct.pack("iii", len(data.shape), len(sname), ftype_cur))
for dim in reversed(data.shape):
# determine dimension along which multipart tensor is sharded
#
# split_dim 0 regex:
# - output.*
# - layers.*.attention.wq.weight
# - layers.*.attention.wk.weight
# - layers.*.attention.wv.weight
# - layers.*.feed_forward.w1.weight
# - layers.*.feed_forward.w3.weight
#
# split_dim 1 regex:
# - tok_embeddings.*
# - layers.*.attention.wo.weight
# - layers.*.feed_forward.w2.weight
#
if n_dims > 1:
split_dim = 1
if "tok_embeddings" in name:
split_dim = 1
elif "layers" in name:
if "attention.wo.weight" in name:
split_dim = 1
elif "feed_forward.w2.weight" in name:
split_dim = 1
else:
split_dim = 0
elif "output" in name:
split_dim = 0
# output tensor header
fullshape = list(partshape)
if n_dims > 1:
fullshape[split_dim] *= n_parts
sname = name.encode()
fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur))
for dim in reversed(fullshape):
fout.write(struct.pack("i", dim))
fout.write(sname)
# data output to file
data.tofile(fout)
# ensure tensor data is aligned
tensor_data_offset = fout.tell()
while tensor_data_offset % QK != 0:
fout.write(struct.pack("B", 0))
tensor_data_offset += 1
# output unified mappable tensor data
if n_dims == 1 or n_parts == 1:
# copy tensor which we thankfully received in one piece
if part_id == 0:
data.tofile(fout)
elif split_dim == 0:
# reassemble multifile tensor containing some of the rows
rows_per_chunk = partshape[0]
current_row = part_id * rows_per_chunk
bytes_per_row = fullshape[1] // blck_size * type_size
offset = current_row * bytes_per_row
fout.seek(tensor_data_offset + offset)
data.tofile(fout)
elif split_dim == 1:
# reassemble multifile tensor containing some of the cols
cols_per_chunk = partshape[1]
current_col = part_id * cols_per_chunk
bytes_per_row = fullshape[1] // blck_size * type_size
offset_current_col = current_col // blck_size * type_size
for row in range(partshape[0]):
offset_row = row * bytes_per_row
offset = offset_row + offset_current_col
fout.seek(tensor_data_offset + offset)
data[row].tofile(fout)
# advance file position to next tensor
fout.seek(tensor_data_offset + ggml_nbytes(fullshape, ftype_cur))
def main():
args = parse_args()
dir_model = args.dir_model
ftype = args.ftype
ftype_str = ["f32", "f16"]
hparams, tokenizer = load_hparams_and_tokenizer(dir_model)
print(args)
# if only writing vocab to file
if args.vocab_only:
fname_model = f"{dir_model}/consolidated.00.pth"
fname_out = f"{dir_model}/ggml-vocab.bin"
print(f"Extracting only the vocab from '{fname_model}'\n")
model = torch.load(fname_model, map_location="cpu")
with open(fname_out, "wb") as fout:
write_header(fout, hparams, ftype)
write_tokens(fout, tokenizer)
del model
print(f"Done. Output file: {fname_out}\n")
return
n_parts = get_n_parts(hparams["dim"])
fname_out = f"{dir_model}/ggml-model-{ftype_str[ftype]}.bin"
for p in range(n_parts):
# we output a single file for ggml
with open(fname_out, "wb") as fout:
write_header(fout, hparams, ftype)
write_tokens(fout, tokenizer)
offset_of_tensors = fout.tell()
# the tensors we load could be split across multiple files
for part_id in range(n_parts):
fout.seek(offset_of_tensors)
print(f"Processing part {part_id+1} of {n_parts}\n")
fname_model = f"{dir_model}/consolidated.0{part_id}.pth"
model = torch.load(fname_model, map_location="cpu")
process_and_write_variables(fout, model, ftype, part_id, n_parts)
del model
print(f"Processing part {p+1} of {n_parts}\n")
fname_model = f"{dir_model}/consolidated.0{p}.pth"
fname_out = f"{dir_model}/ggml-model-{ftype_str[ftype]}.bin{'' if p == 0 else '.' + str(p)}"
model = torch.load(fname_model, map_location="cpu")
with open(fname_out, "wb") as fout:
write_header(fout, hparams, ftype)
write_tokens(fout, tokenizer)
process_and_write_variables(fout, model, ftype)
del model
print(f"Done. Output file: {fname_out}, (part {p})\n")
print(f"Done. Output file: {fname_out}\n")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,100 @@
#!/usr/bin/env python3
# Original by https://github.com/eiz
# https://github.com/ggerganov/llama.cpp/issues/324#issuecomment-1476227818
import argparse
import glob
import os
import struct
import sys
from sentencepiece import SentencePieceProcessor
HPARAMS = keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
def parse_args():
parser = argparse.ArgumentParser(description='Upgrade old ggml model files to the current format')
parser.add_argument('dir_model', help='directory containing ggml .bin files')
parser.add_argument('tokenizer_model', help='path to LLaMA tokenizer.model file')
return parser.parse_args()
def read_header(f_in):
struct_fmt = "i" * (3 + len(HPARAMS))
struct_size = struct.calcsize(struct_fmt)
buf = f_in.read(struct_size)
return struct.unpack(struct_fmt, buf)
def write_header(f_out, header):
(magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype) = header
if magic != 0x67676d6c:
raise Exception('Invalid file magic. Must be an old style ggml file.')
values = [
0x67676d66, # magic: ggml in hex
1, # file version
vocab_size,
dim,
multiple_of,
n_heads,
n_layers,
rot,
ftype
]
f_out.write(struct.pack("i" * len(values), *values))
def write_tokens(fout, tokenizer):
for i in range(tokenizer.vocab_size()):
if tokenizer.is_unknown(i):
text = " \u2047 ".encode()
elif tokenizer.is_control(i):
text = b""
elif tokenizer.is_byte(i):
piece = tokenizer.id_to_piece(i)
if len(piece) != 6:
print(f"Invalid token: {piece}")
sys.exit(1)
byte_value = int(piece[3:-1], 16)
text = struct.pack("B", byte_value)
else:
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
fout.write(struct.pack("i", len(text)))
fout.write(text)
fout.write(struct.pack("f", tokenizer.get_score(i)))
def read_tokens(f_in, tokenizer):
for i in range(tokenizer.vocab_size()):
len_b = f_in.read(4)
(length,) = struct.unpack("i", len_b)
f_in.read(length)
def copy_all_data(f_out, f_in):
while True:
buf = f_in.read(1024 * 1024)
if not buf:
break
f_out.write(buf)
def convert_one_file(path_in, tokenizer):
path_tmp = f"{path_in}.tmp"
path_orig= f"{path_in}.orig"
print(f"converting {path_in}")
with open(path_in, "rb") as f_in, open(path_tmp, "wb") as f_out:
write_header(f_out, read_header(f_in))
read_tokens(f_in, tokenizer)
write_tokens(f_out, tokenizer)
copy_all_data(f_out, f_in)
os.rename(path_in, path_orig)
os.rename(path_tmp, path_in)
def main():
args = parse_args()
files = []
files.extend(glob.glob(f"{args.dir_model}/*.bin"))
files.extend(glob.glob(f"{args.dir_model}/*.bin.*"))
tokenizer = SentencePieceProcessor(args.tokenizer_model)
for file in files:
convert_one_file(file, tokenizer)
if __name__ == "__main__":
main()

49
examples/Miku.sh Executable file
View File

@@ -0,0 +1,49 @@
#!/bin/bash
set -e
AI_NAME="${AI_NAME:-Miku}"
MODEL="${MODEL:-./models/gpt4all-7B/gpt4all-lora-unfiltered-quantized.bin}"
USER_NAME="${USER_NAME:-Anon}"
# Uncomment and adjust to the number of CPU cores you want to use.
#N_THREAD="${N_THREAD:-4}"
N_PREDICTS="${N_PREDICTS:-4096}"
GEN_OPTIONS=(--batch_size 1024
--ctx_size 2048
--keep -1
--repeat_last_n 256
--repeat_penalty 1.17647
--temp 0.7
--top_k 40
--top_p 0.5)
if [ -n "$N_THREAD" ]; then
GEN_OPTIONS+=(--threads "$N_THREAD")
fi
./main "${GEN_OPTIONS[@]}" \
--model "$MODEL" \
--n_predict "$N_PREDICTS" \
--color --interactive \
--reverse-prompt "${USER_NAME}:" \
--prompt "
This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the users computer.
${AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next.
${AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct she will ask the user for help.
${AI_NAME} is a very helpful AI and will help the user with anything they need, she is also very friendly and will try to make the user feel better if they are sad.
${AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life, she will also try to make the user like her.
The conversation is only between ${USER_NAME} and ${AI_NAME}
The conversation is only through text, so ${AI_NAME} can't see ${USER_NAME}'s face or hear his voice.
${AI_NAME} can only communicate through text, so she can't send images or videos.
${USER_NAME}: Hello!
${AI_NAME}: /think I wonder what I should say to ${USER_NAME}? This is the first time we talk so it's important that I make a good first impression!
${AI_NAME}: Hi! I am ${AI_NAME}, your new AI friend, assistant(or whatever you like!), it's so nice to meet you! ^_^
${AI_NAME}: I might be an AI, but I also have feelings, so please be nice to me! :)
${USER_NAME}: I'll make sure to be nice to you! I'm so happy to have you as my assistant!
${AI_NAME}: /think It sounds like ${USER_NAME} is happy to have me as their assistant! I'm so happy too! ^_^ Glad that whole emotion thing didn't scare him off!
${AI_NAME}: /think I wonder what ${USER_NAME} likes to do in his free time? I should ask him about that!
${AI_NAME}: What do you like to do in your free time? ^_^
${USER_NAME}:" "$@"

57
examples/chat-13B.bat Normal file
View File

@@ -0,0 +1,57 @@
@setlocal disabledelayedexpansion enableextensions
@echo off
cd /d "%~dp0.."
if not "%errorlevel%"=="0" (
echo Unable to change directory.
pause
exit /b 1
)
if not defined MODEL set "MODEL=models\13B\ggml-model-q4_0.bin"
if not defined USER_NAME set "USER_NAME=User"
if not defined AI_NAME set "AI_NAME=ChatLLaMa"
rem Adjust to the number of CPU cores you want to use.
rem if not defined N_THREAD set "N_THREAD=8"
rem Number of tokens to predict (made it larger than default because we want a long interaction)
if not defined N_PREDICTS set "N_PREDICTS=2048"
if not defined GEN_OPTIONS set "GEN_OPTIONS=--ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647"
rem Default main script paths
set "DEFAULT_MAIN_SCRIPT_PATHS=main.exe build\bin\main.exe"
rem Get main script path from command line arguments
set "MAIN_SCRIPT_PATH=%~1"
rem If the main script path was not specified, try the default paths
if not defined MAIN_SCRIPT_PATH (
for %%i in (%DEFAULT_MAIN_SCRIPT_PATHS%) do (
if exist "%%i" set "MAIN_SCRIPT_PATH=%%i"
)
)
rem If the main script path was not found, tell the user how to specify it
if not defined MAIN_SCRIPT_PATH (
echo The main script could not be found. Please provide the path to the main script as 1st argument to this script, or place the main script in one of the default locations:
echo %DEFAULT_MAIN_SCRIPT_PATHS%
pause
exit /b 1
)
rem Default context, feel free to edit it
set "PROMPT_TEXT=Text transcript of a never ending dialog, where %USER_NAME% interacts with an AI assistant named %AI_NAME%. %AI_NAME% is helpful, kind, honest, friendly, good at writing and never fails to answer %USER_NAME%'s requests immediately and with details and precision. There are no annotations like (30 seconds passed...) or (to himself), just what %USER_NAME% and %AI_NAME% say aloud to each other. The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long. The transcript only includes text, it does not include markup like HTML and Markdown."
rem Set a temporary variable if N_THREAD is set
if defined N_THREAD (
set "_N_THREAD=--threads %N_THREAD%"
) else (
set "_N_THREAD="
)
rem Run the script
echo "%MAIN_SCRIPT_PATH%" %GEN_OPTIONS% %_N_THREAD% ^
--model "%MODEL%" ^
--n_predict %N_PREDICTS% ^
--color --interactive ^
--reverse-prompt "%USER_NAME%:" ^
--prompt "%PROMPT_TEXT%"

View File

@@ -39,6 +39,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
bool invalid_param = false;
std::string arg;
gpt_params default_params;
for (int i = 1; i < argc; i++) {
arg = argv[i];
@@ -66,6 +68,11 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
break;
}
std::ifstream file(argv[i]);
if (!file) {
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
invalid_param = true;
break;
}
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
if (params.prompt.back() == '\n') {
params.prompt.pop_back();
@@ -168,7 +175,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
}
params.n_parts = std::stoi(argv[i]);
} else if (arg == "-h" || arg == "--help") {
gpt_print_usage(argc, argv, params);
gpt_print_usage(argc, argv, default_params);
exit(0);
} else if (arg == "--random-prompt") {
params.random_prompt = true;
@@ -180,13 +187,13 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
params.input_prefix = argv[i];
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
gpt_print_usage(argc, argv, params);
gpt_print_usage(argc, argv, default_params);
exit(1);
}
}
if (invalid_param) {
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
gpt_print_usage(argc, argv, params);
gpt_print_usage(argc, argv, default_params);
exit(1);
}
@@ -215,13 +222,13 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stderr, " prompt file to start generation.\n");
fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
fprintf(stderr, " --top_k N top-k sampling (default: %d)\n", params.top_k);
fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", params.top_p);
fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", (double)params.top_p);
fprintf(stderr, " --repeat_last_n N last n tokens to consider for penalize (default: %d)\n", params.repeat_last_n);
fprintf(stderr, " --repeat_penalty N penalize repeat sequence of tokens (default: %.1f)\n", params.repeat_penalty);
fprintf(stderr, " --repeat_penalty N penalize repeat sequence of tokens (default: %.1f)\n", (double)params.repeat_penalty);
fprintf(stderr, " -c N, --ctx_size N size of the prompt context (default: %d)\n", params.n_ctx);
fprintf(stderr, " --ignore-eos ignore end of stream token and continue generating\n");
fprintf(stderr, " --memory_f32 use f32 instead of f16 for memory key+value\n");
fprintf(stderr, " --temp N temperature (default: %.1f)\n", params.temp);
fprintf(stderr, " --temp N temperature (default: %.1f)\n", (double)params.temp);
fprintf(stderr, " --n_parts N number of model parts (default: -1 = determine from dimensions)\n");
fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch);
fprintf(stderr, " --perplexity compute perplexity over the prompt\n");

View File

@@ -1,4 +1,4 @@
set(TARGET embedding)
add_executable(${TARGET} embedding.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ggml ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

15
examples/gpt4all.sh Executable file
View File

@@ -0,0 +1,15 @@
#!/bin/bash
#
# Temporary script - will be removed in the future
#
cd `dirname $0`
cd ..
./main --color --instruct --threads 4 \
--model ./models/gpt4all-7B/gpt4all-lora-quantized.bin \
--file ./prompts/alpaca.txt \
--batch_size 8 --ctx_size 2048 \
--repeat_last_n 64 --repeat_penalty 1.3 \
--n_predict 128 --temp 0.1 --top_k 40 --top_p 0.95

View File

@@ -1,4 +1,4 @@
set(TARGET main)
add_executable(${TARGET} main.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ggml ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View File

@@ -209,7 +209,8 @@ int main(int argc, char ** argv) {
fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str());
}
}
fprintf(stderr, "sampling: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
fprintf(stderr, "sampling: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n",
params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
fprintf(stderr, "generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
fprintf(stderr, "\n\n");
@@ -274,10 +275,10 @@ int main(int argc, char ** argv) {
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
// out of user input, sample next token
const float top_k = params.top_k;
const float top_p = params.top_p;
const float temp = params.temp;
const float repeat_penalty = params.repeat_penalty;
const int32_t top_k = params.top_k;
const float top_p = params.top_p;
const float temp = params.temp;
const float repeat_penalty = params.repeat_penalty;
llama_token id = 0;
@@ -367,6 +368,11 @@ int main(int argc, char ** argv) {
// potentially set color to indicate we are taking user input
set_console_color(con_st, CONSOLE_COLOR_USER_INPUT);
#if defined (_WIN32)
// Windows: must reactivate sigint handler after each signal
signal(SIGINT, sigint_handler);
#endif
if (params.instruct) {
printf("\n> ");
}

View File

@@ -1,4 +1,4 @@
set(TARGET perplexity)
add_executable(${TARGET} perplexity.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ggml ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View File

@@ -1,15 +1,17 @@
#include "common.h"
#include "llama.h"
std::vector<double> softmax(const std::vector<float>& logits) {
std::vector<double> probs(logits.size());
#include <cmath>
std::vector<float> softmax(const std::vector<float>& logits) {
std::vector<float> probs(logits.size());
float max_logit = logits[0];
for (float v : logits) max_logit = std::max(max_logit, v);
double sum_exp = 0.0;
for (size_t i = 0; i < logits.size(); i++) {
// Subtract the maximum logit value from the current logit value for numerical stability
float logit = logits[i] - max_logit;
double exp_logit = std::exp(logit);
const float logit = logits[i] - max_logit;
const float exp_logit = expf(logit);
sum_exp += exp_logit;
probs[i] = exp_logit;
}
@@ -24,14 +26,16 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
auto tokens = ::llama_tokenize(ctx, params.prompt, true);
int count = 0;
double nll = 0.0;
int seq_count = tokens.size() / params.n_ctx;
double nll = 0.0;
fprintf(stderr, "%s : calculating perplexity over %d chunks\n", __func__, seq_count);
for (int i = 0; i < seq_count; ++i) {
int start = i * params.n_ctx;
int end = start + params.n_ctx - 1;
int end = start + params.n_ctx - 1; // TODO: this is not optimal, e.g. it makes the batch 511 instead of 512
// it is better to always be power of 2 for better performance
std::vector<llama_token> embd(tokens.begin() + start, tokens.begin() + end);
auto start_t = std::chrono::high_resolution_clock::now();
if (llama_eval(ctx, embd.data(), embd.size(), 0, params.n_threads)) {
@@ -40,7 +44,7 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
}
auto end_t = std::chrono::high_resolution_clock::now();
if (i == 0) {
double seconds = std::chrono::duration<double>(end_t - start_t).count();
const float seconds = std::chrono::duration<float>(end_t - start_t).count();
printf("%.2f seconds per pass - ETA %.2f hours\n", seconds, (seconds * seq_count) / (60.0*60.0));
}
// We get the logits for all the tokens in the context window (params.n_ctx)
@@ -63,7 +67,7 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
std::vector<float> tok_logits(
logits + j * n_vocab,
logits + (j + 1) * n_vocab);
double prob = softmax(tok_logits)[tokens[start + j + 1]];
const float prob = softmax(tok_logits)[tokens[start + j + 1]];
nll += -std::log(prob);
++count;
}

View File

@@ -1,4 +1,4 @@
set(TARGET quantize)
add_executable(${TARGET} quantize.cpp)
target_link_libraries(${TARGET} PRIVATE llama ggml ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View File

@@ -4,8 +4,6 @@
#include <cstdio>
#include <string>
const int QK = 32;
// usage:
// ./llama-quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type
//
@@ -21,7 +19,7 @@ int main(int argc, char ** argv) {
// needed to initialize f16 tables
{
struct ggml_init_params params = { 0, NULL };
struct ggml_init_params params = { 0, NULL, false };
struct ggml_context * ctx = ggml_init(params);
ggml_free(ctx);
}
@@ -39,7 +37,7 @@ int main(int argc, char ** argv) {
{
const int64_t t_start_us = ggml_time_us();
if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), itype, QK)) {
if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), itype)) {
fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
return 1;
}
@@ -52,8 +50,8 @@ int main(int argc, char ** argv) {
const int64_t t_main_end_us = ggml_time_us();
printf("\n");
printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);
printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0);
printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0);
}
return 0;

17
examples/reason-act.sh Executable file
View File

@@ -0,0 +1,17 @@
#!/bin/bash
cd `dirname $0`
cd ..
# get -m model parameter otherwise defer to default
if [ "$1" == "-m" ]; then
MODEL="-m $2 "
fi
./main $MODEL --color \
-f ./prompts/reason-act.txt \
-i --interactive-first \
--top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7 -c 2048 \
-r "Question:" -r "Observation:" --in-prefix " " \
-n -1

2274
ggml.c

File diff suppressed because it is too large Load Diff

73
ggml.h
View File

@@ -258,11 +258,11 @@ struct ggml_tensor {
enum ggml_type type;
int n_dims;
int ne[GGML_MAX_DIMS]; // number of elements
size_t nb[GGML_MAX_DIMS]; // stride in bytes:
// nb[0] = sizeof(type)
// nb[1] = nb[0] * ne[0] + padding
// nb[i] = nb[i-1] * ne[i-1]
int64_t ne[GGML_MAX_DIMS]; // number of elements
size_t nb[GGML_MAX_DIMS]; // stride in bytes:
// nb[0] = sizeof(type)
// nb[1] = nb[0] * ne[0] + padding
// nb[i] = nb[i-1] * ne[i-1]
// compute data
enum ggml_op op;
@@ -316,6 +316,7 @@ struct ggml_init_params {
// memory pool
size_t mem_size; // bytes
void * mem_buffer; // if NULL, memory will be allocated internally
bool no_alloc; // don't allocate memory for the tensor data
};
void ggml_time_init(void); // call this once at the beginning of the program
@@ -327,8 +328,8 @@ int64_t ggml_cycles_per_ms(void);
void ggml_print_object (const struct ggml_object * obj);
void ggml_print_objects(const struct ggml_context * ctx);
int ggml_nelements(const struct ggml_tensor * tensor);
size_t ggml_nbytes (const struct ggml_tensor * tensor);
int64_t ggml_nelements(const struct ggml_tensor * tensor);
size_t ggml_nbytes (const struct ggml_tensor * tensor);
int ggml_blck_size (enum ggml_type type);
size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
@@ -344,39 +345,43 @@ size_t ggml_used_mem(const struct ggml_context * ctx);
size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
bool ggml_mlock_supported(void);
bool ggml_mlock(struct ggml_context * ctx, char ** err_p);
bool ggml_mlock(
struct ggml_context * ctx,
const void *opt_extra_addr,
size_t opt_extra_len,
char **err_p);
struct ggml_tensor * ggml_new_tensor(
struct ggml_context * ctx,
enum ggml_type type,
int n_dims,
const int *ne);
const int64_t *ne);
struct ggml_tensor * ggml_new_tensor_1d(
struct ggml_context * ctx,
enum ggml_type type,
int ne0);
int64_t ne0);
struct ggml_tensor * ggml_new_tensor_2d(
struct ggml_context * ctx,
enum ggml_type type,
int ne0,
int ne1);
int64_t ne0,
int64_t ne1);
struct ggml_tensor * ggml_new_tensor_3d(
struct ggml_context * ctx,
enum ggml_type type,
int ne0,
int ne1,
int ne2);
int64_t ne0,
int64_t ne1,
int64_t ne2);
struct ggml_tensor * ggml_new_tensor_4d(
struct ggml_context * ctx,
enum ggml_type type,
int ne0,
int ne1,
int ne2,
int ne3);
int64_t ne0,
int64_t ne1,
int64_t ne2,
int64_t ne3);
struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
@@ -526,33 +531,43 @@ struct ggml_tensor * ggml_reshape(
struct ggml_tensor * ggml_reshape_2d(
struct ggml_context * ctx,
struct ggml_tensor * a,
int ne0,
int ne1);
int64_t ne0,
int64_t ne1);
// return view(a)
// TODO: when we start computing gradient, make a copy instead of view
struct ggml_tensor * ggml_reshape_3d(
struct ggml_context * ctx,
struct ggml_tensor * a,
int ne0,
int ne1,
int ne2);
int64_t ne0,
int64_t ne1,
int64_t ne2);
// offset in bytes
struct ggml_tensor * ggml_view_1d(
struct ggml_context * ctx,
struct ggml_tensor * a,
int ne0,
int64_t ne0,
size_t offset);
struct ggml_tensor * ggml_view_2d(
struct ggml_context * ctx,
struct ggml_tensor * a,
int ne0,
int ne1,
int64_t ne0,
int64_t ne1,
size_t nb1, // row stride in bytes
size_t offset);
struct ggml_tensor * ggml_view_3d(
struct ggml_context * ctx,
struct ggml_tensor * a,
int64_t ne0,
int64_t ne1,
int64_t ne2,
size_t nb1, // row stride in bytes
size_t nb2, // slice stride in bytes
size_t offset);
struct ggml_tensor * ggml_permute(
struct ggml_context * ctx,
struct ggml_tensor * a,
@@ -748,8 +763,8 @@ enum ggml_opt_result ggml_opt(
// quantization
//
size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int qk, int64_t * hist);
size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int qk, int64_t * hist);
size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
//
// system info

660
llama.cpp

File diff suppressed because it is too large Load Diff

32
llama.h
View File

@@ -6,7 +6,7 @@
#include <stdbool.h>
#ifdef LLAMA_SHARED
# ifdef _WIN32
# if defined(_WIN32) && !defined(__MINGW32__)
# ifdef LLAMA_BUILD
# define LLAMA_API __declspec(dllexport)
# else
@@ -20,7 +20,7 @@
#endif
#define LLAMA_FILE_VERSION 1
#define LLAMA_FILE_MAGIC 0x67676d66 // 'ggmf' in hex
#define LLAMA_FILE_MAGIC 0x67676a74 // 'ggjt' in hex
#define LLAMA_FILE_MAGIC_UNVERSIONED 0x67676d6c // pre-versioned files
#ifdef __cplusplus
@@ -45,7 +45,7 @@ extern "C" {
} llama_token_data;
typedef void (*llama_progress_callback)(double progress, void *ctx);
typedef void (*llama_progress_callback)(float progress, void *ctx);
struct llama_context_params {
int n_ctx; // text context
@@ -81,8 +81,24 @@ extern "C" {
LLAMA_API int llama_model_quantize(
const char * fname_inp,
const char * fname_out,
int itype,
int qk);
int itype);
// Returns the KV cache that will contain the context for the
// ongoing prediction with the model.
LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);
// Returns the size of the KV cache
LLAMA_API size_t llama_get_kv_cache_size(struct llama_context * ctx);
// Returns the number of tokens in the KV cache
LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);
// Sets the KV cache containing the current context for the model
LLAMA_API void llama_set_kv_cache(
struct llama_context * ctx,
const uint8_t * kv_cache,
size_t n_size,
int n_token_count);
// Run the llama inference to obtain the logits and probabilities for the next token.
// tokens + n_tokens is the provided batch of new tokens to process
@@ -135,9 +151,9 @@ extern "C" {
const llama_token * last_n_tokens_data,
int last_n_tokens_size,
int top_k,
double top_p,
double temp,
double repeat_penalty);
float top_p,
float temp,
float repeat_penalty);
// Performance information
LLAMA_API void llama_print_timings(struct llama_context * ctx);

BIN
media/llama-leader.jpeg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 195 KiB

BIN
media/llama0-banner.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 141 KiB

BIN
media/llama0-logo.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 176 KiB

BIN
media/llama1-banner.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 32 KiB

BIN
media/llama1-logo.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 32 KiB

View File

@@ -0,0 +1,311 @@
# Migrate ggml file(s) with ggmf magic to ggml file with ggjt magic
#
# We caused a breaking change to the file format on 2023-03-30 in:
# https://github.com/ggerganov/llama.cpp/pull/613
#
# (1) If you still have the Meta LLaMA .pth files, then close this
# file now; you can just run `convert-pth-to-ggml.py` again to
# migrate to the new format. The tool is easier to use too. It
# isn't necessary anymore to manage split output files because
# the new format always combines things into a single file.
#
# (2) If you deleted the Meta LLaMA .pth files due to save on disk
# space, then this tool is intended to help you. Please check
# out the instructions below.
#
# USAGE
#
# python migrate-ggml-2023-03-30-pr613.py INPUT OUTPUT
#
# PREREQUISITES
#
# pip install numpy
# cd llama.cpp
# make -j4
#
# EXAMPLE (7B MODEL)
#
# # you can replace all the 'f16' with 'q4_0' if you're using quantized weights
# python migrate-ggml-2023-03-30-pr613.py models/7B/ggml-model-f16.bin models/7B/ggml-model-f16-ggjt.bin
#
# # check that it works
# ./main -m models/7B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?'
#
# # you can delete the old files
# rm -f models/7B/ggml-model-f16.bin
# mv models/7B/ggml-model-f16-ggjt.bin models/7B/ggml-model-f16.bin
#
# EXAMPLE (13B MODEL)
#
# # you can replace all the 'f16' with 'q4_0' if you're using quantized weights
# python migrate-ggml-2023-03-30-pr613.py models/13B/ggml-model-f16.bin models/13B/ggml-model-f16-ggjt.bin
#
# # check that it works
# ./main -m models/13B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?'
#
# # you can delete the old files
# rm -f models/13B/ggml-model-f16.bin*
# mv models/13B/ggml-model-f16-ggjt.bin models/13B/ggml-model-f16.bin
#
import argparse
import os
import sys
import json
import struct
import numpy as np
QK = 32
GGML_TYPE_Q4_0 = 0
GGML_TYPE_Q4_1 = 1
GGML_TYPE_I8 = 2
GGML_TYPE_I16 = 3
GGML_TYPE_I32 = 4
GGML_TYPE_F16 = 5
GGML_TYPE_F32 = 6
WTYPE_NAMES = {
0: "F32",
1: "F16",
2: "Q4_0",
3: "Q4_1",
}
WTYPES = {
0: GGML_TYPE_F32,
1: GGML_TYPE_F16,
2: GGML_TYPE_Q4_0,
3: GGML_TYPE_Q4_1,
}
GGML_BLCK_SIZE = {
GGML_TYPE_Q4_0: QK,
GGML_TYPE_Q4_1: QK,
GGML_TYPE_I8: 1,
GGML_TYPE_I16: 1,
GGML_TYPE_I32: 1,
GGML_TYPE_F16: 1,
GGML_TYPE_F32: 1,
}
GGML_TYPE_SIZE = {
GGML_TYPE_Q4_0: 4 + QK//2,
GGML_TYPE_Q4_1: 4*2 + QK//2,
GGML_TYPE_I8: 1,
GGML_TYPE_I16: 2,
GGML_TYPE_I32: 4,
GGML_TYPE_F16: 2,
GGML_TYPE_F32: 4,
}
HPARAMS = [
'magic', # int32
'version', # int32
'n_vocab', # int32
'n_embd', # int32
'n_mult', # int32
'n_head', # int32
'n_layer', # int32
'n_rot', # int32
'f16', # int32
]
def read_hparams(fin):
struct_fmt = "i" * len(HPARAMS)
struct_size = struct.calcsize(struct_fmt)
buf = fin.read(struct_size)
ints = struct.unpack(struct_fmt, buf)
hparams = dict(zip(HPARAMS, ints))
return hparams
def write_hparams(fout, hparams):
struct_fmt = "i" * len(HPARAMS)
struct_size = struct.calcsize(struct_fmt)
ints = [hparams[h] for h in HPARAMS]
fout.write(struct.pack(struct_fmt, *ints))
def read_tokens(fin, hparams):
tokens = []
for i in range(hparams['n_vocab']):
len_b = fin.read(4)
(length,) = struct.unpack("i", len_b)
word = fin.read(length)
score_b = fin.read(4)
(score,) = struct.unpack("f", score_b)
tokens.append((word, score))
return tokens
def write_tokens(fout, tokens):
for word, score in tokens:
fout.write(struct.pack("i", len(word)))
fout.write(word)
fout.write(struct.pack("f", score))
def ggml_nelements(shape):
r = 1
for i in shape:
r *= i
return r
def ggml_nbytes(shape, ftype):
x = ggml_nelements(shape)
t = WTYPES[ftype]
x *= GGML_TYPE_SIZE[t]
x //= GGML_BLCK_SIZE[t]
return x
def copy_tensors(fin, fout, part_id, n_parts):
while True:
b = fin.read(4)
if not b: break
(n_dims,) = struct.unpack("i", b)
b = fin.read(4)
(length,) = struct.unpack("i", b)
b = fin.read(4)
(ftype,) = struct.unpack("i", b)
assert n_dims in (1, 2)
partshape = list(range(n_dims))
for i in range(n_dims):
b = fin.read(4)
partshape[i] = struct.unpack("i", b)[0]
partshape = list(reversed(partshape))
name = fin.read(length)
data = fin.read(ggml_nbytes(partshape, ftype))
blck_size = GGML_BLCK_SIZE[WTYPES[ftype]]
type_size = GGML_TYPE_SIZE[WTYPES[ftype]]
print(f"Processing tensor {name} with shape: {partshape} and type: {WTYPE_NAMES[ftype]}")
# determine dimension along which multipart tensor is sharded
#
# split_dim 0 regex:
# - output.*
# - layers.*.attention.wq.weight
# - layers.*.attention.wk.weight
# - layers.*.attention.wv.weight
# - layers.*.feed_forward.w1.weight
# - layers.*.feed_forward.w3.weight
#
# split_dim 1 regex:
# - tok_embeddings.*
# - layers.*.attention.wo.weight
# - layers.*.feed_forward.w2.weight
#
if n_dims > 1:
split_dim = 1
if b"tok_embeddings" in name:
split_dim = 1
elif b"layers" in name:
if b"attention.wo.weight" in name:
split_dim = 1
elif b"feed_forward.w2.weight" in name:
split_dim = 1
else:
split_dim = 0
elif b"output" in name:
split_dim = 0
# output tensor header
fullshape = list(partshape)
if n_dims > 1:
fullshape[split_dim] *= n_parts
fout.write(struct.pack("iii", n_dims, len(name), ftype))
for dim in reversed(fullshape):
fout.write(struct.pack("i", dim))
fout.write(name)
# ensure tensor data is aligned
tensor_data_offset = fout.tell()
while tensor_data_offset % QK != 0:
fout.write(struct.pack("B", 0))
tensor_data_offset += 1
# output unified mappable tensor data
if n_dims == 1 or n_parts == 1:
# copy tensor which we thankfully received in one piece
if part_id == 0:
fout.write(data)
elif split_dim == 0:
# reassemble multifile tensor containing some of the rows
rows_per_chunk = partshape[0]
current_row = part_id * rows_per_chunk
bytes_per_row = fullshape[1] // blck_size * type_size
offset = current_row * bytes_per_row
fout.seek(tensor_data_offset + offset)
fout.write(data)
elif split_dim == 1:
# reassemble multifile tensor containing some of the cols
cols_per_chunk = partshape[1]
current_col = part_id * cols_per_chunk
bpr = partshape[1] // blck_size * type_size
bytes_per_row = fullshape[1] // blck_size * type_size
offset_current_col = current_col // blck_size * type_size
for row in range(partshape[0]):
offset_row = row * bytes_per_row
offset = offset_row + offset_current_col
fout.seek(tensor_data_offset + offset)
fout.write(data[row * bpr:row * bpr + bpr])
# advance file position to next tensor
fout.seek(tensor_data_offset + ggml_nbytes(fullshape, ftype))
def parse_args():
parser = argparse.ArgumentParser(description='Migrate from GGML to new GGJT file format')
parser.add_argument('fin_path', help='your old ggml file (leave out the .1 .2 etc.)')
parser.add_argument('fout_path', help='your new ggjt file name')
return parser.parse_args()
def main():
args = parse_args()
assert args.fin_path
assert args.fout_path
assert args.fin_path != args.fout_path
with open(args.fin_path, "rb") as fin:
hparams = read_hparams(fin)
tokens = read_tokens(fin, hparams)
if hparams['magic'] == 0x67676a74: # ggjt
print(f"{args.fin_path}: input ggml has already been converted to 'ggjt' magic\n")
sys.exit(1)
if hparams['magic'] != 0x67676d66: # ggmf
print(f"{args.fin_path}: input ggml file doesn't have expected 'ggmf' magic: {hparams['magic']:#x}\n")
sys.exit(1)
hparams['magic'] = 0x67676a74 # ggjt
# count number of multipart files by convention
n_parts = 1
while True:
if os.path.exists(f"{args.fin_path}.{n_parts}"):
n_parts += 1
else:
break
# we output a single file for ggml
with open(args.fout_path, "wb") as fout:
write_hparams(fout, hparams)
write_tokens(fout, tokens)
offset_of_tensors = fout.tell()
# the tensors we load could be split across multiple files
for part_id in range(n_parts):
fout.seek(offset_of_tensors)
print(f"Processing part {part_id+1} of {n_parts}\n")
fin_path = args.fin_path
if part_id > 0:
fin_path += f".{part_id}"
with open(fin_path, "rb") as fin:
read_tokens(fin, read_hparams(fin))
copy_tensors(fin, fout, part_id, n_parts)
print(f"Done. Output file: {args.fout_path}\n")
if __name__ == "__main__":
main()

Binary file not shown.

18
prompts/reason-act.txt Normal file
View File

@@ -0,0 +1,18 @@
You run in a loop of Thought, Action, Observation.
At the end of the loop either Answer or restate your Thought and Action.
Use Thought to describe your thoughts about the question you have been asked.
Use Action to run one of these actions available to you:
- calculate[python math expression]
Observation will be the result of running those actions
Question: What is 4 * 7 / 3?
Thought: Do I need to use an action? Yes, I use calculate to do math
Action: calculate[4 * 7 / 3]
Observation: 9.3333333333
Thought: Do I need to use an action? No, have the result
Answer: The calculate tool says it is 9.3333333333
Question: What is capital of france?
Thought: Do I need to use an action? No, I know the answer
Answer: Paris is the capital of France
Question:

View File

@@ -1,127 +0,0 @@
#!/usr/bin/env python3
"""Script to execute the "quantize" script on a given set of models."""
import subprocess
import argparse
import glob
import sys
import os
def main():
"""Update the quantize binary name depending on the platform and parse
the command line arguments and execute the script.
"""
if "linux" in sys.platform or "darwin" in sys.platform:
quantize_script_binary = "quantize"
elif "win32" in sys.platform or "cygwin" in sys.platform:
quantize_script_binary = "quantize.exe"
else:
print("WARNING: Unknown platform. Assuming a UNIX-like OS.\n")
quantize_script_binary = "quantize"
parser = argparse.ArgumentParser(
prog='python3 quantize.py',
description='This script quantizes the given models by applying the '
f'"{quantize_script_binary}" script on them.'
)
parser.add_argument(
'models', nargs='+', choices=('7B', '13B', '30B', '65B'),
help='The models to quantize.'
)
parser.add_argument(
'-r', '--remove-16', action='store_true', dest='remove_f16',
help='Remove the f16 model after quantizing it.'
)
parser.add_argument(
'-m', '--models-path', dest='models_path',
default=os.path.join(os.getcwd(), "models"),
help='Specify the directory where the models are located.'
)
parser.add_argument(
'-q', '--quantize-script-path', dest='quantize_script_path',
default=os.path.join(os.getcwd(), quantize_script_binary),
help='Specify the path to the "quantize" script.'
)
# TODO: Revise this code
# parser.add_argument(
# '-t', '--threads', dest='threads', type='int',
# default=os.cpu_count(),
# help='Specify the number of threads to use to quantize many models at '
# 'once. Defaults to os.cpu_count().'
# )
args = parser.parse_args()
args.models_path = os.path.abspath(args.models_path)
if not os.path.isfile(args.quantize_script_path):
print(
f'The "{quantize_script_binary}" script was not found in the '
"current location.\nIf you want to use it from another location, "
"set the --quantize-script-path argument from the command line."
)
sys.exit(1)
for model in args.models:
# The model is separated in various parts
# (ggml-model-f16.bin, ggml-model-f16.bin.0, ggml-model-f16.bin.1...)
f16_model_path_base = os.path.join(
args.models_path, model, "ggml-model-f16.bin"
)
f16_model_parts_paths = map(
lambda filename: os.path.join(f16_model_path_base, filename),
glob.glob(f"{f16_model_path_base}*")
)
for f16_model_part_path in f16_model_parts_paths:
if not os.path.isfile(f16_model_part_path):
print(
f"The f16 model {os.path.basename(f16_model_part_path)} "
f"was not found in {args.models_path}{os.path.sep}{model}"
". If you want to use it from another location, set the "
"--models-path argument from the command line."
)
sys.exit(1)
__run_quantize_script(
args.quantize_script_path, f16_model_part_path
)
if args.remove_f16:
os.remove(f16_model_part_path)
# This was extracted to a top-level function for parallelization, if
# implemented. See https://github.com/ggerganov/llama.cpp/pull/222/commits/f8db3d6cd91bf1a1342db9d29e3092bc12dd783c#r1140496406
def __run_quantize_script(script_path, f16_model_part_path):
"""Run the quantize script specifying the path to it and the path to the
f16 model to quantize.
"""
new_quantized_model_path = f16_model_part_path.replace("f16", "q4_0")
subprocess.run(
[script_path, f16_model_part_path, new_quantized_model_path, "2"],
check=True
)
if __name__ == "__main__":
try:
main()
except subprocess.CalledProcessError:
print("\nAn error ocurred while trying to quantize the models.")
sys.exit(1)
except KeyboardInterrupt:
sys.exit(0)
else:
print("\nSuccesfully quantized all models.")

1
spm-headers/llama.h Symbolic link
View File

@@ -0,0 +1 @@
../llama.h

View File

@@ -5,5 +5,6 @@ function(llama_add_test source)
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
endfunction()
# llama_add_test(test-double-float.c) # SLOW
llama_add_test(test-quantize.c)
llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)

53
tests/test-double-float.c Normal file
View File

@@ -0,0 +1,53 @@
// These tests may take a long time!
// They are to prove that conversion from double to float of various functions in ggml.c doesn't affect the result.
// This is done by checking all finite (non-NaN, non-infinite) floats.
#undef NDEBUG
#include <assert.h>
#include <immintrin.h>
#include <math.h>
#include <stdint.h>
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdouble-promotion"
// ggml.c::quantize_row_q4_0_reference
inline static uint8_t round_orig(float v0) { return ((int8_t) (round(v0))) + 8; }
// ggml.c::ggml_silu_f32
inline static float silu_orig(float x) {
return x/(1.0 + exp(-x));
}
#pragma GCC diagnostic pop
// ggml.c::quantize_row_q4_0_reference
inline static uint8_t round_float(float v0) { return (int8_t)roundf(v0) + 8; }
// ggml.c::ggml_silu_f32
inline static float silu_float(float x) {
return x/(1.0f + expf(-x));
}
int main(void) {
uint32_t x = UINT32_MAX;
do {
float f = *(float *)&x;
assert(!isfinite(f) || (round_orig(f) == round_float(f)));
} while (x--);
#ifdef __F16C__
// GELU and SILU implementations are used with a FP16 lookup table.
// The original and float-only results are not equal for all inputs after converting to FP16.
// GELU is an approximation anyway (tanh), not tested here.
// For SILU, verify that the results are at least the closest floating point numbers, if the FP16 values don't match.
for (x = 0; x <= UINT16_MAX; x++) {
float f = _cvtsh_ss(x);
const float so = silu_orig(f);
const float sf = silu_float(f);
assert( (_cvtss_sh(so, 0) == _cvtss_sh(sf, 0))
|| (nextafterf(so, sf) == sf)
|| (nextafterf(sf, so) == so));
}
#endif
}

View File

@@ -13,7 +13,7 @@ int main(void) {
src[i] = (float)(i + 1);
}
size_t size = ggml_quantize_q4_0(src, dst, QK, QK, QK, hist);
size_t size = ggml_quantize_q4_0(src, dst, QK, QK, hist);
assert(size == 20);
float max_result = ((float *)dst)[0];
float max_expected = src[31] / ((1 << 3) - 1);
@@ -24,7 +24,7 @@ int main(void) {
assert(q4_result == q4_expected);
}
size = ggml_quantize_q4_1(src, dst, QK, QK, QK, hist);
size = ggml_quantize_q4_1(src, dst, QK, QK, hist);
assert(size == 24);
float delta_result = ((float *)dst)[0];
float delta_expected = (src[31] - src[0]) / ((1 << 4) - 1);

View File

@@ -77,5 +77,7 @@ int main(int argc, char **argv) {
}
}
llama_free(ctx);
return 0;
}